From f7c8e7c14f65508b47d0b7881a5240c0fa466ab0 Mon Sep 17 00:00:00 2001 From: Patrick von Platen Date: Wed, 31 May 2023 14:35:10 +0100 Subject: [PATCH 01/12] Add draft for lora text encoder scale --- src/diffusers/loaders.py | 9 ++++++++- .../stable_diffusion/pipeline_stable_diffusion.py | 5 +++++ 2 files changed, 13 insertions(+), 1 deletion(-) diff --git a/src/diffusers/loaders.py b/src/diffusers/loaders.py index 84e6b4e61f0f..b8b93b474563 100644 --- a/src/diffusers/loaders.py +++ b/src/diffusers/loaders.py @@ -839,6 +839,9 @@ def load_lora_weights(self, pretrained_model_name_or_path_or_dict: Union[str, Di weight_name = kwargs.pop("weight_name", None) use_safetensors = kwargs.pop("use_safetensors", None) + # set lora scale to a reasonable default + self._scale = 1 + if use_safetensors and not is_safetensors_available(): raise ValueError( "`use_safetensors`=True but safetensors is not installed. Please install safetensors with `pip install safetenstors" @@ -933,6 +936,10 @@ def load_lora_weights(self, pretrained_model_name_or_path_or_dict: Union[str, Di warn_message = "You have saved the LoRA weights using the old format. To convert the old LoRA weights to the new format, you can first load them in a dictionary and then create a new dictionary like the following: `new_state_dict = {f'unet'.{module_name}: params for module_name, params in old_state_dict.items()}`." warnings.warn(warn_message) + @property + def lora_scale(self): + return self._scale + @property def text_encoder_lora_attn_procs(self): if hasattr(self, "_text_encoder_lora_attn_procs"): @@ -958,7 +965,7 @@ def _modify_text_encoder(self, attn_processors: Dict[str, LoRAAttnProcessor]): old_forward = module.forward def new_forward(x): - return old_forward(x) + lora_layer(x) + return old_forward(x) + self.lora_scale() * lora_layer(x) # Monkey-patch. module.forward = new_forward diff --git a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion.py b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion.py index 170002b2514e..6883d7b567ba 100644 --- a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion.py +++ b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion.py @@ -294,6 +294,7 @@ def _encode_prompt( negative_prompt=None, prompt_embeds: Optional[torch.FloatTensor] = None, negative_prompt_embeds: Optional[torch.FloatTensor] = None, + lora_scale: Optional[float] = None, ): r""" Encodes the prompt into text encoder hidden states. @@ -319,6 +320,10 @@ def _encode_prompt( weighting. If not provided, negative_prompt_embeds will be generated from `negative_prompt` input argument. 
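
The scaling mechanism introduced in this first patch can be illustrated with a minimal, self-contained sketch. This is not the diffusers implementation: `TinyLoRALinear` and `TinyLoraMixin` are hypothetical stand-ins, the sketch assumes the scale property falls back to 1.0 when it has never been set, and it reads `lora_scale` as a plain property.

import torch
import torch.nn as nn


class TinyLoRALinear(nn.Module):
    # illustrative low-rank adapter: output is up(down(x)), zero-initialized
    # so the patched module starts out behaving exactly like the original
    def __init__(self, features: int, rank: int = 4):
        super().__init__()
        self.down = nn.Linear(features, rank, bias=False)
        self.up = nn.Linear(rank, features, bias=False)
        nn.init.zeros_(self.up.weight)

    def forward(self, x):
        return self.up(self.down(x))


class TinyLoraMixin:
    @property
    def lora_scale(self) -> float:
        # read the scale set at run time by the pipeline, defaulting to 1.0
        return getattr(self, "_lora_scale", 1.0)

    def monkey_patch(self, module: nn.Linear, lora_layer: TinyLoRALinear):
        old_forward = module.forward

        def new_forward(x):
            # original projection plus the scaled LoRA contribution
            return old_forward(x) + self.lora_scale * lora_layer(x)

        # Monkey-patch the projection, as `_modify_text_encoder` does above.
        module.forward = new_forward


# usage: patch one projection layer, then adjust the scale at run time
proj = nn.Linear(8, 8)
pipe_like = TinyLoraMixin()
pipe_like.monkey_patch(proj, TinyLoRALinear(8))
pipe_like._lora_scale = 0.5  # what `_encode_prompt` sets from `lora_scale`
out = proj(torch.randn(2, 8))
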
""" + # set lora scale so that text encoder can correctly access it + if lora_scale is not None and isinstance(self, LoraLoaderMixin): + self._lora_scale = lora_scale + if prompt is not None and isinstance(prompt, str): batch_size = 1 elif prompt is not None and isinstance(prompt, list): From b7845987c582bdbb4970baba471df52455ead982 Mon Sep 17 00:00:00 2001 From: Patrick von Platen Date: Mon, 5 Jun 2023 14:20:41 +0000 Subject: [PATCH 02/12] Improve naming --- src/diffusers/loaders.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/diffusers/loaders.py b/src/diffusers/loaders.py index 955ee6ea20c2..5443d57ecec4 100644 --- a/src/diffusers/loaders.py +++ b/src/diffusers/loaders.py @@ -849,7 +849,7 @@ def load_lora_weights(self, pretrained_model_name_or_path_or_dict: Union[str, Di use_safetensors = kwargs.pop("use_safetensors", None) # set lora scale to a reasonable default - self._scale = 1 + self._lora_scale = 1 if use_safetensors and not is_safetensors_available(): raise ValueError( @@ -954,7 +954,7 @@ def load_lora_weights(self, pretrained_model_name_or_path_or_dict: Union[str, Di @property def lora_scale(self): - return self._scale + return self._lora_scale @property def text_encoder_lora_attn_procs(self): From d1ad30a7336a1b80a354269a715b75952ded5386 Mon Sep 17 00:00:00 2001 From: Sayak Paul Date: Tue, 6 Jun 2023 21:51:29 +0530 Subject: [PATCH 03/12] fix: training dreambooth lora script. --- examples/dreambooth/train_dreambooth_lora.py | 3 +++ tests/models/test_lora_layers.py | 6 ++++-- 2 files changed, 7 insertions(+), 2 deletions(-) diff --git a/examples/dreambooth/train_dreambooth_lora.py b/examples/dreambooth/train_dreambooth_lora.py index 3accc4265787..00dcbd5eb371 100644 --- a/examples/dreambooth/train_dreambooth_lora.py +++ b/examples/dreambooth/train_dreambooth_lora.py @@ -870,6 +870,9 @@ def main(args): temp_pipeline = DiffusionPipeline.from_pretrained( args.pretrained_model_name_or_path, text_encoder=text_encoder ) + # Setting the `_lora_scale` explicitly because we are not using + # `load_lora_weights()`. + temp_pipeline._lora_scale = 1.0 temp_pipeline._modify_text_encoder(text_lora_attn_procs) text_encoder = temp_pipeline.text_encoder del temp_pipeline diff --git a/tests/models/test_lora_layers.py b/tests/models/test_lora_layers.py index c5c26494898c..949c26ef2a1c 100644 --- a/tests/models/test_lora_layers.py +++ b/tests/models/test_lora_layers.py @@ -292,7 +292,8 @@ def test_text_encoder_lora_monkey_patch(self): del text_attn_procs gc.collect() - # set `_lora_scale` explicitly as we're not using `load_lora_weights()` here. + # set `_lora_scale` explicitly as we're not directly calling the + # pipeline here. pipe._lora_scale = 1.0 # inference with lora @@ -343,7 +344,8 @@ def test_text_encoder_lora_remove_monkey_patch(self): del text_attn_procs gc.collect() - # set `_lora_scale` explicitly as we're not using `load_lora_weights()` here. + # set `_lora_scale` explicitly as we're not directly calling the + # pipeline here. 
pipe._lora_scale = 1.0 # inference with lora From 24a4922fe3afd2f23e749eae02b2de520a4588fb Mon Sep 17 00:00:00 2001 From: Patrick von Platen Date: Tue, 6 Jun 2023 21:29:25 +0100 Subject: [PATCH 04/12] Apply suggestions from code review --- src/diffusers/loaders.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/src/diffusers/loaders.py b/src/diffusers/loaders.py index cc3ad980edb0..f2733bdb051b 100644 --- a/src/diffusers/loaders.py +++ b/src/diffusers/loaders.py @@ -958,7 +958,9 @@ def load_lora_weights(self, pretrained_model_name_or_path_or_dict: Union[str, Di @property def lora_scale(self): - return self._lora_scale + # property function that returns the lora scale which can be set at run time by the pipeline. + # if _lora_scale has not been set, return 1 + return self._lora_scale if hasattr(self, "_lora_scale") else 1 @property def text_encoder_lora_attn_procs(self): From 2f7dc905512ee73522be28881682d622a6bcd7ae Mon Sep 17 00:00:00 2001 From: Patrick von Platen Date: Tue, 6 Jun 2023 21:29:46 +0100 Subject: [PATCH 05/12] Update examples/dreambooth/train_dreambooth_lora.py --- examples/dreambooth/train_dreambooth_lora.py | 3 --- 1 file changed, 3 deletions(-) diff --git a/examples/dreambooth/train_dreambooth_lora.py b/examples/dreambooth/train_dreambooth_lora.py index 00dcbd5eb371..3accc4265787 100644 --- a/examples/dreambooth/train_dreambooth_lora.py +++ b/examples/dreambooth/train_dreambooth_lora.py @@ -870,9 +870,6 @@ def main(args): temp_pipeline = DiffusionPipeline.from_pretrained( args.pretrained_model_name_or_path, text_encoder=text_encoder ) - # Setting the `_lora_scale` explicitly because we are not using - # `load_lora_weights()`. - temp_pipeline._lora_scale = 1.0 temp_pipeline._modify_text_encoder(text_lora_attn_procs) text_encoder = temp_pipeline.text_encoder del temp_pipeline From c0c608802defef4bb1181aec584c442341e7dcc0 Mon Sep 17 00:00:00 2001 From: Patrick von Platen Date: Tue, 6 Jun 2023 21:30:29 +0100 Subject: [PATCH 06/12] Apply suggestions from code review --- src/diffusers/loaders.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/diffusers/loaders.py b/src/diffusers/loaders.py index f2733bdb051b..3c8c3e0bb2ef 100644 --- a/src/diffusers/loaders.py +++ b/src/diffusers/loaders.py @@ -957,10 +957,10 @@ def load_lora_weights(self, pretrained_model_name_or_path_or_dict: Union[str, Di warnings.warn(warn_message) @property - def lora_scale(self): + def lora_scale(self) -> float: # property function that returns the lora scale which can be set at run time by the pipeline. # if _lora_scale has not been set, return 1 - return self._lora_scale if hasattr(self, "_lora_scale") else 1 + return self._lora_scale if hasattr(self, "_lora_scale") else 1.0 @property def text_encoder_lora_attn_procs(self): From 2ad1f23d0fbd99e4f518f51bce1d648e03bb7a44 Mon Sep 17 00:00:00 2001 From: Patrick von Platen Date: Tue, 6 Jun 2023 21:31:46 +0100 Subject: [PATCH 07/12] Apply suggestions from code review --- tests/models/test_lora_layers.py | 8 -------- 1 file changed, 8 deletions(-) diff --git a/tests/models/test_lora_layers.py b/tests/models/test_lora_layers.py index 949c26ef2a1c..aaacf1e68f9f 100644 --- a/tests/models/test_lora_layers.py +++ b/tests/models/test_lora_layers.py @@ -292,10 +292,6 @@ def test_text_encoder_lora_monkey_patch(self): del text_attn_procs gc.collect() - # set `_lora_scale` explicitly as we're not directly calling the - # pipeline here. 
- pipe._lora_scale = 1.0 - # inference with lora outputs_with_lora = pipe.text_encoder(**dummy_tokens)[0] assert outputs_with_lora.shape == (1, 77, 32) @@ -344,10 +340,6 @@ def test_text_encoder_lora_remove_monkey_patch(self): del text_attn_procs gc.collect() - # set `_lora_scale` explicitly as we're not directly calling the - # pipeline here. - pipe._lora_scale = 1.0 - # inference with lora outputs_with_lora = pipe.text_encoder(**dummy_tokens)[0] assert outputs_with_lora.shape == (1, 77, 32) From 31b24c1859213252cfe39166fe0e9bb9659c79a5 Mon Sep 17 00:00:00 2001 From: Patrick von Platen Date: Tue, 6 Jun 2023 20:55:46 +0000 Subject: [PATCH 08/12] add lora mixin when fit --- hey.txt | 15 +++++++++++++++ .../alt_diffusion/pipeline_alt_diffusion.py | 12 ++++++++++-- .../pipelines/controlnet/pipeline_controlnet.py | 11 +++++++++-- .../controlnet/pipeline_controlnet_img2img.py | 11 +++++++++-- .../controlnet/pipeline_controlnet_inpaint.py | 11 +++++++++-- .../stable_diffusion/pipeline_cycle_diffusion.py | 11 +++++++++-- .../stable_diffusion/pipeline_stable_diffusion.py | 10 ++++++++-- ...pipeline_stable_diffusion_attend_and_excite.py | 9 ++++++++- .../pipeline_stable_diffusion_depth2img.py | 7 +++++++ .../pipeline_stable_diffusion_diffedit.py | 7 +++++++ .../pipeline_stable_diffusion_img2img.py | 7 +++++++ .../pipeline_stable_diffusion_inpaint.py | 7 +++++++ .../pipeline_stable_diffusion_inpaint_legacy.py | 7 +++++++ .../pipeline_stable_diffusion_k_diffusion.py | 11 +++++++++-- .../pipeline_stable_diffusion_model_editing.py | 11 +++++++++-- .../pipeline_stable_diffusion_panorama.py | 11 +++++++++-- .../pipeline_stable_diffusion_pix2pix_zero.py | 9 ++++++++- .../pipeline_stable_diffusion_sag.py | 9 ++++++++- .../pipeline_stable_diffusion_upscale.py | 11 +++++++++-- .../stable_diffusion/pipeline_stable_unclip.py | 11 +++++++++-- .../pipeline_stable_unclip_img2img.py | 11 +++++++++-- .../pipeline_text_to_video_synth.py | 11 +++++++++-- 22 files changed, 191 insertions(+), 29 deletions(-) create mode 100644 hey.txt diff --git a/hey.txt b/hey.txt new file mode 100644 index 000000000000..e0e25d7f2d20 --- /dev/null +++ b/hey.txt @@ -0,0 +1,15 @@ +src/diffusers/pipelines/alt_diffusion/pipeline_alt_diffusion.py +src/diffusers/pipelines/controlnet/pipeline_controlnet.py +src/diffusers/pipelines/controlnet/pipeline_controlnet_img2img.py +src/diffusers/pipelines/controlnet/pipeline_controlnet_inpaint.py +src/diffusers/pipelines/stable_diffusion/pipeline_cycle_diffusion.py +src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_attend_and_excite.py +src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_k_diffusion.py +src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_model_editing.py +src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_panorama.py +src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_pix2pix_zero.py +src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_sag.py +src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_upscale.py +src/diffusers/pipelines/stable_diffusion/pipeline_stable_unclip.py +src/diffusers/pipelines/stable_diffusion/pipeline_stable_unclip_img2img.py +src/diffusers/pipelines/text_to_video_synthesis/pipeline_text_to_video_synth.py diff --git a/src/diffusers/pipelines/alt_diffusion/pipeline_alt_diffusion.py b/src/diffusers/pipelines/alt_diffusion/pipeline_alt_diffusion.py index 8507684cf9b4..e531df9469ce 100644 --- a/src/diffusers/pipelines/alt_diffusion/pipeline_alt_diffusion.py 
+++ b/src/diffusers/pipelines/alt_diffusion/pipeline_alt_diffusion.py @@ -24,7 +24,7 @@ from ...configuration_utils import FrozenDict from ...image_processor import VaeImageProcessor -from ...loaders import TextualInversionLoaderMixin +from ...loaders import LoraLoaderMixin, TextualInversionLoaderMixin from ...models import AutoencoderKL, UNet2DConditionModel from ...schedulers import KarrasDiffusionSchedulers from ...utils import deprecate, logging, randn_tensor, replace_example_docstring @@ -52,7 +52,7 @@ # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline with Stable->Alt, CLIPTextModel->RobertaSeriesModelWithTransformation, CLIPTokenizer->XLMRobertaTokenizer, AltDiffusionSafetyChecker->StableDiffusionSafetyChecker -class AltDiffusionPipeline(DiffusionPipeline, TextualInversionLoaderMixin): +class AltDiffusionPipeline(DiffusionPipeline, TextualInversionLoaderMixin, LoraLoaderMixin): r""" Pipeline for text-to-image generation using Alt Diffusion. @@ -291,6 +291,7 @@ def _encode_prompt( negative_prompt=None, prompt_embeds: Optional[torch.FloatTensor] = None, negative_prompt_embeds: Optional[torch.FloatTensor] = None, + lora_scale: Optional[float] = None, ): r""" Encodes the prompt into text encoder hidden states. @@ -315,7 +316,13 @@ def _encode_prompt( Pre-generated negative text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not provided, negative_prompt_embeds will be generated from `negative_prompt` input argument. + lora_scale (`float`, *optional*): + A lora scale that will be applied to all LoRA layers of the text encoder if LoRA layers are loaded. """ + # set lora scale so that monkey patched LoRA function of text encoder can correctly access it + if lora_scale is not None and isinstance(self, LoraLoaderMixin): + self._lora_scale = lora_scale + if prompt is not None and isinstance(prompt, str): batch_size = 1 elif prompt is not None and isinstance(prompt, list): @@ -661,6 +668,7 @@ def __call__( negative_prompt, prompt_embeds=prompt_embeds, negative_prompt_embeds=negative_prompt_embeds, + lora_scale=cross_attention_kwargs.get("scale", None) if cross_attention_kwargs is not None else None, ) # 4. Prepare timesteps diff --git a/src/diffusers/pipelines/controlnet/pipeline_controlnet.py b/src/diffusers/pipelines/controlnet/pipeline_controlnet.py index 4ac43377c82a..962549be09d0 100644 --- a/src/diffusers/pipelines/controlnet/pipeline_controlnet.py +++ b/src/diffusers/pipelines/controlnet/pipeline_controlnet.py @@ -25,7 +25,7 @@ from transformers import CLIPImageProcessor, CLIPTextModel, CLIPTokenizer from ...image_processor import VaeImageProcessor -from ...loaders import TextualInversionLoaderMixin +from ...loaders import LoraLoaderMixin, TextualInversionLoaderMixin from ...models import AutoencoderKL, ControlNetModel, UNet2DConditionModel from ...schedulers import KarrasDiffusionSchedulers from ...utils import ( @@ -91,7 +91,7 @@ """ -class StableDiffusionControlNetPipeline(DiffusionPipeline, TextualInversionLoaderMixin): +class StableDiffusionControlNetPipeline(DiffusionPipeline, TextualInversionLoaderMixin, LoraLoaderMixin): r""" Pipeline for text-to-image generation using Stable Diffusion with ControlNet guidance. @@ -291,6 +291,7 @@ def _encode_prompt( negative_prompt=None, prompt_embeds: Optional[torch.FloatTensor] = None, negative_prompt_embeds: Optional[torch.FloatTensor] = None, + lora_scale: Optional[float] = None, ): r""" Encodes the prompt into text encoder hidden states. 
@@ -315,7 +316,13 @@ def _encode_prompt( Pre-generated negative text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not provided, negative_prompt_embeds will be generated from `negative_prompt` input argument. + lora_scale (`float`, *optional*): + A lora scale that will be applied to all LoRA layers of the text encoder if LoRA layers are loaded. """ + # set lora scale so that monkey patched LoRA function of text encoder can correctly access it + if lora_scale is not None and isinstance(self, LoraLoaderMixin): + self._lora_scale = lora_scale + if prompt is not None and isinstance(prompt, str): batch_size = 1 elif prompt is not None and isinstance(prompt, list): diff --git a/src/diffusers/pipelines/controlnet/pipeline_controlnet_img2img.py b/src/diffusers/pipelines/controlnet/pipeline_controlnet_img2img.py index 6667cf43ce46..bae8947ad88f 100644 --- a/src/diffusers/pipelines/controlnet/pipeline_controlnet_img2img.py +++ b/src/diffusers/pipelines/controlnet/pipeline_controlnet_img2img.py @@ -25,7 +25,7 @@ from transformers import CLIPImageProcessor, CLIPTextModel, CLIPTokenizer from ...image_processor import VaeImageProcessor -from ...loaders import TextualInversionLoaderMixin +from ...loaders import LoraLoaderMixin, TextualInversionLoaderMixin from ...models import AutoencoderKL, ControlNetModel, UNet2DConditionModel from ...schedulers import KarrasDiffusionSchedulers from ...utils import ( @@ -117,7 +117,7 @@ def prepare_image(image): return image -class StableDiffusionControlNetImg2ImgPipeline(DiffusionPipeline, TextualInversionLoaderMixin): +class StableDiffusionControlNetImg2ImgPipeline(DiffusionPipeline, TextualInversionLoaderMixin, LoraLoaderMixin): r""" Pipeline for text-to-image generation using Stable Diffusion with ControlNet guidance. @@ -317,6 +317,7 @@ def _encode_prompt( negative_prompt=None, prompt_embeds: Optional[torch.FloatTensor] = None, negative_prompt_embeds: Optional[torch.FloatTensor] = None, + lora_scale: Optional[float] = None, ): r""" Encodes the prompt into text encoder hidden states. @@ -341,7 +342,13 @@ def _encode_prompt( Pre-generated negative text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not provided, negative_prompt_embeds will be generated from `negative_prompt` input argument. + lora_scale (`float`, *optional*): + A lora scale that will be applied to all LoRA layers of the text encoder if LoRA layers are loaded. 
""" + # set lora scale so that monkey patched LoRA function of text encoder can correctly access it + if lora_scale is not None and isinstance(self, LoraLoaderMixin): + self._lora_scale = lora_scale + if prompt is not None and isinstance(prompt, str): batch_size = 1 elif prompt is not None and isinstance(prompt, list): diff --git a/src/diffusers/pipelines/controlnet/pipeline_controlnet_inpaint.py b/src/diffusers/pipelines/controlnet/pipeline_controlnet_inpaint.py index c20f2d518f96..3cef7dc3a9e1 100644 --- a/src/diffusers/pipelines/controlnet/pipeline_controlnet_inpaint.py +++ b/src/diffusers/pipelines/controlnet/pipeline_controlnet_inpaint.py @@ -26,7 +26,7 @@ from transformers import CLIPImageProcessor, CLIPTextModel, CLIPTokenizer from ...image_processor import VaeImageProcessor -from ...loaders import TextualInversionLoaderMixin +from ...loaders import LoraLoaderMixin, TextualInversionLoaderMixin from ...models import AutoencoderKL, ControlNetModel, UNet2DConditionModel from ...schedulers import KarrasDiffusionSchedulers from ...utils import ( @@ -223,7 +223,7 @@ def prepare_mask_and_masked_image(image, mask, height, width, return_image=False return mask, masked_image -class StableDiffusionControlNetInpaintPipeline(DiffusionPipeline, TextualInversionLoaderMixin): +class StableDiffusionControlNetInpaintPipeline(DiffusionPipeline, TextualInversionLoaderMixin, LoraLoaderMixin): r""" Pipeline for text-to-image generation using Stable Diffusion with ControlNet guidance. @@ -434,6 +434,7 @@ def _encode_prompt( negative_prompt=None, prompt_embeds: Optional[torch.FloatTensor] = None, negative_prompt_embeds: Optional[torch.FloatTensor] = None, + lora_scale: Optional[float] = None, ): r""" Encodes the prompt into text encoder hidden states. @@ -458,7 +459,13 @@ def _encode_prompt( Pre-generated negative text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not provided, negative_prompt_embeds will be generated from `negative_prompt` input argument. + lora_scale (`float`, *optional*): + A lora scale that will be applied to all LoRA layers of the text encoder if LoRA layers are loaded. 
""" + # set lora scale so that monkey patched LoRA function of text encoder can correctly access it + if lora_scale is not None and isinstance(self, LoraLoaderMixin): + self._lora_scale = lora_scale + if prompt is not None and isinstance(prompt, str): batch_size = 1 elif prompt is not None and isinstance(prompt, list): diff --git a/src/diffusers/pipelines/stable_diffusion/pipeline_cycle_diffusion.py b/src/diffusers/pipelines/stable_diffusion/pipeline_cycle_diffusion.py index 6b6df0945943..85fb9635b74d 100644 --- a/src/diffusers/pipelines/stable_diffusion/pipeline_cycle_diffusion.py +++ b/src/diffusers/pipelines/stable_diffusion/pipeline_cycle_diffusion.py @@ -26,7 +26,7 @@ from ...configuration_utils import FrozenDict from ...image_processor import VaeImageProcessor -from ...loaders import TextualInversionLoaderMixin +from ...loaders import LoraLoaderMixin, TextualInversionLoaderMixin from ...models import AutoencoderKL, UNet2DConditionModel from ...schedulers import DDIMScheduler from ...utils import PIL_INTERPOLATION, deprecate, logging, randn_tensor @@ -126,7 +126,7 @@ def compute_noise(scheduler, prev_latents, latents, timestep, noise_pred, eta): return noise -class CycleDiffusionPipeline(DiffusionPipeline, TextualInversionLoaderMixin): +class CycleDiffusionPipeline(DiffusionPipeline, TextualInversionLoaderMixin, LoraLoaderMixin): r""" Pipeline for text-guided image to image generation using Stable Diffusion. @@ -315,6 +315,7 @@ def _encode_prompt( negative_prompt=None, prompt_embeds: Optional[torch.FloatTensor] = None, negative_prompt_embeds: Optional[torch.FloatTensor] = None, + lora_scale: Optional[float] = None, ): r""" Encodes the prompt into text encoder hidden states. @@ -339,7 +340,13 @@ def _encode_prompt( Pre-generated negative text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not provided, negative_prompt_embeds will be generated from `negative_prompt` input argument. + lora_scale (`float`, *optional*): + A lora scale that will be applied to all LoRA layers of the text encoder if LoRA layers are loaded. """ + # set lora scale so that monkey patched LoRA function of text encoder can correctly access it + if lora_scale is not None and isinstance(self, LoraLoaderMixin): + self._lora_scale = lora_scale + if prompt is not None and isinstance(prompt, str): batch_size = 1 elif prompt is not None and isinstance(prompt, list): diff --git a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion.py b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion.py index 864125d7f788..dabe4d986d6b 100644 --- a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion.py +++ b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion.py @@ -319,8 +319,11 @@ def _encode_prompt( Pre-generated negative text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not provided, negative_prompt_embeds will be generated from `negative_prompt` input argument. + lora_scale (`float`, *optional*): + A lora scale that will be applied to all LoRA layers of the text encoder if LoRA layers are loaded. """ - # set lora scale so that text encoder can correctly access it + # set lora scale so that monkey patched LoRA + # function of text encoder can correctly access it if lora_scale is not None and isinstance(self, LoraLoaderMixin): self._lora_scale = lora_scale @@ -659,6 +662,9 @@ def __call__( do_classifier_free_guidance = guidance_scale > 1.0 # 3. 
Encode input prompt + text_encoder_lora_scale = ( + cross_attention_kwargs.get("scale", None) if cross_attention_kwargs is not None else None, + ) prompt_embeds = self._encode_prompt( prompt, device, @@ -667,7 +673,7 @@ def __call__( negative_prompt, prompt_embeds=prompt_embeds, negative_prompt_embeds=negative_prompt_embeds, - lora_scale=cross_attention_kwargs.get("scale", None) if cross_attention_kwargs is not None else None, + lora_scale=text_encoder_lora_scale, ) # 4. Prepare timesteps diff --git a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_attend_and_excite.py b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_attend_and_excite.py index 64e8577438ea..2d374bf71ea3 100644 --- a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_attend_and_excite.py +++ b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_attend_and_excite.py @@ -23,7 +23,7 @@ from transformers import CLIPImageProcessor, CLIPTextModel, CLIPTokenizer from ...image_processor import VaeImageProcessor -from ...loaders import TextualInversionLoaderMixin +from ...loaders import LoraLoaderMixin, TextualInversionLoaderMixin from ...models import AutoencoderKL, UNet2DConditionModel from ...models.attention_processor import Attention from ...schedulers import KarrasDiffusionSchedulers @@ -306,6 +306,7 @@ def _encode_prompt( negative_prompt=None, prompt_embeds: Optional[torch.FloatTensor] = None, negative_prompt_embeds: Optional[torch.FloatTensor] = None, + lora_scale: Optional[float] = None, ): r""" Encodes the prompt into text encoder hidden states. @@ -330,7 +331,13 @@ def _encode_prompt( Pre-generated negative text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not provided, negative_prompt_embeds will be generated from `negative_prompt` input argument. + lora_scale (`float`, *optional*): + A lora scale that will be applied to all LoRA layers of the text encoder if LoRA layers are loaded. """ + # set lora scale so that monkey patched LoRA function of text encoder can correctly access it + if lora_scale is not None and isinstance(self, LoraLoaderMixin): + self._lora_scale = lora_scale + if prompt is not None and isinstance(prompt, str): batch_size = 1 elif prompt is not None and isinstance(prompt, list): diff --git a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_depth2img.py b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_depth2img.py index 2fd4503a94ce..15a9b7339e41 100644 --- a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_depth2img.py +++ b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_depth2img.py @@ -183,6 +183,7 @@ def _encode_prompt( negative_prompt=None, prompt_embeds: Optional[torch.FloatTensor] = None, negative_prompt_embeds: Optional[torch.FloatTensor] = None, + lora_scale: Optional[float] = None, ): r""" Encodes the prompt into text encoder hidden states. @@ -207,7 +208,13 @@ def _encode_prompt( Pre-generated negative text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not provided, negative_prompt_embeds will be generated from `negative_prompt` input argument. + lora_scale (`float`, *optional*): + A lora scale that will be applied to all LoRA layers of the text encoder if LoRA layers are loaded. 
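
With the pipeline-side plumbing in place, the user-facing effect of the series can be sketched as follows (the model id, LoRA path, prompt, and scale value are illustrative placeholders): the same `scale` entry of `cross_attention_kwargs` that already modulated the UNet LoRA attention processors is now also forwarded to the text encoder LoRA layers via `lora_scale`.

import torch
from diffusers import StableDiffusionPipeline

pipe = StableDiffusionPipeline.from_pretrained(
    "runwayml/stable-diffusion-v1-5", torch_dtype=torch.float16
).to("cuda")
pipe.load_lora_weights("path/to/lora_weights")  # placeholder LoRA checkpoint

# 0.0 effectively disables the LoRA layers, 1.0 applies them at full strength
image = pipe(
    "a photo of a sks dog in a bucket",
    num_inference_steps=25,
    cross_attention_kwargs={"scale": 0.5},
).images[0]
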
""" + # set lora scale so that monkey patched LoRA function of text encoder can correctly access it + if lora_scale is not None and isinstance(self, LoraLoaderMixin): + self._lora_scale = lora_scale + if prompt is not None and isinstance(prompt, str): batch_size = 1 elif prompt is not None and isinstance(prompt, list): diff --git a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_diffedit.py b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_diffedit.py index 3c1ac58bcee4..1a776432d1c9 100644 --- a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_diffedit.py +++ b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_diffedit.py @@ -487,6 +487,7 @@ def _encode_prompt( negative_prompt=None, prompt_embeds: Optional[torch.FloatTensor] = None, negative_prompt_embeds: Optional[torch.FloatTensor] = None, + lora_scale: Optional[float] = None, ): r""" Encodes the prompt into text encoder hidden states. @@ -511,7 +512,13 @@ def _encode_prompt( Pre-generated negative text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not provided, negative_prompt_embeds will be generated from `negative_prompt` input argument. + lora_scale (`float`, *optional*): + A lora scale that will be applied to all LoRA layers of the text encoder if LoRA layers are loaded. """ + # set lora scale so that monkey patched LoRA function of text encoder can correctly access it + if lora_scale is not None and isinstance(self, LoraLoaderMixin): + self._lora_scale = lora_scale + if prompt is not None and isinstance(prompt, str): batch_size = 1 elif prompt is not None and isinstance(prompt, list): diff --git a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_img2img.py b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_img2img.py index 106b6528a982..62373d7cf7c2 100644 --- a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_img2img.py +++ b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_img2img.py @@ -309,6 +309,7 @@ def _encode_prompt( negative_prompt=None, prompt_embeds: Optional[torch.FloatTensor] = None, negative_prompt_embeds: Optional[torch.FloatTensor] = None, + lora_scale: Optional[float] = None, ): r""" Encodes the prompt into text encoder hidden states. @@ -333,7 +334,13 @@ def _encode_prompt( Pre-generated negative text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not provided, negative_prompt_embeds will be generated from `negative_prompt` input argument. + lora_scale (`float`, *optional*): + A lora scale that will be applied to all LoRA layers of the text encoder if LoRA layers are loaded. 
""" + # set lora scale so that monkey patched LoRA function of text encoder can correctly access it + if lora_scale is not None and isinstance(self, LoraLoaderMixin): + self._lora_scale = lora_scale + if prompt is not None and isinstance(prompt, str): batch_size = 1 elif prompt is not None and isinstance(prompt, list): diff --git a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_inpaint.py b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_inpaint.py index 9a6f5dbf9480..b9b91c842f0a 100644 --- a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_inpaint.py +++ b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_inpaint.py @@ -378,6 +378,7 @@ def _encode_prompt( negative_prompt=None, prompt_embeds: Optional[torch.FloatTensor] = None, negative_prompt_embeds: Optional[torch.FloatTensor] = None, + lora_scale: Optional[float] = None, ): r""" Encodes the prompt into text encoder hidden states. @@ -402,7 +403,13 @@ def _encode_prompt( Pre-generated negative text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not provided, negative_prompt_embeds will be generated from `negative_prompt` input argument. + lora_scale (`float`, *optional*): + A lora scale that will be applied to all LoRA layers of the text encoder if LoRA layers are loaded. """ + # set lora scale so that monkey patched LoRA function of text encoder can correctly access it + if lora_scale is not None and isinstance(self, LoraLoaderMixin): + self._lora_scale = lora_scale + if prompt is not None and isinstance(prompt, str): batch_size = 1 elif prompt is not None and isinstance(prompt, list): diff --git a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_inpaint_legacy.py b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_inpaint_legacy.py index c549d869e685..705ade8355da 100644 --- a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_inpaint_legacy.py +++ b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_inpaint_legacy.py @@ -304,6 +304,7 @@ def _encode_prompt( negative_prompt=None, prompt_embeds: Optional[torch.FloatTensor] = None, negative_prompt_embeds: Optional[torch.FloatTensor] = None, + lora_scale: Optional[float] = None, ): r""" Encodes the prompt into text encoder hidden states. @@ -328,7 +329,13 @@ def _encode_prompt( Pre-generated negative text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not provided, negative_prompt_embeds will be generated from `negative_prompt` input argument. + lora_scale (`float`, *optional*): + A lora scale that will be applied to all LoRA layers of the text encoder if LoRA layers are loaded. 
""" + # set lora scale so that monkey patched LoRA function of text encoder can correctly access it + if lora_scale is not None and isinstance(self, LoraLoaderMixin): + self._lora_scale = lora_scale + if prompt is not None and isinstance(prompt, str): batch_size = 1 elif prompt is not None and isinstance(prompt, list): diff --git a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_k_diffusion.py b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_k_diffusion.py index 39601ac36c33..c02e62712fdc 100755 --- a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_k_diffusion.py +++ b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_k_diffusion.py @@ -21,7 +21,7 @@ from k_diffusion.sampling import get_sigmas_karras from ...image_processor import VaeImageProcessor -from ...loaders import TextualInversionLoaderMixin +from ...loaders import LoraLoaderMixin, TextualInversionLoaderMixin from ...pipelines import DiffusionPipeline from ...schedulers import LMSDiscreteScheduler from ...utils import is_accelerate_available, is_accelerate_version, logging, randn_tensor @@ -45,7 +45,7 @@ def apply_model(self, *args, **kwargs): return self.model(*args, encoder_hidden_states=encoder_hidden_states, **kwargs).sample -class StableDiffusionKDiffusionPipeline(DiffusionPipeline, TextualInversionLoaderMixin): +class StableDiffusionKDiffusionPipeline(DiffusionPipeline, TextualInversionLoaderMixin, LoraLoaderMixin): r""" Pipeline for text-to-image generation using Stable Diffusion. @@ -210,6 +210,7 @@ def _encode_prompt( negative_prompt=None, prompt_embeds: Optional[torch.FloatTensor] = None, negative_prompt_embeds: Optional[torch.FloatTensor] = None, + lora_scale: Optional[float] = None, ): r""" Encodes the prompt into text encoder hidden states. @@ -234,7 +235,13 @@ def _encode_prompt( Pre-generated negative text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not provided, negative_prompt_embeds will be generated from `negative_prompt` input argument. + lora_scale (`float`, *optional*): + A lora scale that will be applied to all LoRA layers of the text encoder if LoRA layers are loaded. 
""" + # set lora scale so that monkey patched LoRA function of text encoder can correctly access it + if lora_scale is not None and isinstance(self, LoraLoaderMixin): + self._lora_scale = lora_scale + if prompt is not None and isinstance(prompt, str): batch_size = 1 elif prompt is not None and isinstance(prompt, list): diff --git a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_model_editing.py b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_model_editing.py index 7bc6b466b46f..605901fae408 100644 --- a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_model_editing.py +++ b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_model_editing.py @@ -20,7 +20,7 @@ from transformers import CLIPFeatureExtractor, CLIPTextModel, CLIPTokenizer from ...image_processor import VaeImageProcessor -from ...loaders import TextualInversionLoaderMixin +from ...loaders import LoraLoaderMixin, TextualInversionLoaderMixin from ...models import AutoencoderKL, UNet2DConditionModel from ...schedulers import PNDMScheduler from ...schedulers.scheduling_utils import SchedulerMixin @@ -55,7 +55,7 @@ """ -class StableDiffusionModelEditingPipeline(DiffusionPipeline, TextualInversionLoaderMixin): +class StableDiffusionModelEditingPipeline(DiffusionPipeline, TextualInversionLoaderMixin, LoraLoaderMixin): r""" Pipeline for text-to-image model editing using "Editing Implicit Assumptions in Text-to-Image Diffusion Models". @@ -237,6 +237,7 @@ def _encode_prompt( negative_prompt=None, prompt_embeds: Optional[torch.FloatTensor] = None, negative_prompt_embeds: Optional[torch.FloatTensor] = None, + lora_scale: Optional[float] = None, ): r""" Encodes the prompt into text encoder hidden states. @@ -261,7 +262,13 @@ def _encode_prompt( Pre-generated negative text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not provided, negative_prompt_embeds will be generated from `negative_prompt` input argument. + lora_scale (`float`, *optional*): + A lora scale that will be applied to all LoRA layers of the text encoder if LoRA layers are loaded. 
""" + # set lora scale so that monkey patched LoRA function of text encoder can correctly access it + if lora_scale is not None and isinstance(self, LoraLoaderMixin): + self._lora_scale = lora_scale + if prompt is not None and isinstance(prompt, str): batch_size = 1 elif prompt is not None and isinstance(prompt, list): diff --git a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_panorama.py b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_panorama.py index 66706c806a81..8f8d35ad3d6d 100644 --- a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_panorama.py +++ b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_panorama.py @@ -20,7 +20,7 @@ from transformers import CLIPImageProcessor, CLIPTextModel, CLIPTokenizer from ...image_processor import VaeImageProcessor -from ...loaders import TextualInversionLoaderMixin +from ...loaders import LoraLoaderMixin, TextualInversionLoaderMixin from ...models import AutoencoderKL, UNet2DConditionModel from ...schedulers import DDIMScheduler from ...utils import is_accelerate_available, is_accelerate_version, logging, randn_tensor, replace_example_docstring @@ -51,7 +51,7 @@ """ -class StableDiffusionPanoramaPipeline(DiffusionPipeline, TextualInversionLoaderMixin): +class StableDiffusionPanoramaPipeline(DiffusionPipeline, TextualInversionLoaderMixin, LoraLoaderMixin): r""" Pipeline for text-to-image generation using "MultiDiffusion: Fusing Diffusion Paths for Controlled Image Generation". @@ -199,6 +199,7 @@ def _encode_prompt( negative_prompt=None, prompt_embeds: Optional[torch.FloatTensor] = None, negative_prompt_embeds: Optional[torch.FloatTensor] = None, + lora_scale: Optional[float] = None, ): r""" Encodes the prompt into text encoder hidden states. @@ -223,7 +224,13 @@ def _encode_prompt( Pre-generated negative text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not provided, negative_prompt_embeds will be generated from `negative_prompt` input argument. + lora_scale (`float`, *optional*): + A lora scale that will be applied to all LoRA layers of the text encoder if LoRA layers are loaded. 
""" + # set lora scale so that monkey patched LoRA function of text encoder can correctly access it + if lora_scale is not None and isinstance(self, LoraLoaderMixin): + self._lora_scale = lora_scale + if prompt is not None and isinstance(prompt, str): batch_size = 1 elif prompt is not None and isinstance(prompt, list): diff --git a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_pix2pix_zero.py b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_pix2pix_zero.py index 3332cc89d96c..60f0a9611f44 100644 --- a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_pix2pix_zero.py +++ b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_pix2pix_zero.py @@ -30,7 +30,7 @@ ) from ...image_processor import VaeImageProcessor -from ...loaders import TextualInversionLoaderMixin +from ...loaders import LoraLoaderMixin, TextualInversionLoaderMixin from ...models import AutoencoderKL, UNet2DConditionModel from ...models.attention_processor import Attention from ...schedulers import DDIMScheduler, DDPMScheduler, EulerAncestralDiscreteScheduler, LMSDiscreteScheduler @@ -447,6 +447,7 @@ def _encode_prompt( negative_prompt=None, prompt_embeds: Optional[torch.FloatTensor] = None, negative_prompt_embeds: Optional[torch.FloatTensor] = None, + lora_scale: Optional[float] = None, ): r""" Encodes the prompt into text encoder hidden states. @@ -471,7 +472,13 @@ def _encode_prompt( Pre-generated negative text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not provided, negative_prompt_embeds will be generated from `negative_prompt` input argument. + lora_scale (`float`, *optional*): + A lora scale that will be applied to all LoRA layers of the text encoder if LoRA layers are loaded. """ + # set lora scale so that monkey patched LoRA function of text encoder can correctly access it + if lora_scale is not None and isinstance(self, LoraLoaderMixin): + self._lora_scale = lora_scale + if prompt is not None and isinstance(prompt, str): batch_size = 1 elif prompt is not None and isinstance(prompt, list): diff --git a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_sag.py b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_sag.py index db3c148f04e5..26528e43a398 100644 --- a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_sag.py +++ b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_sag.py @@ -21,7 +21,7 @@ from transformers import CLIPImageProcessor, CLIPTextModel, CLIPTokenizer from ...image_processor import VaeImageProcessor -from ...loaders import TextualInversionLoaderMixin +from ...loaders import LoraLoaderMixin, TextualInversionLoaderMixin from ...models import AutoencoderKL, UNet2DConditionModel from ...schedulers import KarrasDiffusionSchedulers from ...utils import is_accelerate_available, is_accelerate_version, logging, randn_tensor, replace_example_docstring @@ -218,6 +218,7 @@ def _encode_prompt( negative_prompt=None, prompt_embeds: Optional[torch.FloatTensor] = None, negative_prompt_embeds: Optional[torch.FloatTensor] = None, + lora_scale: Optional[float] = None, ): r""" Encodes the prompt into text encoder hidden states. @@ -242,7 +243,13 @@ def _encode_prompt( Pre-generated negative text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not provided, negative_prompt_embeds will be generated from `negative_prompt` input argument. 
+ lora_scale (`float`, *optional*): + A lora scale that will be applied to all LoRA layers of the text encoder if LoRA layers are loaded. """ + # set lora scale so that monkey patched LoRA function of text encoder can correctly access it + if lora_scale is not None and isinstance(self, LoraLoaderMixin): + self._lora_scale = lora_scale + if prompt is not None and isinstance(prompt, str): batch_size = 1 elif prompt is not None and isinstance(prompt, list): diff --git a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_upscale.py b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_upscale.py index 6bb463a6a65f..f7c9e7ada289 100644 --- a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_upscale.py +++ b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_upscale.py @@ -21,7 +21,7 @@ import torch from transformers import CLIPImageProcessor, CLIPTextModel, CLIPTokenizer -from ...loaders import TextualInversionLoaderMixin +from ...loaders import LoraLoaderMixin, TextualInversionLoaderMixin from ...models import AutoencoderKL, UNet2DConditionModel from ...models.attention_processor import AttnProcessor2_0, LoRAXFormersAttnProcessor, XFormersAttnProcessor from ...schedulers import DDPMScheduler, KarrasDiffusionSchedulers @@ -54,7 +54,7 @@ def preprocess(image): return image -class StableDiffusionUpscalePipeline(DiffusionPipeline, TextualInversionLoaderMixin): +class StableDiffusionUpscalePipeline(DiffusionPipeline, TextualInversionLoaderMixin, LoraLoaderMixin): r""" Pipeline for text-guided image super-resolution using Stable Diffusion 2. @@ -216,6 +216,7 @@ def _encode_prompt( negative_prompt=None, prompt_embeds: Optional[torch.FloatTensor] = None, negative_prompt_embeds: Optional[torch.FloatTensor] = None, + lora_scale: Optional[float] = None, ): r""" Encodes the prompt into text encoder hidden states. @@ -240,7 +241,13 @@ def _encode_prompt( Pre-generated negative text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not provided, negative_prompt_embeds will be generated from `negative_prompt` input argument. + lora_scale (`float`, *optional*): + A lora scale that will be applied to all LoRA layers of the text encoder if LoRA layers are loaded. 
""" + # set lora scale so that monkey patched LoRA function of text encoder can correctly access it + if lora_scale is not None and isinstance(self, LoraLoaderMixin): + self._lora_scale = lora_scale + if prompt is not None and isinstance(prompt, str): batch_size = 1 elif prompt is not None and isinstance(prompt, list): diff --git a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_unclip.py b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_unclip.py index fb907f49553c..a1042cfd4f6e 100644 --- a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_unclip.py +++ b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_unclip.py @@ -21,7 +21,7 @@ from transformers.models.clip.modeling_clip import CLIPTextModelOutput from ...image_processor import VaeImageProcessor -from ...loaders import TextualInversionLoaderMixin +from ...loaders import LoraLoaderMixin, TextualInversionLoaderMixin from ...models import AutoencoderKL, PriorTransformer, UNet2DConditionModel from ...models.embeddings import get_timestep_embedding from ...schedulers import KarrasDiffusionSchedulers @@ -50,7 +50,7 @@ """ -class StableUnCLIPPipeline(DiffusionPipeline, TextualInversionLoaderMixin): +class StableUnCLIPPipeline(DiffusionPipeline, TextualInversionLoaderMixin, LoraLoaderMixin): """ Pipeline for text-to-image generation using stable unCLIP. @@ -338,6 +338,7 @@ def _encode_prompt( negative_prompt=None, prompt_embeds: Optional[torch.FloatTensor] = None, negative_prompt_embeds: Optional[torch.FloatTensor] = None, + lora_scale: Optional[float] = None, ): r""" Encodes the prompt into text encoder hidden states. @@ -362,7 +363,13 @@ def _encode_prompt( Pre-generated negative text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not provided, negative_prompt_embeds will be generated from `negative_prompt` input argument. + lora_scale (`float`, *optional*): + A lora scale that will be applied to all LoRA layers of the text encoder if LoRA layers are loaded. """ + # set lora scale so that monkey patched LoRA function of text encoder can correctly access it + if lora_scale is not None and isinstance(self, LoraLoaderMixin): + self._lora_scale = lora_scale + if prompt is not None and isinstance(prompt, str): batch_size = 1 elif prompt is not None and isinstance(prompt, list): diff --git a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_unclip_img2img.py b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_unclip_img2img.py index 44916049e29f..dd5a245e895d 100644 --- a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_unclip_img2img.py +++ b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_unclip_img2img.py @@ -23,7 +23,7 @@ from diffusers.utils.import_utils import is_accelerate_available from ...image_processor import VaeImageProcessor -from ...loaders import TextualInversionLoaderMixin +from ...loaders import LoraLoaderMixin, TextualInversionLoaderMixin from ...models import AutoencoderKL, UNet2DConditionModel from ...models.embeddings import get_timestep_embedding from ...schedulers import KarrasDiffusionSchedulers @@ -63,7 +63,7 @@ """ -class StableUnCLIPImg2ImgPipeline(DiffusionPipeline, TextualInversionLoaderMixin): +class StableUnCLIPImg2ImgPipeline(DiffusionPipeline, TextualInversionLoaderMixin, LoraLoaderMixin): """ Pipeline for text-guided image to image generation using stable unCLIP. 
@@ -238,6 +238,7 @@ def _encode_prompt( negative_prompt=None, prompt_embeds: Optional[torch.FloatTensor] = None, negative_prompt_embeds: Optional[torch.FloatTensor] = None, + lora_scale: Optional[float] = None, ): r""" Encodes the prompt into text encoder hidden states. @@ -262,7 +263,13 @@ def _encode_prompt( Pre-generated negative text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not provided, negative_prompt_embeds will be generated from `negative_prompt` input argument. + lora_scale (`float`, *optional*): + A lora scale that will be applied to all LoRA layers of the text encoder if LoRA layers are loaded. """ + # set lora scale so that monkey patched LoRA function of text encoder can correctly access it + if lora_scale is not None and isinstance(self, LoraLoaderMixin): + self._lora_scale = lora_scale + if prompt is not None and isinstance(prompt, str): batch_size = 1 elif prompt is not None and isinstance(prompt, list): diff --git a/src/diffusers/pipelines/text_to_video_synthesis/pipeline_text_to_video_synth.py b/src/diffusers/pipelines/text_to_video_synthesis/pipeline_text_to_video_synth.py index a44b6f1d0744..ed08323d8ec6 100644 --- a/src/diffusers/pipelines/text_to_video_synthesis/pipeline_text_to_video_synth.py +++ b/src/diffusers/pipelines/text_to_video_synthesis/pipeline_text_to_video_synth.py @@ -19,7 +19,7 @@ import torch from transformers import CLIPTextModel, CLIPTokenizer -from ...loaders import TextualInversionLoaderMixin +from ...loaders import LoraLoaderMixin, TextualInversionLoaderMixin from ...models import AutoencoderKL, UNet3DConditionModel from ...schedulers import KarrasDiffusionSchedulers from ...utils import ( @@ -73,7 +73,7 @@ def tensor2vid(video: torch.Tensor, mean=[0.5, 0.5, 0.5], std=[0.5, 0.5, 0.5]) - return images -class TextToVideoSDPipeline(DiffusionPipeline, TextualInversionLoaderMixin): +class TextToVideoSDPipeline(DiffusionPipeline, TextualInversionLoaderMixin, LoraLoaderMixin): r""" Pipeline for text-to-video generation. @@ -224,6 +224,7 @@ def _encode_prompt( negative_prompt=None, prompt_embeds: Optional[torch.FloatTensor] = None, negative_prompt_embeds: Optional[torch.FloatTensor] = None, + lora_scale: Optional[float] = None, ): r""" Encodes the prompt into text encoder hidden states. @@ -248,7 +249,13 @@ def _encode_prompt( Pre-generated negative text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not provided, negative_prompt_embeds will be generated from `negative_prompt` input argument. + lora_scale (`float`, *optional*): + A lora scale that will be applied to all LoRA layers of the text encoder if LoRA layers are loaded. 
""" + # set lora scale so that monkey patched LoRA function of text encoder can correctly access it + if lora_scale is not None and isinstance(self, LoraLoaderMixin): + self._lora_scale = lora_scale + if prompt is not None and isinstance(prompt, str): batch_size = 1 elif prompt is not None and isinstance(prompt, list): From 736eb40d7ec01e89c46aa6dc91b699a19904114e Mon Sep 17 00:00:00 2001 From: Patrick von Platen Date: Tue, 6 Jun 2023 20:56:41 +0000 Subject: [PATCH 09/12] add lora mixin when fit --- .../pipelines/alt_diffusion/pipeline_alt_diffusion.py | 8 ++++++-- .../alt_diffusion/pipeline_alt_diffusion_img2img.py | 7 +++++++ src/diffusers/pipelines/controlnet/pipeline_controlnet.py | 3 ++- .../pipelines/controlnet/pipeline_controlnet_img2img.py | 3 ++- .../pipelines/controlnet/pipeline_controlnet_inpaint.py | 3 ++- .../stable_diffusion/pipeline_cycle_diffusion.py | 3 ++- .../pipeline_stable_diffusion_attend_and_excite.py | 3 ++- .../pipeline_stable_diffusion_depth2img.py | 3 ++- .../pipeline_stable_diffusion_diffedit.py | 3 ++- .../stable_diffusion/pipeline_stable_diffusion_img2img.py | 3 ++- .../stable_diffusion/pipeline_stable_diffusion_inpaint.py | 3 ++- .../pipeline_stable_diffusion_inpaint_legacy.py | 3 ++- .../pipeline_stable_diffusion_k_diffusion.py | 3 ++- .../pipeline_stable_diffusion_model_editing.py | 3 ++- .../pipeline_stable_diffusion_panorama.py | 3 ++- .../pipeline_stable_diffusion_pix2pix_zero.py | 3 ++- .../stable_diffusion/pipeline_stable_diffusion_sag.py | 3 ++- .../stable_diffusion/pipeline_stable_diffusion_upscale.py | 3 ++- .../pipelines/stable_diffusion/pipeline_stable_unclip.py | 3 ++- .../stable_diffusion/pipeline_stable_unclip_img2img.py | 3 ++- .../pipeline_text_to_video_synth.py | 3 ++- 21 files changed, 51 insertions(+), 21 deletions(-) diff --git a/src/diffusers/pipelines/alt_diffusion/pipeline_alt_diffusion.py b/src/diffusers/pipelines/alt_diffusion/pipeline_alt_diffusion.py index e531df9469ce..0109600dd502 100644 --- a/src/diffusers/pipelines/alt_diffusion/pipeline_alt_diffusion.py +++ b/src/diffusers/pipelines/alt_diffusion/pipeline_alt_diffusion.py @@ -319,7 +319,8 @@ def _encode_prompt( lora_scale (`float`, *optional*): A lora scale that will be applied to all LoRA layers of the text encoder if LoRA layers are loaded. """ - # set lora scale so that monkey patched LoRA function of text encoder can correctly access it + # set lora scale so that monkey patched LoRA + # function of text encoder can correctly access it if lora_scale is not None and isinstance(self, LoraLoaderMixin): self._lora_scale = lora_scale @@ -660,6 +661,9 @@ def __call__( do_classifier_free_guidance = guidance_scale > 1.0 # 3. Encode input prompt + text_encoder_lora_scale = ( + cross_attention_kwargs.get("scale", None) if cross_attention_kwargs is not None else None, + ) prompt_embeds = self._encode_prompt( prompt, device, @@ -668,7 +672,7 @@ def __call__( negative_prompt, prompt_embeds=prompt_embeds, negative_prompt_embeds=negative_prompt_embeds, - lora_scale=cross_attention_kwargs.get("scale", None) if cross_attention_kwargs is not None else None, + lora_scale=text_encoder_lora_scale, ) # 4. 
Prepare timesteps diff --git a/src/diffusers/pipelines/alt_diffusion/pipeline_alt_diffusion_img2img.py b/src/diffusers/pipelines/alt_diffusion/pipeline_alt_diffusion_img2img.py index f0d4d91ce966..a3ec9552f13d 100644 --- a/src/diffusers/pipelines/alt_diffusion/pipeline_alt_diffusion_img2img.py +++ b/src/diffusers/pipelines/alt_diffusion/pipeline_alt_diffusion_img2img.py @@ -302,6 +302,7 @@ def _encode_prompt( negative_prompt=None, prompt_embeds: Optional[torch.FloatTensor] = None, negative_prompt_embeds: Optional[torch.FloatTensor] = None, + lora_scale: Optional[float] = None, ): r""" Encodes the prompt into text encoder hidden states. @@ -326,7 +327,13 @@ def _encode_prompt( Pre-generated negative text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not provided, negative_prompt_embeds will be generated from `negative_prompt` input argument. + lora_scale (`float`, *optional*): + A lora scale that will be applied to all LoRA layers of the text encoder if LoRA layers are loaded. """ + # set lora scale so that monkey patched LoRA function of text encoder can correctly access it + if lora_scale is not None and isinstance(self, LoraLoaderMixin): + self._lora_scale = lora_scale + if prompt is not None and isinstance(prompt, str): batch_size = 1 elif prompt is not None and isinstance(prompt, list): diff --git a/src/diffusers/pipelines/controlnet/pipeline_controlnet.py b/src/diffusers/pipelines/controlnet/pipeline_controlnet.py index 962549be09d0..bf692c146b4f 100644 --- a/src/diffusers/pipelines/controlnet/pipeline_controlnet.py +++ b/src/diffusers/pipelines/controlnet/pipeline_controlnet.py @@ -319,7 +319,8 @@ def _encode_prompt( lora_scale (`float`, *optional*): A lora scale that will be applied to all LoRA layers of the text encoder if LoRA layers are loaded. """ - # set lora scale so that monkey patched LoRA function of text encoder can correctly access it + # set lora scale so that monkey patched LoRA + # function of text encoder can correctly access it if lora_scale is not None and isinstance(self, LoraLoaderMixin): self._lora_scale = lora_scale diff --git a/src/diffusers/pipelines/controlnet/pipeline_controlnet_img2img.py b/src/diffusers/pipelines/controlnet/pipeline_controlnet_img2img.py index bae8947ad88f..f271f4998b35 100644 --- a/src/diffusers/pipelines/controlnet/pipeline_controlnet_img2img.py +++ b/src/diffusers/pipelines/controlnet/pipeline_controlnet_img2img.py @@ -345,7 +345,8 @@ def _encode_prompt( lora_scale (`float`, *optional*): A lora scale that will be applied to all LoRA layers of the text encoder if LoRA layers are loaded. """ - # set lora scale so that monkey patched LoRA function of text encoder can correctly access it + # set lora scale so that monkey patched LoRA + # function of text encoder can correctly access it if lora_scale is not None and isinstance(self, LoraLoaderMixin): self._lora_scale = lora_scale diff --git a/src/diffusers/pipelines/controlnet/pipeline_controlnet_inpaint.py b/src/diffusers/pipelines/controlnet/pipeline_controlnet_inpaint.py index 3cef7dc3a9e1..66fecd7cd9f8 100644 --- a/src/diffusers/pipelines/controlnet/pipeline_controlnet_inpaint.py +++ b/src/diffusers/pipelines/controlnet/pipeline_controlnet_inpaint.py @@ -462,7 +462,8 @@ def _encode_prompt( lora_scale (`float`, *optional*): A lora scale that will be applied to all LoRA layers of the text encoder if LoRA layers are loaded. 
""" - # set lora scale so that monkey patched LoRA function of text encoder can correctly access it + # set lora scale so that monkey patched LoRA + # function of text encoder can correctly access it if lora_scale is not None and isinstance(self, LoraLoaderMixin): self._lora_scale = lora_scale diff --git a/src/diffusers/pipelines/stable_diffusion/pipeline_cycle_diffusion.py b/src/diffusers/pipelines/stable_diffusion/pipeline_cycle_diffusion.py index 85fb9635b74d..feb9f4522f66 100644 --- a/src/diffusers/pipelines/stable_diffusion/pipeline_cycle_diffusion.py +++ b/src/diffusers/pipelines/stable_diffusion/pipeline_cycle_diffusion.py @@ -343,7 +343,8 @@ def _encode_prompt( lora_scale (`float`, *optional*): A lora scale that will be applied to all LoRA layers of the text encoder if LoRA layers are loaded. """ - # set lora scale so that monkey patched LoRA function of text encoder can correctly access it + # set lora scale so that monkey patched LoRA + # function of text encoder can correctly access it if lora_scale is not None and isinstance(self, LoraLoaderMixin): self._lora_scale = lora_scale diff --git a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_attend_and_excite.py b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_attend_and_excite.py index 2d374bf71ea3..f76268463707 100644 --- a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_attend_and_excite.py +++ b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_attend_and_excite.py @@ -334,7 +334,8 @@ def _encode_prompt( lora_scale (`float`, *optional*): A lora scale that will be applied to all LoRA layers of the text encoder if LoRA layers are loaded. """ - # set lora scale so that monkey patched LoRA function of text encoder can correctly access it + # set lora scale so that monkey patched LoRA + # function of text encoder can correctly access it if lora_scale is not None and isinstance(self, LoraLoaderMixin): self._lora_scale = lora_scale diff --git a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_depth2img.py b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_depth2img.py index 15a9b7339e41..bd5ad6d0c0f4 100644 --- a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_depth2img.py +++ b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_depth2img.py @@ -211,7 +211,8 @@ def _encode_prompt( lora_scale (`float`, *optional*): A lora scale that will be applied to all LoRA layers of the text encoder if LoRA layers are loaded. """ - # set lora scale so that monkey patched LoRA function of text encoder can correctly access it + # set lora scale so that monkey patched LoRA + # function of text encoder can correctly access it if lora_scale is not None and isinstance(self, LoraLoaderMixin): self._lora_scale = lora_scale diff --git a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_diffedit.py b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_diffedit.py index 1a776432d1c9..d20984478045 100644 --- a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_diffedit.py +++ b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_diffedit.py @@ -515,7 +515,8 @@ def _encode_prompt( lora_scale (`float`, *optional*): A lora scale that will be applied to all LoRA layers of the text encoder if LoRA layers are loaded. 
""" - # set lora scale so that monkey patched LoRA function of text encoder can correctly access it + # set lora scale so that monkey patched LoRA + # function of text encoder can correctly access it if lora_scale is not None and isinstance(self, LoraLoaderMixin): self._lora_scale = lora_scale diff --git a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_img2img.py b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_img2img.py index 62373d7cf7c2..de2ebb452749 100644 --- a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_img2img.py +++ b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_img2img.py @@ -337,7 +337,8 @@ def _encode_prompt( lora_scale (`float`, *optional*): A lora scale that will be applied to all LoRA layers of the text encoder if LoRA layers are loaded. """ - # set lora scale so that monkey patched LoRA function of text encoder can correctly access it + # set lora scale so that monkey patched LoRA + # function of text encoder can correctly access it if lora_scale is not None and isinstance(self, LoraLoaderMixin): self._lora_scale = lora_scale diff --git a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_inpaint.py b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_inpaint.py index b9b91c842f0a..c8ce7bd6f472 100644 --- a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_inpaint.py +++ b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_inpaint.py @@ -406,7 +406,8 @@ def _encode_prompt( lora_scale (`float`, *optional*): A lora scale that will be applied to all LoRA layers of the text encoder if LoRA layers are loaded. """ - # set lora scale so that monkey patched LoRA function of text encoder can correctly access it + # set lora scale so that monkey patched LoRA + # function of text encoder can correctly access it if lora_scale is not None and isinstance(self, LoraLoaderMixin): self._lora_scale = lora_scale diff --git a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_inpaint_legacy.py b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_inpaint_legacy.py index 705ade8355da..0c033748c214 100644 --- a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_inpaint_legacy.py +++ b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_inpaint_legacy.py @@ -332,7 +332,8 @@ def _encode_prompt( lora_scale (`float`, *optional*): A lora scale that will be applied to all LoRA layers of the text encoder if LoRA layers are loaded. """ - # set lora scale so that monkey patched LoRA function of text encoder can correctly access it + # set lora scale so that monkey patched LoRA + # function of text encoder can correctly access it if lora_scale is not None and isinstance(self, LoraLoaderMixin): self._lora_scale = lora_scale diff --git a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_k_diffusion.py b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_k_diffusion.py index c02e62712fdc..e2800342e578 100755 --- a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_k_diffusion.py +++ b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_k_diffusion.py @@ -238,7 +238,8 @@ def _encode_prompt( lora_scale (`float`, *optional*): A lora scale that will be applied to all LoRA layers of the text encoder if LoRA layers are loaded. 
""" - # set lora scale so that monkey patched LoRA function of text encoder can correctly access it + # set lora scale so that monkey patched LoRA + # function of text encoder can correctly access it if lora_scale is not None and isinstance(self, LoraLoaderMixin): self._lora_scale = lora_scale diff --git a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_model_editing.py b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_model_editing.py index 605901fae408..ea44025c58b0 100644 --- a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_model_editing.py +++ b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_model_editing.py @@ -265,7 +265,8 @@ def _encode_prompt( lora_scale (`float`, *optional*): A lora scale that will be applied to all LoRA layers of the text encoder if LoRA layers are loaded. """ - # set lora scale so that monkey patched LoRA function of text encoder can correctly access it + # set lora scale so that monkey patched LoRA + # function of text encoder can correctly access it if lora_scale is not None and isinstance(self, LoraLoaderMixin): self._lora_scale = lora_scale diff --git a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_panorama.py b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_panorama.py index 8f8d35ad3d6d..b1f621a70f54 100644 --- a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_panorama.py +++ b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_panorama.py @@ -227,7 +227,8 @@ def _encode_prompt( lora_scale (`float`, *optional*): A lora scale that will be applied to all LoRA layers of the text encoder if LoRA layers are loaded. """ - # set lora scale so that monkey patched LoRA function of text encoder can correctly access it + # set lora scale so that monkey patched LoRA + # function of text encoder can correctly access it if lora_scale is not None and isinstance(self, LoraLoaderMixin): self._lora_scale = lora_scale diff --git a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_pix2pix_zero.py b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_pix2pix_zero.py index 60f0a9611f44..75ac4f777756 100644 --- a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_pix2pix_zero.py +++ b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_pix2pix_zero.py @@ -475,7 +475,8 @@ def _encode_prompt( lora_scale (`float`, *optional*): A lora scale that will be applied to all LoRA layers of the text encoder if LoRA layers are loaded. """ - # set lora scale so that monkey patched LoRA function of text encoder can correctly access it + # set lora scale so that monkey patched LoRA + # function of text encoder can correctly access it if lora_scale is not None and isinstance(self, LoraLoaderMixin): self._lora_scale = lora_scale diff --git a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_sag.py b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_sag.py index 26528e43a398..ba1c0d2b9d49 100644 --- a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_sag.py +++ b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_sag.py @@ -246,7 +246,8 @@ def _encode_prompt( lora_scale (`float`, *optional*): A lora scale that will be applied to all LoRA layers of the text encoder if LoRA layers are loaded. 
""" - # set lora scale so that monkey patched LoRA function of text encoder can correctly access it + # set lora scale so that monkey patched LoRA + # function of text encoder can correctly access it if lora_scale is not None and isinstance(self, LoraLoaderMixin): self._lora_scale = lora_scale diff --git a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_upscale.py b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_upscale.py index f7c9e7ada289..c84034141a59 100644 --- a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_upscale.py +++ b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_upscale.py @@ -244,7 +244,8 @@ def _encode_prompt( lora_scale (`float`, *optional*): A lora scale that will be applied to all LoRA layers of the text encoder if LoRA layers are loaded. """ - # set lora scale so that monkey patched LoRA function of text encoder can correctly access it + # set lora scale so that monkey patched LoRA + # function of text encoder can correctly access it if lora_scale is not None and isinstance(self, LoraLoaderMixin): self._lora_scale = lora_scale diff --git a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_unclip.py b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_unclip.py index a1042cfd4f6e..a464ce8ba038 100644 --- a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_unclip.py +++ b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_unclip.py @@ -366,7 +366,8 @@ def _encode_prompt( lora_scale (`float`, *optional*): A lora scale that will be applied to all LoRA layers of the text encoder if LoRA layers are loaded. """ - # set lora scale so that monkey patched LoRA function of text encoder can correctly access it + # set lora scale so that monkey patched LoRA + # function of text encoder can correctly access it if lora_scale is not None and isinstance(self, LoraLoaderMixin): self._lora_scale = lora_scale diff --git a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_unclip_img2img.py b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_unclip_img2img.py index dd5a245e895d..5f44ee6d8eeb 100644 --- a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_unclip_img2img.py +++ b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_unclip_img2img.py @@ -266,7 +266,8 @@ def _encode_prompt( lora_scale (`float`, *optional*): A lora scale that will be applied to all LoRA layers of the text encoder if LoRA layers are loaded. """ - # set lora scale so that monkey patched LoRA function of text encoder can correctly access it + # set lora scale so that monkey patched LoRA + # function of text encoder can correctly access it if lora_scale is not None and isinstance(self, LoraLoaderMixin): self._lora_scale = lora_scale diff --git a/src/diffusers/pipelines/text_to_video_synthesis/pipeline_text_to_video_synth.py b/src/diffusers/pipelines/text_to_video_synthesis/pipeline_text_to_video_synth.py index ed08323d8ec6..adf7524241a6 100644 --- a/src/diffusers/pipelines/text_to_video_synthesis/pipeline_text_to_video_synth.py +++ b/src/diffusers/pipelines/text_to_video_synthesis/pipeline_text_to_video_synth.py @@ -252,7 +252,8 @@ def _encode_prompt( lora_scale (`float`, *optional*): A lora scale that will be applied to all LoRA layers of the text encoder if LoRA layers are loaded. 
""" - # set lora scale so that monkey patched LoRA function of text encoder can correctly access it + # set lora scale so that monkey patched LoRA + # function of text encoder can correctly access it if lora_scale is not None and isinstance(self, LoraLoaderMixin): self._lora_scale = lora_scale From fa227c8104885bfcaf21108a62cba327b61d85e9 Mon Sep 17 00:00:00 2001 From: Patrick von Platen Date: Tue, 6 Jun 2023 21:15:19 +0000 Subject: [PATCH 10/12] add lora mixin when fit --- CONTRIBUTING.md | 8 +++---- PHILOSOPHY.md | 20 +++++++++--------- README.md | 18 ++++++++-------- docs/source/_config.py | 2 +- hey.txt | 16 +++----------- .../pipeline_alt_diffusion_img2img.py | 11 +++++++--- .../controlnet/pipeline_controlnet.py | 4 ++++ .../controlnet/pipeline_controlnet_img2img.py | 4 ++++ .../controlnet/pipeline_controlnet_inpaint.py | 4 ++++ .../pipeline_cycle_diffusion.py | 16 ++++++++++++-- .../pipeline_stable_diffusion_depth2img.py | 21 +++++++++++++++---- .../pipeline_stable_diffusion_diffedit.py | 5 +++++ .../pipeline_stable_diffusion_img2img.py | 4 ++++ .../pipeline_stable_diffusion_inpaint.py | 4 ++++ ...ipeline_stable_diffusion_inpaint_legacy.py | 21 +++++++++++++++---- .../pipeline_stable_diffusion_k_diffusion.py | 2 +- ...pipeline_stable_diffusion_model_editing.py | 4 ++++ .../pipeline_stable_diffusion_panorama.py | 4 ++++ .../pipeline_stable_diffusion_upscale.py | 12 ++++++++++- .../pipeline_stable_unclip.py | 4 ++++ .../pipeline_stable_unclip_img2img.py | 4 ++++ .../pipeline_text_to_video_synth.py | 4 ++++ 22 files changed, 140 insertions(+), 52 deletions(-) diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md index 5ce48793e9c2..9c5f0a10edd3 100644 --- a/CONTRIBUTING.md +++ b/CONTRIBUTING.md @@ -125,14 +125,14 @@ Awesome! Tell us what problem it solved for you. You can open a feature request [here](https://github.com/huggingface/diffusers/issues/new?assignees=&labels=&template=feature_request.md&title=). -#### 2.3 Feedback. +#### 2.3 Feedback. Feedback about the library design and why it is good or not good helps the core maintainers immensely to build a user-friendly library. To understand the philosophy behind the current design philosophy, please have a look [here](https://huggingface.co/docs/diffusers/conceptual/philosophy). If you feel like a certain design choice does not fit with the current design philosophy, please explain why and how it should be changed. If a certain design choice follows the design philosophy too much, hence restricting use cases, explain why and how it should be changed. If a certain design choice is very useful for you, please also leave a note as this is great feedback for future design decisions. You can open an issue about feedback [here](https://github.com/huggingface/diffusers/issues/new?assignees=&labels=&template=feedback.md&title=). -#### 2.4 Technical questions. +#### 2.4 Technical questions. Technical questions are mainly about why certain code of the library was written in a certain way, or what a certain part of the code does. Please make sure to link to the code in question and please provide detail on why this part of the code is difficult to understand. @@ -394,8 +394,8 @@ passes. You should run the tests impacted by your changes like this: ```bash $ pytest tests/.py ``` - -Before you run the tests, please make sure you install the dependencies required for testing. You can do so + +Before you run the tests, please make sure you install the dependencies required for testing. 
You can do so with this command: ```bash diff --git a/PHILOSOPHY.md b/PHILOSOPHY.md index fbad5948e17e..399cb0bfb47d 100644 --- a/PHILOSOPHY.md +++ b/PHILOSOPHY.md @@ -27,18 +27,18 @@ In a nutshell, Diffusers is built to be a natural extension of PyTorch. Therefor ## Simple over easy -As PyTorch states, **explicit is better than implicit** and **simple is better than complex**. This design philosophy is reflected in multiple parts of the library: +As PyTorch states, **explicit is better than implicit** and **simple is better than complex**. This design philosophy is reflected in multiple parts of the library: - We follow PyTorch's API with methods like [`DiffusionPipeline.to`](https://huggingface.co/docs/diffusers/main/en/api/diffusion_pipeline#diffusers.DiffusionPipeline.to) to let the user handle device management. - Raising concise error messages is preferred to silently correct erroneous input. Diffusers aims at teaching the user, rather than making the library as easy to use as possible. - Complex model vs. scheduler logic is exposed instead of magically handled inside. Schedulers/Samplers are separated from diffusion models with minimal dependencies on each other. This forces the user to write the unrolled denoising loop. However, the separation allows for easier debugging and gives the user more control over adapting the denoising process or switching out diffusion models or schedulers. -- Separately trained components of the diffusion pipeline, *e.g.* the text encoder, the unet, and the variational autoencoder, each have their own model class. This forces the user to handle the interaction between the different model components, and the serialization format separates the model components into different files. However, this allows for easier debugging and customization. Dreambooth or textual inversion training +- Separately trained components of the diffusion pipeline, *e.g.* the text encoder, the unet, and the variational autoencoder, each have their own model class. This forces the user to handle the interaction between the different model components, and the serialization format separates the model components into different files. However, this allows for easier debugging and customization. Dreambooth or textual inversion training is very simple thanks to diffusers' ability to separate single components of the diffusion pipeline. ## Tweakable, contributor-friendly over abstraction -For large parts of the library, Diffusers adopts an important design principle of the [Transformers library](https://github.com/huggingface/transformers), which is to prefer copy-pasted code over hasty abstractions. This design principle is very opinionated and stands in stark contrast to popular design principles such as [Don't repeat yourself (DRY)](https://en.wikipedia.org/wiki/Don%27t_repeat_yourself). +For large parts of the library, Diffusers adopts an important design principle of the [Transformers library](https://github.com/huggingface/transformers), which is to prefer copy-pasted code over hasty abstractions. This design principle is very opinionated and stands in stark contrast to popular design principles such as [Don't repeat yourself (DRY)](https://en.wikipedia.org/wiki/Don%27t_repeat_yourself). In short, just like Transformers does for modeling files, diffusers prefers to keep an extremely low level of abstraction and very self-contained code for pipelines and schedulers. 
-Functions, long code blocks, and even classes can be copied across multiple files which at first can look like a bad, sloppy design choice that makes the library unmaintainable. +Functions, long code blocks, and even classes can be copied across multiple files which at first can look like a bad, sloppy design choice that makes the library unmaintainable. **However**, this design has proven to be extremely successful for Transformers and makes a lot of sense for community-driven, open-source machine learning libraries because: - Machine Learning is an extremely fast-moving field in which paradigms, model architectures, and algorithms are changing rapidly, which therefore makes it very difficult to define long-lasting code abstractions. - Machine Learning practitioners like to be able to quickly tweak existing code for ideation and research and therefore prefer self-contained code over one that contains many abstractions. @@ -47,10 +47,10 @@ Functions, long code blocks, and even classes can be copied across multiple file At Hugging Face, we call this design the **single-file policy** which means that almost all of the code of a certain class should be written in a single, self-contained file. To read more about the philosophy, you can have a look at [this blog post](https://huggingface.co/blog/transformers-design-philosophy). -In diffusers, we follow this philosophy for both pipelines and schedulers, but only partly for diffusion models. The reason we don't follow this design fully for diffusion models is because almost all diffusion pipelines, such +In diffusers, we follow this philosophy for both pipelines and schedulers, but only partly for diffusion models. The reason we don't follow this design fully for diffusion models is because almost all diffusion pipelines, such as [DDPM](https://huggingface.co/docs/diffusers/v0.12.0/en/api/pipelines/ddpm), [Stable Diffusion](https://huggingface.co/docs/diffusers/v0.12.0/en/api/pipelines/stable_diffusion/overview#stable-diffusion-pipelines), [UnCLIP (Dalle-2)](https://huggingface.co/docs/diffusers/v0.12.0/en/api/pipelines/unclip#overview) and [Imagen](https://imagen.research.google/) all rely on the same diffusion model, the [UNet](https://huggingface.co/docs/diffusers/api/models#diffusers.UNet2DConditionModel). -Great, now you should have generally understood why 🧨 Diffusers is designed the way it is 🤗. +Great, now you should have generally understood why 🧨 Diffusers is designed the way it is 🤗. We try to apply these design principles consistently across the library. Nevertheless, there are some minor exceptions to the philosophy or some unlucky design choices. If you have feedback regarding the design, we would ❤️ to hear it [directly on GitHub](https://github.com/huggingface/diffusers/issues/new?assignees=&labels=&template=feedback.md&title=). ## Design Philosophy in Details @@ -89,7 +89,7 @@ The following design principles are followed: - Models should by default have the highest precision and lowest performance setting. - To integrate new model checkpoints whose general architecture can be classified as an architecture that already exists in Diffusers, the existing model architecture shall be adapted to make it work with the new checkpoint. One should only create a new file if the model architecture is fundamentally different. - Models should be designed to be easily extendable to future changes. 
This can be achieved by limiting public function arguments, configuration arguments, and "foreseeing" future changes, *e.g.* it is usually better to add `string` "...type" arguments that can easily be extended to new future types instead of boolean `is_..._type` arguments. Only the minimum amount of changes shall be made to existing architectures to make a new model checkpoint work. -- The model design is a difficult trade-off between keeping code readable and concise and supporting many model checkpoints. For most parts of the modeling code, classes shall be adapted for new model checkpoints, while there are some exceptions where it is preferred to add new classes to make sure the code is kept concise and +- The model design is a difficult trade-off between keeping code readable and concise and supporting many model checkpoints. For most parts of the modeling code, classes shall be adapted for new model checkpoints, while there are some exceptions where it is preferred to add new classes to make sure the code is kept concise and readable longterm, such as [UNet blocks](https://github.com/huggingface/diffusers/blob/main/src/diffusers/models/unet_2d_blocks.py) and [Attention processors](https://github.com/huggingface/diffusers/blob/main/src/diffusers/models/cross_attention.py). ### Schedulers @@ -97,9 +97,9 @@ readable longterm, such as [UNet blocks](https://github.com/huggingface/diffuser Schedulers are responsible to guide the denoising process for inference as well as to define a noise schedule for training. They are designed as individual classes with loadable configuration files and strongly follow the **single-file policy**. The following design principles are followed: -- All schedulers are found in [`src/diffusers/schedulers`](https://github.com/huggingface/diffusers/tree/main/src/diffusers/schedulers). -- Schedulers are **not** allowed to import from large utils files and shall be kept very self-contained. -- One scheduler python file corresponds to one scheduler algorithm (as might be defined in a paper). +- All schedulers are found in [`src/diffusers/schedulers`](https://github.com/huggingface/diffusers/tree/main/src/diffusers/schedulers). +- Schedulers are **not** allowed to import from large utils files and shall be kept very self-contained. +- One scheduler python file corresponds to one scheduler algorithm (as might be defined in a paper). - If schedulers share similar functionalities, we can make use of the `#Copied from` mechanism. - Schedulers all inherit from `SchedulerMixin` and `ConfigMixin`. - Schedulers can be easily swapped out with the [`ConfigMixin.from_config`](https://huggingface.co/docs/diffusers/main/en/api/configuration#diffusers.ConfigMixin.from_config) method as explained in detail [here](./using-diffusers/schedulers.mdx). diff --git a/README.md b/README.md index ab37c629102a..c2a3b04b57a8 100644 --- a/README.md +++ b/README.md @@ -30,7 +30,7 @@ We recommend installing 🤗 Diffusers in a virtual environment from PyPi or Con ### PyTorch With `pip` (official package): - + ```bash pip install --upgrade diffusers[torch] ``` @@ -107,7 +107,7 @@ Check out the [Quickstart](https://huggingface.co/docs/diffusers/quicktour) to l | [Training](https://huggingface.co/docs/diffusers/training/overview) | Guides for how to train a diffusion model for different tasks with different training techniques. | ## Contribution -We ❤️ contributions from the open-source community! +We ❤️ contributions from the open-source community! 
If you want to contribute to this library, please check out our [Contribution guide](https://github.com/huggingface/diffusers/blob/main/CONTRIBUTING.md). You can look out for [issues](https://github.com/huggingface/diffusers/issues) you'd like to tackle to contribute to the library. - See [Good first issues](https://github.com/huggingface/diffusers/issues?q=is%3Aopen+is%3Aissue+label%3A%22good+first+issue%22) for general opportunities to contribute @@ -128,7 +128,7 @@ just hang out ☕. Unconditional Image Generation - DDPM + DDPM google/ddpm-ema-church-256 @@ -185,13 +185,13 @@ just hang out ☕. ## Popular libraries using 🧨 Diffusers -- https://github.com/microsoft/TaskMatrix -- https://github.com/invoke-ai/InvokeAI -- https://github.com/apple/ml-stable-diffusion -- https://github.com/Sanster/lama-cleaner +- https://github.com/microsoft/TaskMatrix +- https://github.com/invoke-ai/InvokeAI +- https://github.com/apple/ml-stable-diffusion +- https://github.com/Sanster/lama-cleaner - https://github.com/IDEA-Research/Grounded-Segment-Anything -- https://github.com/ashawkey/stable-dreamfusion -- https://github.com/deep-floyd/IF +- https://github.com/ashawkey/stable-dreamfusion +- https://github.com/deep-floyd/IF - https://github.com/bentoml/BentoML - https://github.com/bmaltais/kohya_ss - +3000 other amazing GitHub repositories 💪 diff --git a/docs/source/_config.py b/docs/source/_config.py index 9a4818ea8b1e..3d0d73dcb951 100644 --- a/docs/source/_config.py +++ b/docs/source/_config.py @@ -6,4 +6,4 @@ # ! pip install git+https://github.com/huggingface/diffusers.git """ -notebook_first_cells = [{"type": "code", "content": INSTALL_CONTENT}] \ No newline at end of file +notebook_first_cells = [{"type": "code", "content": INSTALL_CONTENT}] diff --git a/hey.txt b/hey.txt index e0e25d7f2d20..6efc917ac025 100644 --- a/hey.txt +++ b/hey.txt @@ -1,15 +1,5 @@ -src/diffusers/pipelines/alt_diffusion/pipeline_alt_diffusion.py -src/diffusers/pipelines/controlnet/pipeline_controlnet.py -src/diffusers/pipelines/controlnet/pipeline_controlnet_img2img.py -src/diffusers/pipelines/controlnet/pipeline_controlnet_inpaint.py src/diffusers/pipelines/stable_diffusion/pipeline_cycle_diffusion.py -src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_attend_and_excite.py -src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_k_diffusion.py -src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_model_editing.py -src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_panorama.py -src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_pix2pix_zero.py -src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_sag.py +src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_depth2img.py +src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_inpaint_legacy.py src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_upscale.py -src/diffusers/pipelines/stable_diffusion/pipeline_stable_unclip.py -src/diffusers/pipelines/stable_diffusion/pipeline_stable_unclip_img2img.py -src/diffusers/pipelines/text_to_video_synthesis/pipeline_text_to_video_synth.py + diff --git a/src/diffusers/pipelines/alt_diffusion/pipeline_alt_diffusion_img2img.py b/src/diffusers/pipelines/alt_diffusion/pipeline_alt_diffusion_img2img.py index a3ec9552f13d..cbd64d5cd1ff 100644 --- a/src/diffusers/pipelines/alt_diffusion/pipeline_alt_diffusion_img2img.py +++ b/src/diffusers/pipelines/alt_diffusion/pipeline_alt_diffusion_img2img.py @@ -26,7 +26,7 @@ from 
...configuration_utils import FrozenDict from ...image_processor import VaeImageProcessor -from ...loaders import TextualInversionLoaderMixin +from ...loaders import LoraLoaderMixin, TextualInversionLoaderMixin from ...models import AutoencoderKL, UNet2DConditionModel from ...schedulers import KarrasDiffusionSchedulers from ...utils import PIL_INTERPOLATION, deprecate, logging, randn_tensor, replace_example_docstring @@ -95,7 +95,7 @@ def preprocess(image): # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion_img2img.StableDiffusionImg2ImgPipeline with Stable->Alt, CLIPTextModel->RobertaSeriesModelWithTransformation, CLIPTokenizer->XLMRobertaTokenizer, AltDiffusionSafetyChecker->StableDiffusionSafetyChecker -class AltDiffusionImg2ImgPipeline(DiffusionPipeline, TextualInversionLoaderMixin): +class AltDiffusionImg2ImgPipeline(DiffusionPipeline, TextualInversionLoaderMixin, LoraLoaderMixin): r""" Pipeline for text-guided image to image generation using Alt Diffusion. @@ -330,7 +330,8 @@ def _encode_prompt( lora_scale (`float`, *optional*): A lora scale that will be applied to all LoRA layers of the text encoder if LoRA layers are loaded. """ - # set lora scale so that monkey patched LoRA function of text encoder can correctly access it + # set lora scale so that monkey patched LoRA + # function of text encoder can correctly access it if lora_scale is not None and isinstance(self, LoraLoaderMixin): self._lora_scale = lora_scale @@ -713,6 +714,9 @@ def __call__( do_classifier_free_guidance = guidance_scale > 1.0 # 3. Encode input prompt + text_encoder_lora_scale = ( + cross_attention_kwargs.get("scale", None) if cross_attention_kwargs is not None else None, + ) prompt_embeds = self._encode_prompt( prompt, device, @@ -721,6 +725,7 @@ def __call__( negative_prompt, prompt_embeds=prompt_embeds, negative_prompt_embeds=negative_prompt_embeds, + lora_scale=text_encoder_lora_scale, ) # 4. Preprocess image diff --git a/src/diffusers/pipelines/controlnet/pipeline_controlnet.py b/src/diffusers/pipelines/controlnet/pipeline_controlnet.py index bf692c146b4f..f5ea6036a4e0 100644 --- a/src/diffusers/pipelines/controlnet/pipeline_controlnet.py +++ b/src/diffusers/pipelines/controlnet/pipeline_controlnet.py @@ -846,6 +846,9 @@ def __call__( guess_mode = guess_mode or global_pool_conditions # 3. Encode input prompt + text_encoder_lora_scale = ( + cross_attention_kwargs.get("scale", None) if cross_attention_kwargs is not None else None, + ) prompt_embeds = self._encode_prompt( prompt, device, @@ -854,6 +857,7 @@ def __call__( negative_prompt, prompt_embeds=prompt_embeds, negative_prompt_embeds=negative_prompt_embeds, + lora_scale=text_encoder_lora_scale, ) # 4. Prepare image diff --git a/src/diffusers/pipelines/controlnet/pipeline_controlnet_img2img.py b/src/diffusers/pipelines/controlnet/pipeline_controlnet_img2img.py index f271f4998b35..638e8ce7acb1 100644 --- a/src/diffusers/pipelines/controlnet/pipeline_controlnet_img2img.py +++ b/src/diffusers/pipelines/controlnet/pipeline_controlnet_img2img.py @@ -937,6 +937,9 @@ def __call__( guess_mode = guess_mode or global_pool_conditions # 3. Encode input prompt + text_encoder_lora_scale = ( + cross_attention_kwargs.get("scale", None) if cross_attention_kwargs is not None else None, + ) prompt_embeds = self._encode_prompt( prompt, device, @@ -945,6 +948,7 @@ def __call__( negative_prompt, prompt_embeds=prompt_embeds, negative_prompt_embeds=negative_prompt_embeds, + lora_scale=text_encoder_lora_scale, ) # 4. 
Prepare image image = self.image_processor.preprocess(image).to(dtype=torch.float32) diff --git a/src/diffusers/pipelines/controlnet/pipeline_controlnet_inpaint.py b/src/diffusers/pipelines/controlnet/pipeline_controlnet_inpaint.py index 66fecd7cd9f8..b357f0d1cbd3 100644 --- a/src/diffusers/pipelines/controlnet/pipeline_controlnet_inpaint.py +++ b/src/diffusers/pipelines/controlnet/pipeline_controlnet_inpaint.py @@ -1139,6 +1139,9 @@ def __call__( guess_mode = guess_mode or global_pool_conditions # 3. Encode input prompt + text_encoder_lora_scale = ( + cross_attention_kwargs.get("scale", None) if cross_attention_kwargs is not None else None, + ) prompt_embeds = self._encode_prompt( prompt, device, @@ -1147,6 +1150,7 @@ def __call__( negative_prompt, prompt_embeds=prompt_embeds, negative_prompt_embeds=negative_prompt_embeds, + lora_scale=text_encoder_lora_scale, ) # 4. Prepare image diff --git a/src/diffusers/pipelines/stable_diffusion/pipeline_cycle_diffusion.py b/src/diffusers/pipelines/stable_diffusion/pipeline_cycle_diffusion.py index feb9f4522f66..ead4a2a635ba 100644 --- a/src/diffusers/pipelines/stable_diffusion/pipeline_cycle_diffusion.py +++ b/src/diffusers/pipelines/stable_diffusion/pipeline_cycle_diffusion.py @@ -14,7 +14,7 @@ import inspect import warnings -from typing import Callable, List, Optional, Union +from typing import Any, Callable, Dict, List, Optional, Union import numpy as np import PIL @@ -637,6 +637,7 @@ def __call__( return_dict: bool = True, callback: Optional[Callable[[int, int, torch.FloatTensor], None]] = None, callback_steps: int = 1, + cross_attention_kwargs: Optional[Dict[str, Any]] = None, ): r""" Function invoked when calling the pipeline for generation. @@ -693,6 +694,10 @@ def __call__( callback_steps (`int`, *optional*, defaults to 1): The frequency at which the `callback` function will be called. If not specified, the callback will be called at every step. + cross_attention_kwargs (`dict`, *optional*): + A kwargs dictionary that if specified is passed along to the `AttentionProcessor` as defined under + `self.processor` in + [diffusers.cross_attention](https://github.com/huggingface/diffusers/blob/main/src/diffusers/models/cross_attention.py). Returns: [`~pipelines.stable_diffusion.StableDiffusionPipelineOutput`] or `tuple`: @@ -713,12 +718,16 @@ def __call__( do_classifier_free_guidance = guidance_scale > 1.0 # 3. 
Encode input prompt + text_encoder_lora_scale = ( + cross_attention_kwargs.get("scale", None) if cross_attention_kwargs is not None else None, + ) prompt_embeds = self._encode_prompt( prompt, device, num_images_per_prompt, do_classifier_free_guidance, prompt_embeds=prompt_embeds, + lora_scale=text_encoder_lora_scale, ) source_prompt_embeds = self._encode_prompt( source_prompt, device, num_images_per_prompt, do_classifier_free_guidance, None @@ -772,7 +781,10 @@ def __call__( dim=0, ) concat_noise_pred = self.unet( - concat_latent_model_input, t, encoder_hidden_states=concat_prompt_embeds + concat_latent_model_input, + t, + cross_attention_kwargs=cross_attention_kwargs, + encoder_hidden_states=concat_prompt_embeds, ).sample # perform guidance diff --git a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_depth2img.py b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_depth2img.py index bd5ad6d0c0f4..a126066ebdda 100644 --- a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_depth2img.py +++ b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_depth2img.py @@ -15,7 +15,7 @@ import contextlib import inspect import warnings -from typing import Callable, List, Optional, Union +from typing import Any, Callable, Dict, List, Optional, Union import numpy as np import PIL @@ -554,6 +554,7 @@ def __call__( return_dict: bool = True, callback: Optional[Callable[[int, int, torch.FloatTensor], None]] = None, callback_steps: int = 1, + cross_attention_kwargs: Optional[Dict[str, Any]] = None, ): r""" Function invoked when calling the pipeline for generation. @@ -614,6 +615,10 @@ def __call__( callback_steps (`int`, *optional*, defaults to 1): The frequency at which the `callback` function will be called. If not specified, the callback will be called at every step. + cross_attention_kwargs (`dict`, *optional*): + A kwargs dictionary that if specified is passed along to the `AttentionProcessor` as defined under + `self.processor` in + [diffusers.cross_attention](https://github.com/huggingface/diffusers/blob/main/src/diffusers/models/cross_attention.py). Examples: @@ -673,6 +678,9 @@ def __call__( do_classifier_free_guidance = guidance_scale > 1.0 # 3. Encode input prompt + text_encoder_lora_scale = ( + cross_attention_kwargs.get("scale", None) if cross_attention_kwargs is not None else None, + ) prompt_embeds = self._encode_prompt( prompt, device, @@ -681,6 +689,7 @@ def __call__( negative_prompt, prompt_embeds=prompt_embeds, negative_prompt_embeds=negative_prompt_embeds, + lora_scale=text_encoder_lora_scale, ) # 4. 
Prepare depth mask @@ -719,9 +728,13 @@ def __call__( latent_model_input = torch.cat([latent_model_input, depth_mask], dim=1) # predict the noise residual - noise_pred = self.unet(latent_model_input, t, encoder_hidden_states=prompt_embeds, return_dict=False)[ - 0 - ] + noise_pred = self.unet( + latent_model_input, + t, + encoder_hidden_states=prompt_embeds, + cross_attention_kwargs=cross_attention_kwargs, + return_dict=False, + )[0] # perform guidance if do_classifier_free_guidance: diff --git a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_diffedit.py b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_diffedit.py index d20984478045..d79fba03d5ee 100644 --- a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_diffedit.py +++ b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_diffedit.py @@ -1015,6 +1015,7 @@ def generate_mask( do_classifier_free_guidance = guidance_scale > 1.0 # 3. Encode input prompts + (cross_attention_kwargs.get("scale", None) if cross_attention_kwargs is not None else None,) target_prompt_embeds = self._encode_prompt( target_prompt, device, @@ -1466,6 +1467,9 @@ def __call__( do_classifier_free_guidance = guidance_scale > 1.0 # 3. Encode input prompt + text_encoder_lora_scale = ( + cross_attention_kwargs.get("scale", None) if cross_attention_kwargs is not None else None, + ) prompt_embeds = self._encode_prompt( prompt, device, @@ -1474,6 +1478,7 @@ def __call__( negative_prompt, prompt_embeds=prompt_embeds, negative_prompt_embeds=negative_prompt_embeds, + lora_scale=text_encoder_lora_scale, ) # 4. Preprocess mask diff --git a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_img2img.py b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_img2img.py index de2ebb452749..e74ce46d7412 100644 --- a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_img2img.py +++ b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_img2img.py @@ -722,6 +722,9 @@ def __call__( do_classifier_free_guidance = guidance_scale > 1.0 # 3. Encode input prompt + text_encoder_lora_scale = ( + cross_attention_kwargs.get("scale", None) if cross_attention_kwargs is not None else None, + ) prompt_embeds = self._encode_prompt( prompt, device, @@ -730,6 +733,7 @@ def __call__( negative_prompt, prompt_embeds=prompt_embeds, negative_prompt_embeds=negative_prompt_embeds, + lora_scale=text_encoder_lora_scale, ) # 4. Preprocess image diff --git a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_inpaint.py b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_inpaint.py index c8ce7bd6f472..0fb1b2624968 100644 --- a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_inpaint.py +++ b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_inpaint.py @@ -906,6 +906,9 @@ def __call__( do_classifier_free_guidance = guidance_scale > 1.0 # 3. Encode input prompt + text_encoder_lora_scale = ( + cross_attention_kwargs.get("scale", None) if cross_attention_kwargs is not None else None, + ) prompt_embeds = self._encode_prompt( prompt, device, @@ -914,6 +917,7 @@ def __call__( negative_prompt, prompt_embeds=prompt_embeds, negative_prompt_embeds=negative_prompt_embeds, + lora_scale=text_encoder_lora_scale, ) # 4. 
set timesteps diff --git a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_inpaint_legacy.py b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_inpaint_legacy.py index 0c033748c214..9e78e290a143 100644 --- a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_inpaint_legacy.py +++ b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_inpaint_legacy.py @@ -14,7 +14,7 @@ import inspect import warnings -from typing import Callable, List, Optional, Union +from typing import Any, Callable, Dict, List, Optional, Union import numpy as np import PIL @@ -583,6 +583,7 @@ def __call__( return_dict: bool = True, callback: Optional[Callable[[int, int, torch.FloatTensor], None]] = None, callback_steps: int = 1, + cross_attention_kwargs: Optional[Dict[str, Any]] = None, ): r""" Function invoked when calling the pipeline for generation. @@ -647,6 +648,10 @@ def __call__( callback_steps (`int`, *optional*, defaults to 1): The frequency at which the `callback` function will be called. If not specified, the callback will be called at every step. + cross_attention_kwargs (`dict`, *optional*): + A kwargs dictionary that if specified is passed along to the `AttentionProcessor` as defined under + `self.processor` in + [diffusers.cross_attention](https://github.com/huggingface/diffusers/blob/main/src/diffusers/models/cross_attention.py). Returns: [`~pipelines.stable_diffusion.StableDiffusionPipelineOutput`] or `tuple`: @@ -673,6 +678,9 @@ def __call__( do_classifier_free_guidance = guidance_scale > 1.0 # 3. Encode input prompt + text_encoder_lora_scale = ( + cross_attention_kwargs.get("scale", None) if cross_attention_kwargs is not None else None, + ) prompt_embeds = self._encode_prompt( prompt, device, @@ -681,6 +689,7 @@ def __call__( negative_prompt, prompt_embeds=prompt_embeds, negative_prompt_embeds=negative_prompt_embeds, + lora_scale=text_encoder_lora_scale, ) # 4. Preprocess image and mask @@ -716,9 +725,13 @@ def __call__( latent_model_input = self.scheduler.scale_model_input(latent_model_input, t) # predict the noise residual - noise_pred = self.unet(latent_model_input, t, encoder_hidden_states=prompt_embeds, return_dict=False)[ - 0 - ] + noise_pred = self.unet( + latent_model_input, + t, + encoder_hidden_states=prompt_embeds, + cross_attention_kwargs=cross_attention_kwargs, + return_dict=False, + )[0] # perform guidance if do_classifier_free_guidance: diff --git a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_k_diffusion.py b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_k_diffusion.py index e2800342e578..ab613dd4dfe4 100755 --- a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_k_diffusion.py +++ b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_k_diffusion.py @@ -45,7 +45,7 @@ def apply_model(self, *args, **kwargs): return self.model(*args, encoder_hidden_states=encoder_hidden_states, **kwargs).sample -class StableDiffusionKDiffusionPipeline(DiffusionPipeline, TextualInversionLoaderMixin, LoraLoaderMixin): +class StableDiffusionKDiffusionPipeline(DiffusionPipeline, TextualInversionLoaderMixin): r""" Pipeline for text-to-image generation using Stable Diffusion. 
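For context on how that scale actually reaches the text encoder: `_modify_text_encoder` in `loaders.py` (see its hunk further below) wraps each targeted module's `forward` in a closure that adds `self.lora_scale * lora_layer(x)` to the original output. A simplified, self-contained sketch of that monkey-patching pattern, using plain `nn.Linear` stand-ins rather than the real attention modules:

```python
# Simplified sketch of the forward monkey-patch; `linear` and `lora` are
# stand-ins for the text encoder's attention projections and LoRA layers.
import torch


def patch_forward(module: torch.nn.Module, lora_layer: torch.nn.Module, get_scale):
    old_forward = module.forward

    def new_forward(x):
        # the scale is looked up at call time, so changing it later takes effect
        return old_forward(x) + get_scale() * lora_layer(x)

    module.forward = new_forward


linear = torch.nn.Linear(8, 8)
lora = torch.nn.Linear(8, 8, bias=False)
scale = {"value": 1.0}
patch_forward(linear, lora, lambda: scale["value"])

x = torch.randn(1, 8)
out_full = linear(x)   # original output + 1.0 * LoRA output
scale["value"] = 0.0
out_base = linear(x)   # LoRA contribution scaled away
```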
diff --git a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_model_editing.py b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_model_editing.py index ea44025c58b0..1a29320ee924 100644 --- a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_model_editing.py +++ b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_model_editing.py @@ -727,6 +727,9 @@ def __call__( do_classifier_free_guidance = guidance_scale > 1.0 # 3. Encode input prompt + text_encoder_lora_scale = ( + cross_attention_kwargs.get("scale", None) if cross_attention_kwargs is not None else None, + ) prompt_embeds = self._encode_prompt( prompt, device, @@ -735,6 +738,7 @@ def __call__( negative_prompt, prompt_embeds=prompt_embeds, negative_prompt_embeds=negative_prompt_embeds, + lora_scale=text_encoder_lora_scale, ) # 4. Prepare timesteps diff --git a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_panorama.py b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_panorama.py index b1f621a70f54..e9dc04f08a43 100644 --- a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_panorama.py +++ b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_panorama.py @@ -589,6 +589,9 @@ def __call__( do_classifier_free_guidance = guidance_scale > 1.0 # 3. Encode input prompt + text_encoder_lora_scale = ( + cross_attention_kwargs.get("scale", None) if cross_attention_kwargs is not None else None, + ) prompt_embeds = self._encode_prompt( prompt, device, @@ -597,6 +600,7 @@ def __call__( negative_prompt, prompt_embeds=prompt_embeds, negative_prompt_embeds=negative_prompt_embeds, + lora_scale=text_encoder_lora_scale, ) # 4. Prepare timesteps diff --git a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_upscale.py b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_upscale.py index c84034141a59..0d1ae2eb8377 100644 --- a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_upscale.py +++ b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_upscale.py @@ -14,7 +14,7 @@ import inspect import warnings -from typing import Any, Callable, List, Optional, Union +from typing import Any, Callable, Dict, List, Optional, Union import numpy as np import PIL @@ -506,6 +506,7 @@ def __call__( return_dict: bool = True, callback: Optional[Callable[[int, int, torch.FloatTensor], None]] = None, callback_steps: int = 1, + cross_attention_kwargs: Optional[Dict[str, Any]] = None, ): r""" Function invoked when calling the pipeline for generation. @@ -560,6 +561,10 @@ def __call__( callback_steps (`int`, *optional*, defaults to 1): The frequency at which the `callback` function will be called. If not specified, the callback will be called at every step. + cross_attention_kwargs (`dict`, *optional*): + A kwargs dictionary that if specified is passed along to the `AttentionProcessor` as defined under + `self.processor` in + [diffusers.cross_attention](https://github.com/huggingface/diffusers/blob/main/src/diffusers/models/cross_attention.py). Examples: ```py @@ -624,6 +629,9 @@ def __call__( do_classifier_free_guidance = guidance_scale > 1.0 # 3. 
Encode input prompt + text_encoder_lora_scale = ( + cross_attention_kwargs.get("scale", None) if cross_attention_kwargs is not None else None, + ) prompt_embeds = self._encode_prompt( prompt, device, @@ -632,6 +640,7 @@ def __call__( negative_prompt, prompt_embeds=prompt_embeds, negative_prompt_embeds=negative_prompt_embeds, + lora_scale=text_encoder_lora_scale, ) # 4. Preprocess image @@ -695,6 +704,7 @@ def __call__( latent_model_input, t, encoder_hidden_states=prompt_embeds, + cross_attention_kwargs=cross_attention_kwargs, class_labels=noise_level, return_dict=False, )[0] diff --git a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_unclip.py b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_unclip.py index a464ce8ba038..b5a105ac5b99 100644 --- a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_unclip.py +++ b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_unclip.py @@ -864,6 +864,9 @@ def __call__( do_classifier_free_guidance = guidance_scale > 1.0 # 8. Encode input prompt + text_encoder_lora_scale = ( + cross_attention_kwargs.get("scale", None) if cross_attention_kwargs is not None else None, + ) prompt_embeds = self._encode_prompt( prompt=prompt, device=device, @@ -872,6 +875,7 @@ def __call__( negative_prompt=negative_prompt, prompt_embeds=prompt_embeds, negative_prompt_embeds=negative_prompt_embeds, + lora_scale=text_encoder_lora_scale, ) # 9. Prepare image embeddings diff --git a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_unclip_img2img.py b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_unclip_img2img.py index 5f44ee6d8eeb..ad79e0c65e3d 100644 --- a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_unclip_img2img.py +++ b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_unclip_img2img.py @@ -760,6 +760,9 @@ def __call__( do_classifier_free_guidance = guidance_scale > 1.0 # 3. Encode input prompt + text_encoder_lora_scale = ( + cross_attention_kwargs.get("scale", None) if cross_attention_kwargs is not None else None, + ) prompt_embeds = self._encode_prompt( prompt=prompt, device=device, @@ -768,6 +771,7 @@ def __call__( negative_prompt=negative_prompt, prompt_embeds=prompt_embeds, negative_prompt_embeds=negative_prompt_embeds, + lora_scale=text_encoder_lora_scale, ) # 4. Encoder input image diff --git a/src/diffusers/pipelines/text_to_video_synthesis/pipeline_text_to_video_synth.py b/src/diffusers/pipelines/text_to_video_synthesis/pipeline_text_to_video_synth.py index adf7524241a6..5bdec979827f 100644 --- a/src/diffusers/pipelines/text_to_video_synthesis/pipeline_text_to_video_synth.py +++ b/src/diffusers/pipelines/text_to_video_synthesis/pipeline_text_to_video_synth.py @@ -599,6 +599,9 @@ def __call__( do_classifier_free_guidance = guidance_scale > 1.0 # 3. Encode input prompt + text_encoder_lora_scale = ( + cross_attention_kwargs.get("scale", None) if cross_attention_kwargs is not None else None, + ) prompt_embeds = self._encode_prompt( prompt, device, @@ -607,6 +610,7 @@ def __call__( negative_prompt, prompt_embeds=prompt_embeds, negative_prompt_embeds=negative_prompt_embeds, + lora_scale=text_encoder_lora_scale, ) # 4. 
Prepare timesteps From c9225839eb0d5feb958e4488aa786bfd4ff84f6f Mon Sep 17 00:00:00 2001 From: Patrick von Platen Date: Tue, 6 Jun 2023 21:16:25 +0000 Subject: [PATCH 11/12] fix more --- hey.txt | 5 ----- 1 file changed, 5 deletions(-) delete mode 100644 hey.txt diff --git a/hey.txt b/hey.txt deleted file mode 100644 index 6efc917ac025..000000000000 --- a/hey.txt +++ /dev/null @@ -1,5 +0,0 @@ -src/diffusers/pipelines/stable_diffusion/pipeline_cycle_diffusion.py -src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_depth2img.py -src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_inpaint_legacy.py -src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_upscale.py - From 151cf970c44f8be69994e4bc157d46a2a51a7afe Mon Sep 17 00:00:00 2001 From: Patrick von Platen Date: Tue, 6 Jun 2023 21:34:09 +0000 Subject: [PATCH 12/12] fix more --- src/diffusers/loaders.py | 5 +++-- .../pipelines/alt_diffusion/pipeline_alt_diffusion.py | 2 +- .../alt_diffusion/pipeline_alt_diffusion_img2img.py | 2 +- src/diffusers/pipelines/controlnet/pipeline_controlnet.py | 2 +- .../pipelines/controlnet/pipeline_controlnet_img2img.py | 2 +- .../pipelines/controlnet/pipeline_controlnet_inpaint.py | 2 +- .../pipelines/stable_diffusion/pipeline_cycle_diffusion.py | 2 +- .../pipelines/stable_diffusion/pipeline_stable_diffusion.py | 2 +- .../stable_diffusion/pipeline_stable_diffusion_depth2img.py | 2 +- .../stable_diffusion/pipeline_stable_diffusion_diffedit.py | 4 ++-- .../stable_diffusion/pipeline_stable_diffusion_img2img.py | 2 +- .../stable_diffusion/pipeline_stable_diffusion_inpaint.py | 2 +- .../pipeline_stable_diffusion_inpaint_legacy.py | 2 +- .../pipeline_stable_diffusion_model_editing.py | 2 +- .../stable_diffusion/pipeline_stable_diffusion_panorama.py | 2 +- .../stable_diffusion/pipeline_stable_diffusion_upscale.py | 2 +- .../pipelines/stable_diffusion/pipeline_stable_unclip.py | 2 +- .../stable_diffusion/pipeline_stable_unclip_img2img.py | 2 +- .../text_to_video_synthesis/pipeline_text_to_video_synth.py | 2 +- 19 files changed, 22 insertions(+), 21 deletions(-) diff --git a/src/diffusers/loaders.py b/src/diffusers/loaders.py index 3c8c3e0bb2ef..7ed225bbcdf6 100644 --- a/src/diffusers/loaders.py +++ b/src/diffusers/loaders.py @@ -853,7 +853,7 @@ def load_lora_weights(self, pretrained_model_name_or_path_or_dict: Union[str, Di use_safetensors = kwargs.pop("use_safetensors", None) # set lora scale to a reasonable default - self._lora_scale = 1 + self._lora_scale = 1.0 if use_safetensors and not is_safetensors_available(): raise ValueError( @@ -1009,7 +1009,8 @@ def _modify_text_encoder(self, attn_processors: Dict[str, LoRAAttnProcessor]): # for more detail, see https://github.com/huggingface/diffusers/pull/3490#issuecomment-1555059060 def make_new_forward(old_forward, lora_layer): def new_forward(x): - return old_forward(x) + self.lora_scale * lora_layer(x) + result = old_forward(x) + self.lora_scale * lora_layer(x) + return result return new_forward diff --git a/src/diffusers/pipelines/alt_diffusion/pipeline_alt_diffusion.py b/src/diffusers/pipelines/alt_diffusion/pipeline_alt_diffusion.py index 0109600dd502..64ca06a53a7b 100644 --- a/src/diffusers/pipelines/alt_diffusion/pipeline_alt_diffusion.py +++ b/src/diffusers/pipelines/alt_diffusion/pipeline_alt_diffusion.py @@ -662,7 +662,7 @@ def __call__( # 3. 
Encode input prompt text_encoder_lora_scale = ( - cross_attention_kwargs.get("scale", None) if cross_attention_kwargs is not None else None, + cross_attention_kwargs.get("scale", None) if cross_attention_kwargs is not None else None ) prompt_embeds = self._encode_prompt( prompt, diff --git a/src/diffusers/pipelines/alt_diffusion/pipeline_alt_diffusion_img2img.py b/src/diffusers/pipelines/alt_diffusion/pipeline_alt_diffusion_img2img.py index cbd64d5cd1ff..5903f97aca36 100644 --- a/src/diffusers/pipelines/alt_diffusion/pipeline_alt_diffusion_img2img.py +++ b/src/diffusers/pipelines/alt_diffusion/pipeline_alt_diffusion_img2img.py @@ -715,7 +715,7 @@ def __call__( # 3. Encode input prompt text_encoder_lora_scale = ( - cross_attention_kwargs.get("scale", None) if cross_attention_kwargs is not None else None, + cross_attention_kwargs.get("scale", None) if cross_attention_kwargs is not None else None ) prompt_embeds = self._encode_prompt( prompt, diff --git a/src/diffusers/pipelines/controlnet/pipeline_controlnet.py b/src/diffusers/pipelines/controlnet/pipeline_controlnet.py index f5ea6036a4e0..89398b6f01f9 100644 --- a/src/diffusers/pipelines/controlnet/pipeline_controlnet.py +++ b/src/diffusers/pipelines/controlnet/pipeline_controlnet.py @@ -847,7 +847,7 @@ def __call__( # 3. Encode input prompt text_encoder_lora_scale = ( - cross_attention_kwargs.get("scale", None) if cross_attention_kwargs is not None else None, + cross_attention_kwargs.get("scale", None) if cross_attention_kwargs is not None else None ) prompt_embeds = self._encode_prompt( prompt, diff --git a/src/diffusers/pipelines/controlnet/pipeline_controlnet_img2img.py b/src/diffusers/pipelines/controlnet/pipeline_controlnet_img2img.py index 638e8ce7acb1..0e984d8ae5e3 100644 --- a/src/diffusers/pipelines/controlnet/pipeline_controlnet_img2img.py +++ b/src/diffusers/pipelines/controlnet/pipeline_controlnet_img2img.py @@ -938,7 +938,7 @@ def __call__( # 3. Encode input prompt text_encoder_lora_scale = ( - cross_attention_kwargs.get("scale", None) if cross_attention_kwargs is not None else None, + cross_attention_kwargs.get("scale", None) if cross_attention_kwargs is not None else None ) prompt_embeds = self._encode_prompt( prompt, diff --git a/src/diffusers/pipelines/controlnet/pipeline_controlnet_inpaint.py b/src/diffusers/pipelines/controlnet/pipeline_controlnet_inpaint.py index b357f0d1cbd3..5ce2fd5543b8 100644 --- a/src/diffusers/pipelines/controlnet/pipeline_controlnet_inpaint.py +++ b/src/diffusers/pipelines/controlnet/pipeline_controlnet_inpaint.py @@ -1140,7 +1140,7 @@ def __call__( # 3. Encode input prompt text_encoder_lora_scale = ( - cross_attention_kwargs.get("scale", None) if cross_attention_kwargs is not None else None, + cross_attention_kwargs.get("scale", None) if cross_attention_kwargs is not None else None ) prompt_embeds = self._encode_prompt( prompt, diff --git a/src/diffusers/pipelines/stable_diffusion/pipeline_cycle_diffusion.py b/src/diffusers/pipelines/stable_diffusion/pipeline_cycle_diffusion.py index ead4a2a635ba..b8360f512405 100644 --- a/src/diffusers/pipelines/stable_diffusion/pipeline_cycle_diffusion.py +++ b/src/diffusers/pipelines/stable_diffusion/pipeline_cycle_diffusion.py @@ -719,7 +719,7 @@ def __call__( # 3. 
Encode input prompt text_encoder_lora_scale = ( - cross_attention_kwargs.get("scale", None) if cross_attention_kwargs is not None else None, + cross_attention_kwargs.get("scale", None) if cross_attention_kwargs is not None else None ) prompt_embeds = self._encode_prompt( prompt, diff --git a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion.py b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion.py index dabe4d986d6b..f7374452a5f6 100644 --- a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion.py +++ b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion.py @@ -663,7 +663,7 @@ def __call__( # 3. Encode input prompt text_encoder_lora_scale = ( - cross_attention_kwargs.get("scale", None) if cross_attention_kwargs is not None else None, + cross_attention_kwargs.get("scale", None) if cross_attention_kwargs is not None else None ) prompt_embeds = self._encode_prompt( prompt, diff --git a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_depth2img.py b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_depth2img.py index a126066ebdda..002014681040 100644 --- a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_depth2img.py +++ b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_depth2img.py @@ -679,7 +679,7 @@ def __call__( # 3. Encode input prompt text_encoder_lora_scale = ( - cross_attention_kwargs.get("scale", None) if cross_attention_kwargs is not None else None, + cross_attention_kwargs.get("scale", None) if cross_attention_kwargs is not None else None ) prompt_embeds = self._encode_prompt( prompt, diff --git a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_diffedit.py b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_diffedit.py index d79fba03d5ee..837811baae64 100644 --- a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_diffedit.py +++ b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_diffedit.py @@ -1015,7 +1015,7 @@ def generate_mask( do_classifier_free_guidance = guidance_scale > 1.0 # 3. Encode input prompts - (cross_attention_kwargs.get("scale", None) if cross_attention_kwargs is not None else None,) + (cross_attention_kwargs.get("scale", None) if cross_attention_kwargs is not None else None) target_prompt_embeds = self._encode_prompt( target_prompt, device, @@ -1468,7 +1468,7 @@ def __call__( # 3. Encode input prompt text_encoder_lora_scale = ( - cross_attention_kwargs.get("scale", None) if cross_attention_kwargs is not None else None, + cross_attention_kwargs.get("scale", None) if cross_attention_kwargs is not None else None ) prompt_embeds = self._encode_prompt( prompt, diff --git a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_img2img.py b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_img2img.py index e74ce46d7412..e9e91b646ed5 100644 --- a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_img2img.py +++ b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_img2img.py @@ -723,7 +723,7 @@ def __call__( # 3. 
Encode input prompt text_encoder_lora_scale = ( - cross_attention_kwargs.get("scale", None) if cross_attention_kwargs is not None else None, + cross_attention_kwargs.get("scale", None) if cross_attention_kwargs is not None else None ) prompt_embeds = self._encode_prompt( prompt, diff --git a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_inpaint.py b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_inpaint.py index 0fb1b2624968..b07a5555f1c7 100644 --- a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_inpaint.py +++ b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_inpaint.py @@ -907,7 +907,7 @@ def __call__( # 3. Encode input prompt text_encoder_lora_scale = ( - cross_attention_kwargs.get("scale", None) if cross_attention_kwargs is not None else None, + cross_attention_kwargs.get("scale", None) if cross_attention_kwargs is not None else None ) prompt_embeds = self._encode_prompt( prompt, diff --git a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_inpaint_legacy.py b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_inpaint_legacy.py index 9e78e290a143..147d914fe6c1 100644 --- a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_inpaint_legacy.py +++ b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_inpaint_legacy.py @@ -679,7 +679,7 @@ def __call__( # 3. Encode input prompt text_encoder_lora_scale = ( - cross_attention_kwargs.get("scale", None) if cross_attention_kwargs is not None else None, + cross_attention_kwargs.get("scale", None) if cross_attention_kwargs is not None else None ) prompt_embeds = self._encode_prompt( prompt, diff --git a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_model_editing.py b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_model_editing.py index 1a29320ee924..1d30b9ee0347 100644 --- a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_model_editing.py +++ b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_model_editing.py @@ -728,7 +728,7 @@ def __call__( # 3. Encode input prompt text_encoder_lora_scale = ( - cross_attention_kwargs.get("scale", None) if cross_attention_kwargs is not None else None, + cross_attention_kwargs.get("scale", None) if cross_attention_kwargs is not None else None ) prompt_embeds = self._encode_prompt( prompt, diff --git a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_panorama.py b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_panorama.py index e9dc04f08a43..07459a49a262 100644 --- a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_panorama.py +++ b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_panorama.py @@ -590,7 +590,7 @@ def __call__( # 3. Encode input prompt text_encoder_lora_scale = ( - cross_attention_kwargs.get("scale", None) if cross_attention_kwargs is not None else None, + cross_attention_kwargs.get("scale", None) if cross_attention_kwargs is not None else None ) prompt_embeds = self._encode_prompt( prompt, diff --git a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_upscale.py b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_upscale.py index 0d1ae2eb8377..84f60a38c7ab 100644 --- a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_upscale.py +++ b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_upscale.py @@ -630,7 +630,7 @@ def __call__( # 3. 
Encode input prompt text_encoder_lora_scale = ( - cross_attention_kwargs.get("scale", None) if cross_attention_kwargs is not None else None, + cross_attention_kwargs.get("scale", None) if cross_attention_kwargs is not None else None ) prompt_embeds = self._encode_prompt( prompt, diff --git a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_unclip.py b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_unclip.py index b5a105ac5b99..e36ebfbb70f1 100644 --- a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_unclip.py +++ b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_unclip.py @@ -865,7 +865,7 @@ def __call__( # 8. Encode input prompt text_encoder_lora_scale = ( - cross_attention_kwargs.get("scale", None) if cross_attention_kwargs is not None else None, + cross_attention_kwargs.get("scale", None) if cross_attention_kwargs is not None else None ) prompt_embeds = self._encode_prompt( prompt=prompt, diff --git a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_unclip_img2img.py b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_unclip_img2img.py index ad79e0c65e3d..0187c86b4239 100644 --- a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_unclip_img2img.py +++ b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_unclip_img2img.py @@ -761,7 +761,7 @@ def __call__( # 3. Encode input prompt text_encoder_lora_scale = ( - cross_attention_kwargs.get("scale", None) if cross_attention_kwargs is not None else None, + cross_attention_kwargs.get("scale", None) if cross_attention_kwargs is not None else None ) prompt_embeds = self._encode_prompt( prompt=prompt, diff --git a/src/diffusers/pipelines/text_to_video_synthesis/pipeline_text_to_video_synth.py b/src/diffusers/pipelines/text_to_video_synthesis/pipeline_text_to_video_synth.py index 5bdec979827f..8bf4bafa4fe5 100644 --- a/src/diffusers/pipelines/text_to_video_synthesis/pipeline_text_to_video_synth.py +++ b/src/diffusers/pipelines/text_to_video_synthesis/pipeline_text_to_video_synth.py @@ -600,7 +600,7 @@ def __call__( # 3. Encode input prompt text_encoder_lora_scale = ( - cross_attention_kwargs.get("scale", None) if cross_attention_kwargs is not None else None, + cross_attention_kwargs.get("scale", None) if cross_attention_kwargs is not None else None ) prompt_embeds = self._encode_prompt( prompt,
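
Note (not part of the patch series above): taken together, these commits route a LoRA scale from `cross_attention_kwargs["scale"]` through `_encode_prompt(..., lora_scale=...)` into the monkey-patched text encoder forward, so one value now scales both the UNet and text encoder LoRA layers. Below is a minimal usage sketch of how a caller would exercise this; the checkpoint id and LoRA path are placeholders, not values taken from the patches.

    # Minimal sketch: model id and LoRA path below are placeholders.
    import torch
    from diffusers import StableDiffusionPipeline

    pipe = StableDiffusionPipeline.from_pretrained(
        "runwayml/stable-diffusion-v1-5", torch_dtype=torch.float16
    ).to("cuda")

    # load_lora_weights() patches the UNet attention processors and the
    # text encoder, and initializes the internal _lora_scale to 1.0.
    pipe.load_lora_weights("path/to/lora")  # placeholder path

    # The "scale" entry is read in __call__ as text_encoder_lora_scale and
    # passed to _encode_prompt(), so it also scales the text encoder LoRA.
    image = pipe(
        "a photo of a sks dog",
        num_inference_steps=25,
        cross_attention_kwargs={"scale": 0.5},
    ).images[0]

Reusing the existing `cross_attention_kwargs["scale"]` key keeps the public pipeline API unchanged while extending the scale's effect from the UNet to the text encoder.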