From 8e871c6fd69f6c661748cd98ee0710dc7ab5bc44 Mon Sep 17 00:00:00 2001 From: shauray8 Date: Sat, 22 Jul 2023 00:09:14 +0530 Subject: [PATCH 01/98] empty PR --- src/diffusers/pipelines/fabric/pipeline_fabric.py | 0 1 file changed, 0 insertions(+), 0 deletions(-) create mode 100644 src/diffusers/pipelines/fabric/pipeline_fabric.py diff --git a/src/diffusers/pipelines/fabric/pipeline_fabric.py b/src/diffusers/pipelines/fabric/pipeline_fabric.py new file mode 100644 index 000000000000..e69de29bb2d1 From fbc3af9548f31be8203d37b94980ae1809d397f8 Mon Sep 17 00:00:00 2001 From: shauray8 Date: Sun, 30 Jul 2023 11:27:09 +0530 Subject: [PATCH 02/98] init --- src/diffusers/pipelines/fabric/__init__.py | 23 +++++++++++++++++++ .../pipelines/fabric/pipeline_fabric.py | 15 ++++++++++++ 2 files changed, 38 insertions(+) create mode 100644 src/diffusers/pipelines/fabric/__init__.py diff --git a/src/diffusers/pipelines/fabric/__init__.py b/src/diffusers/pipelines/fabric/__init__.py new file mode 100644 index 000000000000..d5f7eb6b4fcc --- /dev/null +++ b/src/diffusers/pipelines/fabric/__init__.py @@ -0,0 +1,23 @@ +from ...utils import ( + OptionalDependencyNotAvailable, + is_flax_available, + is_torch_available, + is_transformers_available, +) + + +try: + if not (is_transformers_available() and is_torch_available()): + raise OptionalDependencyNotAvailable() +except OptionalDependencyNotAvailable: + from ...utils.dummy_torch_and_transformers_objects import * # noqa F403 +else: + from .multicontrolnet import MultiControlNetModel + from .pipeline_controlnet import StableDiffusionControlNetPipeline + from .pipeline_controlnet_img2img import StableDiffusionControlNetImg2ImgPipeline + from .pipeline_controlnet_inpaint import StableDiffusionControlNetInpaintPipeline + from .pipeline_controlnet_sd_xl import StableDiffusionXLControlNetPipeline + + +if is_transformers_available() and is_flax_available(): + from .pipeline_flax_controlnet import FlaxStableDiffusionControlNetPipeline diff --git a/src/diffusers/pipelines/fabric/pipeline_fabric.py b/src/diffusers/pipelines/fabric/pipeline_fabric.py index e69de29bb2d1..0bae7300e5f3 100644 --- a/src/diffusers/pipelines/fabric/pipeline_fabric.py +++ b/src/diffusers/pipelines/fabric/pipeline_fabric.py @@ -0,0 +1,15 @@ +import warnings +from typing import List, Optional, Union + +import torch +import torch.nn as nn +import torch.nn.functional as F +import numpy as np +from PIL import Image +from tqdm import tqdm +from diffusers import ( + StableDiffusionPipeline, + EulerAncestralDiscreteScheduler, +) +from diffusers.models.attention import BasicTransformerBlock +from diffusers.models.cross_attention import LoRACrossAttnProcessor From 2c8a0213817539d2faf0852c4b98b72f266c9e73 Mon Sep 17 00:00:00 2001 From: shauray8 Date: Tue, 1 Aug 2023 16:48:59 +0530 Subject: [PATCH 03/98] changes --- src/diffusers/__init__.py | 1 + src/diffusers/pipelines/__init__.py | 2 ++ src/diffusers/pipelines/fabric/__init__.py | 12 +----------- 3 files changed, 4 insertions(+), 11 deletions(-) diff --git a/src/diffusers/__init__.py b/src/diffusers/__init__.py index c7c7ac6fe859..45331ddfa5be 100644 --- a/src/diffusers/__init__.py +++ b/src/diffusers/__init__.py @@ -131,6 +131,7 @@ AltDiffusionPipeline, AudioLDMPipeline, CycleDiffusionPipeline, + FabricPipeline, IFImg2ImgPipeline, IFImg2ImgSuperResolutionPipeline, IFInpaintingPipeline, diff --git a/src/diffusers/pipelines/__init__.py b/src/diffusers/pipelines/__init__.py index 802ae4f5bc94..00cca6f7bbff 100644 --- a/src/diffusers/pipelines/__init__.py +++ 
b/src/diffusers/pipelines/__init__.py @@ -51,6 +51,8 @@ StableDiffusionControlNetInpaintPipeline, StableDiffusionControlNetPipeline, ) + from .fabric import FabricPipeline + from .deepfloyd_if import ( IFImg2ImgPipeline, IFImg2ImgSuperResolutionPipeline, diff --git a/src/diffusers/pipelines/fabric/__init__.py b/src/diffusers/pipelines/fabric/__init__.py index d5f7eb6b4fcc..e681ff9f1aeb 100644 --- a/src/diffusers/pipelines/fabric/__init__.py +++ b/src/diffusers/pipelines/fabric/__init__.py @@ -1,23 +1,13 @@ from ...utils import ( OptionalDependencyNotAvailable, - is_flax_available, is_torch_available, - is_transformers_available, ) - try: if not (is_transformers_available() and is_torch_available()): raise OptionalDependencyNotAvailable() except OptionalDependencyNotAvailable: from ...utils.dummy_torch_and_transformers_objects import * # noqa F403 else: - from .multicontrolnet import MultiControlNetModel - from .pipeline_controlnet import StableDiffusionControlNetPipeline - from .pipeline_controlnet_img2img import StableDiffusionControlNetImg2ImgPipeline - from .pipeline_controlnet_inpaint import StableDiffusionControlNetInpaintPipeline - from .pipeline_controlnet_sd_xl import StableDiffusionXLControlNetPipeline - + from .pipeline_fabric import FabricPipeline -if is_transformers_available() and is_flax_available(): - from .pipeline_flax_controlnet import FlaxStableDiffusionControlNetPipeline From ccfc546d5690ee6c0e6bab8548ff0252296783ac Mon Sep 17 00:00:00 2001 From: shauray8 Date: Wed, 2 Aug 2023 23:56:07 +0530 Subject: [PATCH 04/98] starting with the pipeline --- docs/source/en/_toctree.yml | 2 + .../api/pipelines/stable_diffusion/fabric.mdx | 50 +++++++++++++ .../pipelines/fabric/pipeline_fabric.py | 73 ++++++++++++++++--- 3 files changed, 113 insertions(+), 12 deletions(-) create mode 100644 docs/source/en/api/pipelines/stable_diffusion/fabric.mdx diff --git a/docs/source/en/_toctree.yml b/docs/source/en/_toctree.yml index f913341c4858..19c2e60a7d3a 100644 --- a/docs/source/en/_toctree.yml +++ b/docs/source/en/_toctree.yml @@ -206,6 +206,8 @@ title: DiffEdit - local: api/pipelines/dit title: DiT + - local: api/pipelines/fabric + title: FABRIC - local: api/pipelines/pix2pix title: InstructPix2Pix - local: api/pipelines/kandinsky diff --git a/docs/source/en/api/pipelines/stable_diffusion/fabric.mdx b/docs/source/en/api/pipelines/stable_diffusion/fabric.mdx new file mode 100644 index 000000000000..dc4996614ae0 --- /dev/null +++ b/docs/source/en/api/pipelines/stable_diffusion/fabric.mdx @@ -0,0 +1,50 @@ +## changes required + + +# Text-to-Image Generation + +## StableDiffusionPipeline + +The Stable Diffusion model was created by the researchers and engineers from [CompVis](https://github.com/CompVis), [Stability AI](https://stability.ai/), [runway](https://github.com/runwayml), and [LAION](https://laion.ai/). The [`StableDiffusionPipeline`] is capable of generating photo-realistic images given any text input using Stable Diffusion. 
+ +The original codebase can be found here: +- *Stable Diffusion V1*: [CompVis/stable-diffusion](https://github.com/CompVis/stable-diffusion) +- *Stable Diffusion v2*: [Stability-AI/stablediffusion](https://github.com/Stability-AI/stablediffusion) + +Available Checkpoints are: +- *stable-diffusion-v1-4 (512x512 resolution)* [CompVis/stable-diffusion-v1-4](https://huggingface.co/CompVis/stable-diffusion-v1-4) +- *stable-diffusion-v1-5 (512x512 resolution)* [runwayml/stable-diffusion-v1-5](https://huggingface.co/runwayml/stable-diffusion-v1-5) +- *stable-diffusion-2-base (512x512 resolution)*: [stabilityai/stable-diffusion-2-base](https://huggingface.co/stabilityai/stable-diffusion-2-base) +- *stable-diffusion-2 (768x768 resolution)*: [stabilityai/stable-diffusion-2](https://huggingface.co/stabilityai/stable-diffusion-2) +- *stable-diffusion-2-1-base (512x512 resolution)* [stabilityai/stable-diffusion-2-1-base](https://huggingface.co/stabilityai/stable-diffusion-2-1-base) +- *stable-diffusion-2-1 (768x768 resolution)*: [stabilityai/stable-diffusion-2-1](https://huggingface.co/stabilityai/stable-diffusion-2-1) + +[[autodoc]] StableDiffusionPipeline + - all + - __call__ + - enable_attention_slicing + - disable_attention_slicing + - enable_vae_slicing + - disable_vae_slicing + - enable_xformers_memory_efficient_attention + - disable_xformers_memory_efficient_attention + - enable_vae_tiling + - disable_vae_tiling + - load_textual_inversion + - from_single_file + - load_lora_weights + - save_lora_weights + +[[autodoc]] FlaxStableDiffusionPipeline + - all + - __call__ diff --git a/src/diffusers/pipelines/fabric/pipeline_fabric.py b/src/diffusers/pipelines/fabric/pipeline_fabric.py index 0bae7300e5f3..324c94bfcf12 100644 --- a/src/diffusers/pipelines/fabric/pipeline_fabric.py +++ b/src/diffusers/pipelines/fabric/pipeline_fabric.py @@ -1,15 +1,64 @@ -import warnings -from typing import List, Optional, Union +# Copyright 2023 The HuggingFace Team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+from dataclasses import dataclass +from typing import Any, Dict, List, Optional, Tuple, Union import torch -import torch.nn as nn -import torch.nn.functional as F -import numpy as np -from PIL import Image -from tqdm import tqdm -from diffusers import ( - StableDiffusionPipeline, - EulerAncestralDiscreteScheduler, +from torch import nn +from torch.nn import functional as F + +from ..configuration_utils import ConfigMixin, register_to_config +from ..utils import BaseOutput, logging +from .cross_attention import AttnProcessor +from .embeddings import TimestepEmbedding, Timesteps +from .modeling_utils import ModelMixin +from .unet_2d_blocks import ( + CrossAttnDownBlock2D, + DownBlock2D, + UNetMidBlock2DCrossAttn, + get_down_block, ) -from diffusers.models.attention import BasicTransformerBlock -from diffusers.models.cross_attention import LoRACrossAttnProcessor + + +logger = logging.get_logger(__name__) # pylint: disable=invalid-name + + +class AttentionBasedGenerator(nn.Module): + def __init__(self): + super().__init__() + pass + + def generate( + self, + prompt: Union[str, List[str]] = "a photo of an astronaut riding a horse on mars", + negative_prompt: Union[str, List[str]] = "", + liked: List[Image.Image] = [], + disliked: List[Image.Image] = [], + seed: int = 42, + n_images: int = 1, + guidance_scale: float = 8.0, + denoising_steps: int = 20, + feedback_start: float = 0.33, + feedback_end: float = 0.66, + min_weight: float = 0.1, + max_weight: float = 1.0, + neg_scale: float = 0.5, + pos_bottleneck_scale: float = 1.0, + neg_bottleneck_scale: float = 1.0, + ) + pass + + + + From c815605d3f9f68eefca5d3e56078fda8ec54c043 Mon Sep 17 00:00:00 2001 From: shauray8 Date: Thu, 3 Aug 2023 10:32:34 +0530 Subject: [PATCH 05/98] stable diff --- .../pipelines/fabric/pipeline_fabric.py | 69 +++++++++++++++++-- 1 file changed, 63 insertions(+), 6 deletions(-) diff --git a/src/diffusers/pipelines/fabric/pipeline_fabric.py b/src/diffusers/pipelines/fabric/pipeline_fabric.py index 324c94bfcf12..d29e352decdf 100644 --- a/src/diffusers/pipelines/fabric/pipeline_fabric.py +++ b/src/diffusers/pipelines/fabric/pipeline_fabric.py @@ -20,7 +20,10 @@ from ..configuration_utils import ConfigMixin, register_to_config from ..utils import BaseOutput, logging -from .cross_attention import AttnProcessor +from .cross_attention import LoRACrossAttnProcessor +from .attention import BasicTransformerBlock +from .pipelines import StableDiffusionPipeline +from .scheduler import EulerAncestralDiscreateScheduler from .embeddings import TimestepEmbedding, Timesteps from .modeling_utils import ModelMixin from .unet_2d_blocks import ( @@ -35,16 +38,66 @@ class AttentionBasedGenerator(nn.Module): - def __init__(self): + def __init__( + self, + model_name: Optional[str] = None, + model_ckpt: Optional[str] = None, + stable_diffusion_version: str = "1.5", + lora_weights: Optional[str] = None, + torch_dtype=torch.float32, + ): super().__init__() - pass + if stable_diffusion_version == "2.1": + warnings.warn("StableDiffusion v2.x is not supported and may give unexpected results.") + + if model_name is None: + if stable_diffusion_version == "1.5": + model_name = "runwayml/stable-diffusion-v1-5" + elif stable_diffusion_version == "2.1": + model_name = "stabilityai/stable-diffusion-2-1" + else: + raise ValueError( + f"Unknown stable diffusion version: {stable_diffusion_version}. 
Version must be either '1.5' or '2.1'" + ) + + scheduler = EulerAncestralDiscreteScheduler.from_pretrained(model_name, subfolder="scheduler") + + if model_ckpt is not None: + pipe = StableDiffusionPipeline.from_ckpt( + model_ckpt, + scheduler=scheduler, + torch_dtype=torch_dtype, + safety_checker=None, + ) + pipe.scheduler = scheduler + else: + pipe = StableDiffusionPipeline.from_pretrained( + model_name, + scheduler=scheduler, + torch_dtype=torch_dtype, + safety_checker=None, + ) + + if lora_weights: + print(f"Applying LoRA weights from {lora_weights}") + apply_unet_lora_weights( + pipeline=pipe, unet_path=lora_weights + ) + + self.pipeline = pipe + self.unet = pipe.unet + self.vae = pipe.vae + self.text_encoder = pipe.text_encoder + self.tokenizer = pipe.tokenizer + self.scheduler = scheduler + self.dtype = torch_dtype def generate( self, prompt: Union[str, List[str]] = "a photo of an astronaut riding a horse on mars", - negative_prompt: Union[str, List[str]] = "", - liked: List[Image.Image] = [], - disliked: List[Image.Image] = [], + negative_prompt: Optional[Union[str, List[str]]] = "", + liked: Optional[List[Image.Image]] = [], + disliked: Optional[List[Image.Image]] = [], seed: int = 42, n_images: int = 1, guidance_scale: float = 8.0, @@ -57,6 +110,10 @@ def generate( pos_bottleneck_scale: float = 1.0, neg_bottleneck_scale: float = 1.0, ) + + with tqdm(total=denoising_steps) as pbar: + for i, t in enumerate(timestamp): + pass From b65184b29e9a7e91955b6bb587c0fbdcd07bda75 Mon Sep 17 00:00:00 2001 From: shauray8 Date: Thu, 3 Aug 2023 20:45:52 +0530 Subject: [PATCH 06/98] prev --- src/diffusers/pipelines/fabric/pipeline_fabric.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/src/diffusers/pipelines/fabric/pipeline_fabric.py b/src/diffusers/pipelines/fabric/pipeline_fabric.py index d29e352decdf..ca671475db87 100644 --- a/src/diffusers/pipelines/fabric/pipeline_fabric.py +++ b/src/diffusers/pipelines/fabric/pipeline_fabric.py @@ -37,7 +37,7 @@ logger = logging.get_logger(__name__) # pylint: disable=invalid-name -class AttentionBasedGenerator(nn.Module): +class FabricModel(nn.Module): def __init__( self, model_name: Optional[str] = None, @@ -47,6 +47,7 @@ def __init__( torch_dtype=torch.float32, ): super().__init__() + # Getting UNet from Stable diffusion if stable_diffusion_version == "2.1": warnings.warn("StableDiffusion v2.x is not supported and may give unexpected results.") @@ -92,7 +93,7 @@ def __init__( self.scheduler = scheduler self.dtype = torch_dtype - def generate( + def forward( self, prompt: Union[str, List[str]] = "a photo of an astronaut riding a horse on mars", negative_prompt: Optional[Union[str, List[str]]] = "", From 1c4833c87f393b4030a880f7da3d707a5c3cd8c5 Mon Sep 17 00:00:00 2001 From: shauray8 Date: Sat, 5 Aug 2023 02:08:19 +0530 Subject: [PATCH 07/98] more things, getting started --- .../pipelines/fabric/pipeline_fabric.py | 23 +++++++++++++++++++ 1 file changed, 23 insertions(+) diff --git a/src/diffusers/pipelines/fabric/pipeline_fabric.py b/src/diffusers/pipelines/fabric/pipeline_fabric.py index ca671475db87..44afc884fd4d 100644 --- a/src/diffusers/pipelines/fabric/pipeline_fabric.py +++ b/src/diffusers/pipelines/fabric/pipeline_fabric.py @@ -111,6 +111,29 @@ def forward( pos_bottleneck_scale: float = 1.0, neg_bottleneck_scale: float = 1.0, ) + if seed is not None: + torch.manual_seed(seed) + + if liked and len(liked) > 0: + pass + else: + pos_latents = torch.tensor([], device=self.device, dtype=self.dtype) + + if disliked and 
len(disliked) > 0: + pass + else: + neg_latents = torch.Tensor([], device=self.device, dtype=self.dtype) + + if isinstance(prompt, str): + prompt = [prompt] * n_images + else: + assert len(prompts) == n_images + + if isinstance(negative_prompt, str): + negative_prompt = [negative_prompt] * n_images + else: + assert len(negative_prompts) == n_images + with tqdm(total=denoising_steps) as pbar: for i, t in enumerate(timestamp): From c9eb420199461e4c552e5f6dae6e7e69704d5d4c Mon Sep 17 00:00:00 2001 From: shauray8 Date: Sat, 5 Aug 2023 11:30:47 +0530 Subject: [PATCH 08/98] more functions --- src/diffusers/pipelines/fabric/__init__.py | 2 +- .../pipelines/fabric/pipeline_fabric.py | 417 +++++++++++++++++- 2 files changed, 395 insertions(+), 24 deletions(-) diff --git a/src/diffusers/pipelines/fabric/__init__.py b/src/diffusers/pipelines/fabric/__init__.py index e681ff9f1aeb..db76db7bc0bf 100644 --- a/src/diffusers/pipelines/fabric/__init__.py +++ b/src/diffusers/pipelines/fabric/__init__.py @@ -4,7 +4,7 @@ ) try: - if not (is_transformers_available() and is_torch_available()): + if not is_torch_available(): raise OptionalDependencyNotAvailable() except OptionalDependencyNotAvailable: from ...utils.dummy_torch_and_transformers_objects import * # noqa F403 diff --git a/src/diffusers/pipelines/fabric/pipeline_fabric.py b/src/diffusers/pipelines/fabric/pipeline_fabric.py index 44afc884fd4d..017c7a1ccfc8 100644 --- a/src/diffusers/pipelines/fabric/pipeline_fabric.py +++ b/src/diffusers/pipelines/fabric/pipeline_fabric.py @@ -18,12 +18,12 @@ from torch import nn from torch.nn import functional as F -from ..configuration_utils import ConfigMixin, register_to_config -from ..utils import BaseOutput, logging -from .cross_attention import LoRACrossAttnProcessor -from .attention import BasicTransformerBlock -from .pipelines import StableDiffusionPipeline -from .scheduler import EulerAncestralDiscreateScheduler +from ...configuration_utils import ConfigMixin, register_to_config +from ...utils import BaseOutput, logging +from ...models.cross_attention import LoRACrossAttnProcessor +from ...models.attention import BasicTransformerBlock +from ..stable_diffusion import StableDiffusionPipeline +from ...schedulers import EulerAncestralDiscreateScheduler from .embeddings import TimestepEmbedding, Timesteps from .modeling_utils import ModelMixin from .unet_2d_blocks import ( @@ -36,18 +36,118 @@ logger = logging.get_logger(__name__) # pylint: disable=invalid-name +def apply_unet_lora_weights(pipeline, unet_path): + model_weight = torch.load(unet_path, map_location="cpu") + unet = pipeline.unet + lora_attn_procs = {} + lora_rank = list( + set([v.size(0) for k, v in model_weight.items() if k.endswith("down.weight")]) + ) + assert len(lora_rank) == 1 + lora_rank = lora_rank[0] + for name in unet.attn_processors.keys(): + cross_attention_dim = ( + None + if name.endswith("attn1.processor") + else unet.config.cross_attention_dim + ) + if name.startswith("mid_block"): + hidden_size = unet.config.block_out_channels[-1] + elif name.startswith("up_blocks"): + block_id = int(name[len("up_blocks.")]) + hidden_size = list(reversed(unet.config.block_out_channels))[block_id] + elif name.startswith("down_blocks"): + block_id = int(name[len("down_blocks.")]) + hidden_size = unet.config.block_out_channels[block_id] + + lora_attn_procs[name] = LoRACrossAttnProcessor( + hidden_size=hidden_size, + cross_attention_dim=cross_attention_dim, + rank=lora_rank, + ).to(pipeline.device) + unet.set_attn_processor(lora_attn_procs) + 
unet.load_state_dict(model_weight, strict=False) + + +def attn_with_weights( + attn: nn.Module, + hidden_states, + encoder_hidden_states=None, + attention_mask=None, + weights=None, # shape: (batch_size, sequence_length) + lora_scale=1.0, +): + batch_size, sequence_length, _ = ( + hidden_states.shape + if encoder_hidden_states is None + else encoder_hidden_states.shape + ) + attention_mask = attn.prepare_attention_mask( + attention_mask, sequence_length, batch_size + ) + + if isinstance(attn.processor, LoRACrossAttnProcessor): + query = attn.to_q(hidden_states) + lora_scale * attn.processor.to_q_lora( + hidden_states + ) + else: + query = attn.to_q(hidden_states) + + if encoder_hidden_states is None: + encoder_hidden_states = hidden_states + elif attn.norm_cross: + encoder_hidden_states = attn.norm_encoder_hidden_states(encoder_hidden_states) + + if isinstance(attn.processor, LoRACrossAttnProcessor): + key = attn.to_k(encoder_hidden_states) + lora_scale * attn.processor.to_k_lora( + encoder_hidden_states + ) + value = attn.to_v( + encoder_hidden_states + ) + lora_scale * attn.processor.to_v_lora(encoder_hidden_states) + else: + key = attn.to_k(encoder_hidden_states) + value = attn.to_v(encoder_hidden_states) + + query = attn.head_to_batch_dim(query) + key = attn.head_to_batch_dim(key) + value = attn.head_to_batch_dim(value) + + attention_probs = attn.get_attention_scores(query, key, attention_mask) + + if weights is not None: + if weights.shape[0] != 1: + weights = weights.repeat_interleave(attn.heads, dim=0) + attention_probs = attention_probs * weights[:, None] + attention_probs = attention_probs / attention_probs.sum(dim=-1, keepdim=True) + + hidden_states = torch.bmm(attention_probs, value) + hidden_states = attn.batch_to_head_dim(hidden_states) + + # linear proj + if isinstance(attn.processor, LoRACrossAttnProcessor): + hidden_states = attn.to_out[0]( + hidden_states + ) + lora_scale * attn.processor.to_out_lora(hidden_states) + else: + hidden_states = attn.to_out[0](hidden_states) + # dropout + hidden_states = attn.to_out[1](hidden_states) + + return hidden_states + -class FabricModel(nn.Module): +class Fabric(nn.Module): def __init__( self, model_name: Optional[str] = None, model_ckpt: Optional[str] = None, stable_diffusion_version: str = "1.5", lora_weights: Optional[str] = None, - torch_dtype=torch.float32, + torch_dtype=torch.float32 ): super().__init__() - # Getting UNet from Stable diffusion + if stable_diffusion_version == "2.1": warnings.warn("StableDiffusion v2.x is not supported and may give unexpected results.") @@ -93,12 +193,166 @@ def __init__( self.scheduler = scheduler self.dtype = torch_dtype + @property + def device(self): + return next(self.parameters()).device + + def to(self, device): + self.pipeline.to(device) + return super().to(device) + + def initialize_prompts(self, prompts: List[str]): + prompt_tokens = self.tokenizer( + prompts, + return_tensors="pt", + max_length=self.tokenizer.model_max_length, + padding="max_length", + truncation=True, + ) + + if ( + hasattr(self.text_encoder.config, "use_attention_mask") + and self.text_encoder.config.use_attention_mask + ): + attention_mask = prompt_tokens.attention_mask.to(self.device) + else: + attention_mask = None + + prompt_embd = self.text_encoder( + input_ids=prompt_tokens.input_ids.to(self.device), + attention_mask=attention_mask, + ).last_hidden_state + + return prompt_embd + + def get_unet_hidden_states(self, z_all, t, prompt_embd): + cached_hidden_states = [] + for module in self.unet.modules(): + if 
isinstance(module, BasicTransformerBlock): + + def new_forward(self, hidden_states, *args, **kwargs): + cached_hidden_states.append(hidden_states.clone().detach().cpu()) + return self.old_forward(hidden_states, *args, **kwargs) + + module.attn1.old_forward = module.attn1.forward + module.attn1.forward = new_forward.__get__(module.attn1) + + # run forward pass to cache hidden states, output can be discarded + _ = self.unet(z_all, t, encoder_hidden_states=prompt_embd) + + # restore original forward pass + for module in self.unet.modules(): + if isinstance(module, BasicTransformerBlock): + module.attn1.forward = module.attn1.old_forward + del module.attn1.old_forward + + return cached_hidden_states + + def unet_forward_with_cached_hidden_states( + self, + z_all, + t, + prompt_embd, + cached_pos_hiddens: Optional[List[torch.Tensor]] = None, + cached_neg_hiddens: Optional[List[torch.Tensor]] = None, + pos_weights=(0.8, 0.8), + neg_weights=(0.5, 0.5), + ): + if cached_pos_hiddens is None and cached_neg_hiddens is None: + return self.unet(z_all, t, encoder_hidden_states=prompt_embd) + + local_pos_weights = torch.linspace( + *pos_weights, steps=len(self.unet.down_blocks) + 1 + )[:-1].tolist() + local_neg_weights = torch.linspace( + *neg_weights, steps=len(self.unet.down_blocks) + 1 + )[:-1].tolist() + + for block, pos_weight, neg_weight in zip( + self.unet.down_blocks + [self.unet.mid_block] + self.unet.up_blocks, + local_pos_weights + [pos_weights[1]] + local_pos_weights[::-1], + local_neg_weights + [neg_weights[1]] + local_neg_weights[::-1], + ): + for module in block.modules(): + if isinstance(module, BasicTransformerBlock): + + def new_forward( + self, + hidden_states, + pos_weight=pos_weight, + neg_weight=neg_weight, + **kwargs, + ): + cond_hiddens, uncond_hiddens = hidden_states.chunk(2, dim=0) + batch_size, d_model = cond_hiddens.shape[:2] + device, dtype = hidden_states.device, hidden_states.dtype + + weights = torch.ones( + batch_size, d_model, device=device, dtype=dtype + ) + + if cached_pos_hiddens is not None: + cached_pos_hs = cached_pos_hiddens.pop(0).to( + hidden_states.device + ) + cond_pos_hs = torch.cat( + [cond_hiddens, cached_pos_hs], dim=1 + ) + pos_weights = weights.clone().repeat( + 1, 1 + cached_pos_hs.shape[1] // d_model + ) + pos_weights[:, d_model:] = pos_weight + out_pos = attn_with_weights( + self, + cond_hiddens, + encoder_hidden_states=cond_pos_hs, + weights=pos_weights, + ) + else: + out_pos = self.old_forward(cond_hiddens) + + if cached_neg_hiddens is not None: + cached_neg_hs = cached_neg_hiddens.pop(0).to( + hidden_states.device + ) + uncond_neg_hs = torch.cat( + [uncond_hiddens, cached_neg_hs], dim=1 + ) + neg_weights = weights.clone().repeat( + 1, 1 + cached_neg_hs.shape[1] // d_model + ) + neg_weights[:, d_model:] = neg_weight + out_neg = attn_with_weights( + self, + uncond_hiddens, + encoder_hidden_states=uncond_neg_hs, + weights=neg_weights, + ) + else: + out_neg = self.old_forward(uncond_hiddens) + + out = torch.cat([out_pos, out_neg], dim=0) + return out + + module.attn1.old_forward = module.attn1.forward + module.attn1.forward = new_forward.__get__(module.attn1) + + out = self.unet(z_all, t, encoder_hidden_states=prompt_embd) + + # restore original forward pass + for module in self.unet.modules(): + if isinstance(module, BasicTransformerBlock): + module.attn1.forward = module.attn1.old_forward + del module.attn1.old_forward + + return out + def forward( - self, + self, prompt: Union[str, List[str]] = "a photo of an astronaut riding a horse on mars", - 
negative_prompt: Optional[Union[str, List[str]]] = "", - liked: Optional[List[Image.Image]] = [], - disliked: Optional[List[Image.Image]] = [], + negative_prompt: Union[str, List[str]] = "", + liked: List[Image.Image] = [], + disliked: List[Image.Image] = [], seed: int = 42, n_images: int = 1, guidance_scale: float = 8.0, @@ -110,36 +364,153 @@ def forward( neg_scale: float = 0.5, pos_bottleneck_scale: float = 1.0, neg_bottleneck_scale: float = 1.0, - ) + ): + """ + Generate a trajectory of images with binary feedback. + The feedback can be given as a list of liked and disliked images. + """ if seed is not None: torch.manual_seed(seed) + z = torch.randn(n_images, 4, 64, 64, device=self.device, dtype=self.dtype) + if liked and len(liked) > 0: - pass + pos_images = [self.image_to_tensor(img) for img in liked] + pos_images = torch.stack(pos_images).to(self.device, dtype=self.dtype) + pos_latents = ( + self.vae.config.scaling_factor + * self.vae.encode(pos_images).latent_dist.sample() + ) else: pos_latents = torch.tensor([], device=self.device, dtype=self.dtype) if disliked and len(disliked) > 0: - pass + neg_images = [self.image_to_tensor(img) for img in disliked] + neg_images = torch.stack(neg_images).to(self.device, dtype=self.dtype) + neg_latents = ( + self.vae.config.scaling_factor + * self.vae.encode(neg_images).latent_dist.sample() + ) else: - neg_latents = torch.Tensor([], device=self.device, dtype=self.dtype) + neg_latents = torch.tensor([], device=self.device, dtype=self.dtype) if isinstance(prompt, str): prompt = [prompt] * n_images else: - assert len(prompts) == n_images - + assert len(prompt) == n_images if isinstance(negative_prompt, str): negative_prompt = [negative_prompt] * n_images else: - assert len(negative_prompts) == n_images + assert len(negative_prompt) == n_images + + ( + cond_prompt_embs, + uncond_prompt_embs, + null_prompt_emb, + ) = self.initialize_prompts(prompt + negative_prompt + [""]).split([n_images, n_images, 1]) + batched_prompt_embd = torch.cat([cond_prompt_embs, uncond_prompt_embs], dim=0) + self.scheduler.set_timesteps(denoising_steps, device=self.device) + timesteps = self.scheduler.timesteps + + z = z * self.scheduler.init_noise_sigma + + num_warmup_steps = len(timesteps) - denoising_steps * self.scheduler.order + + ref_start_idx = round(len(timesteps) * feedback_start) + ref_end_idx = round(len(timesteps) * feedback_end) with tqdm(total=denoising_steps) as pbar: - for i, t in enumerate(timestamp): - - pass + for i, t in enumerate(timesteps): + if hasattr(self.scheduler, "sigma_t"): + sigma = self.scheduler.sigma_t[t] + elif hasattr(self.scheduler, "sigmas"): + sigma = self.scheduler.sigmas[i] + else: + sigma = 0 + alpha_hat = 1 / (sigma**2 + 1) + + z_single = self.scheduler.scale_model_input(z, t) + z_all = torch.cat([z_single] * 2, dim=0) + z_ref = torch.cat([pos_latents, neg_latents], dim=0) + + if i >= ref_start_idx and i <= ref_end_idx: + weight = max_weight + else: + weight = min_weight + pos_ws = (weight, weight * pos_bottleneck_scale) + neg_ws = (weight * neg_scale, weight * neg_scale * neg_bottleneck_scale) + + if z_ref.size(0) > 0 and weight > 0: + noise = torch.randn_like(z_ref) + if isinstance(self.scheduler, EulerAncestralDiscreteScheduler): + z_ref_noised = ( + alpha_hat**0.5 * z_ref + (1 - alpha_hat) ** 0.5 * noise + ) + else: + z_ref_noised = self.scheduler.add_noise(z_ref, noise, t) + + ref_prompt_embd = torch.cat([null_prompt_emb] * (pos_latents.size(0) + neg_latents.size(0)), dim=0) + + cached_hidden_states = 
self.get_unet_hidden_states( + z_ref_noised, t, ref_prompt_embd + ) + + n_pos, n_neg = pos_latents.shape[0], neg_latents.shape[0] + cached_pos_hs, cached_neg_hs = [], [] + for hs in cached_hidden_states: + cached_pos, cached_neg = hs.split([n_pos, n_neg], dim=0) + cached_pos = cached_pos.view( + 1, -1, *cached_pos.shape[2:] + ).expand(n_images, -1, -1) + cached_neg = cached_neg.view( + 1, -1, *cached_neg.shape[2:] + ).expand(n_images, -1, -1) + cached_pos_hs.append(cached_pos) + cached_neg_hs.append(cached_neg) + + if n_pos == 0: + cached_pos_hs = None + if n_neg == 0: + cached_neg_hs = None + else: + cached_pos_hs, cached_neg_hs = None, None + + unet_out = self.unet_forward_with_cached_hidden_states( + z_all, + t, + prompt_embd=batched_prompt_embd, + cached_pos_hiddens=cached_pos_hs, + cached_neg_hiddens=cached_neg_hs, + pos_weights=pos_ws, + neg_weights=neg_ws, + ).sample + + noise_cond, noise_uncond = unet_out.chunk(2) + guidance = noise_cond - noise_uncond + noise_pred = noise_uncond + guidance_scale * guidance + z = self.scheduler.step(noise_pred, t, z).prev_sample + if i == len(timesteps) - 1 or ( + (i + 1) > num_warmup_steps and (i + 1) % self.scheduler.order == 0 + ): + pbar.update() + y = self.pipeline.decode_latents(z) + imgs = self.pipeline.numpy_to_pil(y) + return imgs + @staticmethod + def image_to_tensor(image: Union[str, Image.Image]): + """ + Convert a PIL image to a torch tensor. + """ + if isinstance(image, str): + image = Image.open(image) + if not image.mode == "RGB": + image = image.convert("RGB") + image = image.resize((512, 512)) + image = np.array(image).astype(np.uint8) + image = (image / 127.5 - 1.0).astype(np.float32) + return torch.from_numpy(image).permute(2, 0, 1) From 7bec286f576271e3fddb96a0bc1448e7faaa478a Mon Sep 17 00:00:00 2001 From: shauray8 Date: Wed, 9 Aug 2023 16:34:53 +0530 Subject: [PATCH 09/98] makeing it more readable --- .../pipelines/fabric/pipeline_fabric.py | 111 ++++++++---------- 1 file changed, 49 insertions(+), 62 deletions(-) diff --git a/src/diffusers/pipelines/fabric/pipeline_fabric.py b/src/diffusers/pipelines/fabric/pipeline_fabric.py index 017c7a1ccfc8..af02389c05eb 100644 --- a/src/diffusers/pipelines/fabric/pipeline_fabric.py +++ b/src/diffusers/pipelines/fabric/pipeline_fabric.py @@ -20,6 +20,7 @@ from ...configuration_utils import ConfigMixin, register_to_config from ...utils import BaseOutput, logging +from ...pipelines import DissusionPipeline from ...models.cross_attention import LoRACrossAttnProcessor from ...models.attention import BasicTransformerBlock from ..stable_diffusion import StableDiffusionPipeline @@ -137,12 +138,12 @@ def attn_with_weights( return hidden_states -class Fabric(nn.Module): +class Fabric(DiffusionPipeline): def __init__( self, model_name: Optional[str] = None, - model_ckpt: Optional[str] = None, stable_diffusion_version: str = "1.5", + scheduler: EulerAncestralDiscreteScheduler, lora_weights: Optional[str] = None, torch_dtype=torch.float32 ): @@ -163,21 +164,12 @@ def __init__( scheduler = EulerAncestralDiscreteScheduler.from_pretrained(model_name, subfolder="scheduler") - if model_ckpt is not None: - pipe = StableDiffusionPipeline.from_ckpt( - model_ckpt, - scheduler=scheduler, - torch_dtype=torch_dtype, - safety_checker=None, - ) - pipe.scheduler = scheduler - else: - pipe = StableDiffusionPipeline.from_pretrained( - model_name, - scheduler=scheduler, - torch_dtype=torch_dtype, - safety_checker=None, - ) + pipe = StableDiffusionPipeline.from_pretrained( + model_name, + scheduler=scheduler, + 
torch_dtype=torch_dtype, + safety_checker=None, + ) if lora_weights: print(f"Applying LoRA weights from {lora_weights}") @@ -347,18 +339,29 @@ def new_forward( return out - def forward( + def preprocess_feedback_images(images, vae) -> torch.tensor: + images_t = [self.image_to_tensor(img) for img in images] + images_t = torch.stack(images_t).to(self.device, dtype=self.dtype) + latents = ( + vae.config.scaling_factor + * vae.encode(iamges_t).latent_dist.sample() + ) + return latents + + @torch.no_grad() + + def __call__( self, - prompt: Union[str, List[str]] = "a photo of an astronaut riding a horse on mars", - negative_prompt: Union[str, List[str]] = "", - liked: List[Image.Image] = [], - disliked: List[Image.Image] = [], - seed: int = 42, + prompt: Optional[Union[str, List[str]]] = None, + negative_prompt: Optional[Union[str, List[str]]] = None, + liked: Optional[List[Image.Image]] = None, + disliked: Optional[List[Image.Image]] = None, + random_seed: int = 42, n_images: int = 1, guidance_scale: float = 8.0, denoising_steps: int = 20, - feedback_start: float = 0.33, - feedback_end: float = 0.66, + feedback_start_ratio: float = 0.33, + feedback_end_ratio: float = 0.66, min_weight: float = 0.1, max_weight: float = 1.0, neg_scale: float = 0.5, @@ -369,30 +372,15 @@ def forward( Generate a trajectory of images with binary feedback. The feedback can be given as a list of liked and disliked images. """ - if seed is not None: + + if random_seed is not None: torch.manual_seed(seed) - z = torch.randn(n_images, 4, 64, 64, device=self.device, dtype=self.dtype) + latent_noise = torch.randn(n_images, 4, 64, 64, device=self.device, dtype=self.dtype) - if liked and len(liked) > 0: - pos_images = [self.image_to_tensor(img) for img in liked] - pos_images = torch.stack(pos_images).to(self.device, dtype=self.dtype) - pos_latents = ( - self.vae.config.scaling_factor - * self.vae.encode(pos_images).latent_dist.sample() - ) - else: - pos_latents = torch.tensor([], device=self.device, dtype=self.dtype) - - if disliked and len(disliked) > 0: - neg_images = [self.image_to_tensor(img) for img in disliked] - neg_images = torch.stack(neg_images).to(self.device, dtype=self.dtype) - neg_latents = ( - self.vae.config.scaling_factor - * self.vae.encode(neg_images).latent_dist.sample() - ) - else: - neg_latents = torch.tensor([], device=self.device, dtype=self.dtype) + positive_letents = self.preprocess_feedback_images(liked,self.vae) if liked and len(liked)>0 else torch.tensor([], device=self.device, dtype=self.dtype) + + negative_letents = self.preprocess_feedback_images(disliked,self.vae) if disliked and len(disliked)>0 else torch.tensor([], device=self.device, dtype=self.dtype) if isinstance(prompt, str): prompt = [prompt] * n_images @@ -403,43 +391,42 @@ def forward( else: assert len(negative_prompt) == n_images - ( - cond_prompt_embs, - uncond_prompt_embs, - null_prompt_emb, - ) = self.initialize_prompts(prompt + negative_prompt + [""]).split([n_images, n_images, 1]) + + (cond_prompt_embs, uncond_prompt_embs, null_prompt_emb) = self.initialize_prompts(prompt + negative_prompt + [""]).split([n_images, n_images, 1]) + batched_prompt_embd = torch.cat([cond_prompt_embs, uncond_prompt_embs], dim=0) self.scheduler.set_timesteps(denoising_steps, device=self.device) timesteps = self.scheduler.timesteps - z = z * self.scheduler.init_noise_sigma + latent_noise = latent_noise * self.scheduler.init_noise_sigma num_warmup_steps = len(timesteps) - denoising_steps * self.scheduler.order - ref_start_idx = round(len(timesteps) * 
feedback_start) - ref_end_idx = round(len(timesteps) * feedback_end) + ref_start_idx = round(len(timesteps) * feedback_start_ratio) + ref_end_idx = round(len(timesteps) * feedback_end_ratio) with tqdm(total=denoising_steps) as pbar: for i, t in enumerate(timesteps): - if hasattr(self.scheduler, "sigma_t"): - sigma = self.scheduler.sigma_t[t] - elif hasattr(self.scheduler, "sigmas"): + sigma = self.scheduler.sigma_t[t] if hasattr(self.scheduler, 'sigma_t') else 0 + if hasattr(self.scheduler, "sigmas"): sigma = self.scheduler.sigmas[i] - else: - sigma = 0 + alpha_hat = 1 / (sigma**2 + 1) - z_single = self.scheduler.scale_model_input(z, t) + z_single = self.scheduler.scale_model_input(latent_noise, t) z_all = torch.cat([z_single] * 2, dim=0) z_ref = torch.cat([pos_latents, neg_latents], dim=0) + weight_factor = self.get_current_weight_factor(i, denoising_step, ref_start_idx, + ref_end_idx, min_weight, max_weight) if i >= ref_start_idx and i <= ref_end_idx: weight = max_weight else: weight = min_weight - pos_ws = (weight, weight * pos_bottleneck_scale) - neg_ws = (weight * neg_scale, weight * neg_scale * neg_bottleneck_scale) + + pos_ws = (weight_factor, weight_factor * pos_bottleneck_scale) + neg_ws = (weight_factor * neg_scale, weight_factor * neg_scale * neg_bottleneck_scale) if z_ref.size(0) > 0 and weight > 0: noise = torch.randn_like(z_ref) From adebe8f25c931c05c162b45ff185c9cded4f3a86 Mon Sep 17 00:00:00 2001 From: shauray8 Date: Fri, 11 Aug 2023 01:52:06 +0530 Subject: [PATCH 10/98] almost done testing --- src/diffusers/pipelines/fabric/__init__.py | 22 +++ .../pipelines/fabric/pipeline_fabric.py | 137 +++++++++--------- 2 files changed, 88 insertions(+), 71 deletions(-) diff --git a/src/diffusers/pipelines/fabric/__init__.py b/src/diffusers/pipelines/fabric/__init__.py index db76db7bc0bf..ca8e828d2ae6 100644 --- a/src/diffusers/pipelines/fabric/__init__.py +++ b/src/diffusers/pipelines/fabric/__init__.py @@ -1,7 +1,29 @@ +from dataclasses import dataclass from ...utils import ( + BaseOutput, OptionalDependencyNotAvailable, is_torch_available, ) +from typing import Union, Optional, List +import numpy as np +import PIL + +@dataclass +class FabricPipelineOutput(BaseOutput): + """ + Output class for Stable Diffusion pipelines. + + Args: + images (`List[PIL.Image.Image]` or `np.ndarray`) + List of denoised PIL images of length `batch_size` or numpy array of shape `(batch_size, height, width, + num_channels)`. PIL images or numpy array present the denoised images of the diffusion pipeline. + nsfw_content_detected (`Optional[List[bool]]`) + List of flags denoting whether the corresponding generated image likely represents "not-safe-for-work" + (nsfw) content, or `None` if safety checking could not be performed. + """ + + images: Union[List[PIL.Image.Image], np.ndarray] + nsfw_content_detected: Optional[List[bool]] try: if not is_torch_available(): diff --git a/src/diffusers/pipelines/fabric/pipeline_fabric.py b/src/diffusers/pipelines/fabric/pipeline_fabric.py index af02389c05eb..20da34bb1163 100644 --- a/src/diffusers/pipelines/fabric/pipeline_fabric.py +++ b/src/diffusers/pipelines/fabric/pipeline_fabric.py @@ -33,6 +33,7 @@ UNetMidBlock2DCrossAttn, get_down_block, ) +from . 
import FabricPipelineOutput logger = logging.get_logger(__name__) # pylint: disable=invalid-name @@ -143,7 +144,7 @@ def __init__( self, model_name: Optional[str] = None, stable_diffusion_version: str = "1.5", - scheduler: EulerAncestralDiscreteScheduler, + scheduler: EulerAncestralDiscreteScheduler = EulerAncestralDiscreteScheduler, lora_weights: Optional[str] = None, torch_dtype=torch.float32 ): @@ -194,28 +195,30 @@ def to(self, device): return super().to(device) def initialize_prompts(self, prompts: List[str]): - prompt_tokens = self.tokenizer( - prompts, - return_tensors="pt", - max_length=self.tokenizer.model_max_length, - padding="max_length", - truncation=True, - ) - - if ( - hasattr(self.text_encoder.config, "use_attention_mask") - and self.text_encoder.config.use_attention_mask - ): - attention_mask = prompt_tokens.attention_mask.to(self.device) - else: - attention_mask = None - - prompt_embd = self.text_encoder( - input_ids=prompt_tokens.input_ids.to(self.device), - attention_mask=attention_mask, - ).last_hidden_state - - return prompt_embd + # Breaking into individual prompts feels memory efficient + prompt_embed_list = [] + for prompt in prompts: + prompt_tokens = self.tokenizer( + prompt, + return_tensors="pt", + max_length=self.tokenizer.model_max_length, + padding="max_length", + truncation=True, + ) + + attention_mask = prompt_tokens.attention_mask.to(self.device) if ( + hasattr(self.text_encoder.config, "use_attention_mask") + and self.text_encoder.config.use_attention_mask + ) else None + + prompt_embd = self.text_encoder( + input_ids=prompt_tokens.input_ids.to(self.device), + attention_mask=attention_mask, + ).last_hidden_state + + prompt_embed_list.append(prompt_embd) + + return torch.cat(prompt_embed_list, dim=0) def get_unet_hidden_states(self, z_all, t, prompt_embd): cached_hidden_states = [] @@ -254,11 +257,31 @@ def unet_forward_with_cached_hidden_states( return self.unet(z_all, t, encoder_hidden_states=prompt_embd) local_pos_weights = torch.linspace( - *pos_weights, steps=len(self.unet.down_blocks) + 1 - )[:-1].tolist() + *pos_weights, steps=len(self.unet.down_blocks) + 1)[:-1] local_neg_weights = torch.linspace( - *neg_weights, steps=len(self.unet.down_blocks) + 1 - )[:-1].tolist() + *neg_weights, steps=len(self.unet.down_blocks) + 1)[:-1] + + def new_forward_caching(module, hidden_states, cached_hiddens, weight, is_positive): + cached_hs = cached_hiddens.pop(0).to( + hidden_states.device + ) + cond_hs = torch.cat( + [hidden_states, cached_hs], dim=1 + ) + weights = weights.clone().repeat( + 1, 1 + cached_pos_hs.shape[1] // d_model + ) + weights = torch.full((cond_hs.size(0), cond_hs.size(1) // hidden_states.size(1)), + weight, device=hidden_states.device) + weights[:, hidden_states.size(1):] = 1.0 + out = attn_with_weights( + self, + hidden_states, + encoder_hidden_states=cond_hs, + weights=weights, + ) + return out + for block, pos_weight, neg_weight in zip( self.unet.down_blocks + [self.unet.mid_block] + self.unet.up_blocks, @@ -283,45 +306,19 @@ def new_forward( batch_size, d_model, device=device, dtype=dtype ) + out_pos = self.old_forward(hidden_states) + out_neg = self.old_forward(hidden_states) + if cached_pos_hiddens is not None: - cached_pos_hs = cached_pos_hiddens.pop(0).to( - hidden_states.device - ) - cond_pos_hs = torch.cat( - [cond_hiddens, cached_pos_hs], dim=1 - ) - pos_weights = weights.clone().repeat( - 1, 1 + cached_pos_hs.shape[1] // d_model - ) - pos_weights[:, d_model:] = pos_weight - out_pos = attn_with_weights( - self, - cond_hiddens, - 
encoder_hidden_states=cond_pos_hs, - weights=pos_weights, - ) - else: - out_pos = self.old_forward(cond_hiddens) + out_pos = new_forward_caching( + self, hidden_states, cached_pos_hiddens, + pos_weight, is_positive=True) + if cached_neg_hiddens is not None: - cached_neg_hs = cached_neg_hiddens.pop(0).to( - hidden_states.device - ) - uncond_neg_hs = torch.cat( - [uncond_hiddens, cached_neg_hs], dim=1 - ) - neg_weights = weights.clone().repeat( - 1, 1 + cached_neg_hs.shape[1] // d_model - ) - neg_weights[:, d_model:] = neg_weight - out_neg = attn_with_weights( - self, - uncond_hiddens, - encoder_hidden_states=uncond_neg_hs, - weights=neg_weights, - ) - else: - out_neg = self.old_forward(uncond_hiddens) + out_neg = new_forward_caching( + self, hidden_states, cached_neg_hiddens, + neg_weight, is_positive=False) out = torch.cat([out_pos, out_neg], dim=0) return out @@ -418,17 +415,15 @@ def __call__( z_all = torch.cat([z_single] * 2, dim=0) z_ref = torch.cat([pos_latents, neg_latents], dim=0) - weight_factor = self.get_current_weight_factor(i, denoising_step, ref_start_idx, - ref_end_idx, min_weight, max_weight) if i >= ref_start_idx and i <= ref_end_idx: - weight = max_weight + weight_factor = max_weight else: - weight = min_weight + weight_factor = min_weight pos_ws = (weight_factor, weight_factor * pos_bottleneck_scale) neg_ws = (weight_factor * neg_scale, weight_factor * neg_scale * neg_bottleneck_scale) - if z_ref.size(0) > 0 and weight > 0: + if z_ref.size(0) > 0 and weight_factor > 0: noise = torch.randn_like(z_ref) if isinstance(self.scheduler, EulerAncestralDiscreteScheduler): z_ref_noised = ( @@ -437,13 +432,13 @@ def __call__( else: z_ref_noised = self.scheduler.add_noise(z_ref, noise, t) - ref_prompt_embd = torch.cat([null_prompt_emb] * (pos_latents.size(0) + neg_latents.size(0)), dim=0) + ref_prompt_embd = torch.cat([null_prompt_emb] * (len(posotive_latents) + len(negative_latents)), dim=0) cached_hidden_states = self.get_unet_hidden_states( z_ref_noised, t, ref_prompt_embd ) - n_pos, n_neg = pos_latents.shape[0], neg_latents.shape[0] + n_pos, n_neg = positive_latents.shape[0], negative_latents.shape[0] cached_pos_hs, cached_neg_hs = [], [] for hs in cached_hidden_states: cached_pos, cached_neg = hs.split([n_pos, n_neg], dim=0) @@ -486,7 +481,7 @@ def __call__( y = self.pipeline.decode_latents(z) imgs = self.pipeline.numpy_to_pil(y) - return imgs + return FabricPipelineOutpur(imgs) @staticmethod def image_to_tensor(image: Union[str, Image.Image]): From a31699f6b868ce05eeea4acb18c1bca5aed5686e Mon Sep 17 00:00:00 2001 From: shauray8 Date: Fri, 11 Aug 2023 02:02:59 +0530 Subject: [PATCH 11/98] var changes --- .../pipelines/fabric/pipeline_fabric.py | 17 ++++++----------- 1 file changed, 6 insertions(+), 11 deletions(-) diff --git a/src/diffusers/pipelines/fabric/pipeline_fabric.py b/src/diffusers/pipelines/fabric/pipeline_fabric.py index 20da34bb1163..dffecc13962e 100644 --- a/src/diffusers/pipelines/fabric/pipeline_fabric.py +++ b/src/diffusers/pipelines/fabric/pipeline_fabric.py @@ -17,24 +17,19 @@ import torch from torch import nn from torch.nn import functional as F +from PIL import Image +import numpy as np from ...configuration_utils import ConfigMixin, register_to_config from ...utils import BaseOutput, logging -from ...pipelines import DissusionPipeline from ...models.cross_attention import LoRACrossAttnProcessor from ...models.attention import BasicTransformerBlock from ..stable_diffusion import StableDiffusionPipeline -from ...schedulers import 
EulerAncestralDiscreateScheduler -from .embeddings import TimestepEmbedding, Timesteps -from .modeling_utils import ModelMixin -from .unet_2d_blocks import ( - CrossAttnDownBlock2D, - DownBlock2D, - UNetMidBlock2DCrossAttn, - get_down_block, -) +from ...schedulers import EulerAncestralDiscreteScheduler from . import FabricPipelineOutput +from ..pipeline_utils import DiffusionPipeline, ImagePipelineOutput + logger = logging.get_logger(__name__) # pylint: disable=invalid-name @@ -139,7 +134,7 @@ def attn_with_weights( return hidden_states -class Fabric(DiffusionPipeline): +class FabricPipeline(DiffusionPipeline): def __init__( self, model_name: Optional[str] = None, From b94cbf409be1b7535772c093fcb3e9abfa6f5cf3 Mon Sep 17 00:00:00 2001 From: shauray8 Date: Fri, 11 Aug 2023 02:46:29 +0530 Subject: [PATCH 12/98] testing --- src/diffusers/pipelines/fabric/pipeline_fabric.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/diffusers/pipelines/fabric/pipeline_fabric.py b/src/diffusers/pipelines/fabric/pipeline_fabric.py index dffecc13962e..efa84fd2e28e 100644 --- a/src/diffusers/pipelines/fabric/pipeline_fabric.py +++ b/src/diffusers/pipelines/fabric/pipeline_fabric.py @@ -366,7 +366,7 @@ def __call__( """ if random_seed is not None: - torch.manual_seed(seed) + torch.manual_seed(random_seed) latent_noise = torch.randn(n_images, 4, 64, 64, device=self.device, dtype=self.dtype) From 2999f40bcac18434b6e99ec922841ef1456a6ecb Mon Sep 17 00:00:00 2001 From: shauray8 Date: Fri, 11 Aug 2023 11:44:33 +0530 Subject: [PATCH 13/98] device --- .../pipelines/fabric/pipeline_fabric.py | 43 ++++++++++--------- 1 file changed, 23 insertions(+), 20 deletions(-) diff --git a/src/diffusers/pipelines/fabric/pipeline_fabric.py b/src/diffusers/pipelines/fabric/pipeline_fabric.py index efa84fd2e28e..83e97b5284ad 100644 --- a/src/diffusers/pipelines/fabric/pipeline_fabric.py +++ b/src/diffusers/pipelines/fabric/pipeline_fabric.py @@ -19,6 +19,7 @@ from torch.nn import functional as F from PIL import Image import numpy as np +from tqdm import tqdm from ...configuration_utils import ConfigMixin, register_to_config from ...utils import BaseOutput, logging @@ -181,13 +182,13 @@ def __init__( self.scheduler = scheduler self.dtype = torch_dtype - @property - def device(self): - return next(self.parameters()).device + #@property + #def device(self): + # return next(self.parameters()).device - def to(self, device): - self.pipeline.to(device) - return super().to(device) + #def to(self, device): + # self.pipeline.to(device) + # return super().to(device) def initialize_prompts(self, prompts: List[str]): # Breaking into individual prompts feels memory efficient @@ -331,9 +332,9 @@ def new_forward( return out - def preprocess_feedback_images(images, vae) -> torch.tensor: + def preprocess_feedback_images(images, vae, device) -> torch.tensor: images_t = [self.image_to_tensor(img) for img in images] - images_t = torch.stack(images_t).to(self.device, dtype=self.dtype) + images_t = torch.stack(images_t).to(device, dtype=self.dtype) latents = ( vae.config.scaling_factor * vae.encode(iamges_t).latent_dist.sample() @@ -344,10 +345,10 @@ def preprocess_feedback_images(images, vae) -> torch.tensor: def __call__( self, - prompt: Optional[Union[str, List[str]]] = None, - negative_prompt: Optional[Union[str, List[str]]] = None, - liked: Optional[List[Image.Image]] = None, - disliked: Optional[List[Image.Image]] = None, + prompt: Optional[Union[str, List[str]]] = "", + negative_prompt: Optional[Union[str, List[str]]] = "", + 
liked: Optional[List[Image.Image]] = [], + disliked: Optional[List[Image.Image]] = [], random_seed: int = 42, n_images: int = 1, guidance_scale: float = 8.0, @@ -364,15 +365,16 @@ def __call__( Generate a trajectory of images with binary feedback. The feedback can be given as a list of liked and disliked images. """ - if random_seed is not None: torch.manual_seed(random_seed) + + device = self._execution_device - latent_noise = torch.randn(n_images, 4, 64, 64, device=self.device, dtype=self.dtype) + latent_noise = torch.randn(n_images, 4, 64, 64, device=device, dtype=self.dtype) - positive_letents = self.preprocess_feedback_images(liked,self.vae) if liked and len(liked)>0 else torch.tensor([], device=self.device, dtype=self.dtype) + positive_latents = self.preprocess_feedback_images(liked,self.vae,device) if liked and len(liked)>1 else torch.tensor([], device=device, dtype=self.dtype) - negative_letents = self.preprocess_feedback_images(disliked,self.vae) if disliked and len(disliked)>0 else torch.tensor([], device=self.device, dtype=self.dtype) + negative_latents = self.preprocess_feedback_images(disliked,self.vae,device) if disliked and len(disliked)>0 else torch.tensor([], device=device, dtype=self.dtype) if isinstance(prompt, str): prompt = [prompt] * n_images @@ -388,7 +390,7 @@ def __call__( batched_prompt_embd = torch.cat([cond_prompt_embs, uncond_prompt_embs], dim=0) - self.scheduler.set_timesteps(denoising_steps, device=self.device) + self.scheduler.set_timesteps(denoising_steps, device=device) timesteps = self.scheduler.timesteps latent_noise = latent_noise * self.scheduler.init_noise_sigma @@ -408,7 +410,7 @@ def __call__( z_single = self.scheduler.scale_model_input(latent_noise, t) z_all = torch.cat([z_single] * 2, dim=0) - z_ref = torch.cat([pos_latents, neg_latents], dim=0) + z_ref = torch.cat([positive_latents, negative_latents], dim=0) if i >= ref_start_idx and i <= ref_end_idx: weight_factor = max_weight @@ -466,14 +468,14 @@ def __call__( noise_cond, noise_uncond = unet_out.chunk(2) guidance = noise_cond - noise_uncond noise_pred = noise_uncond + guidance_scale * guidance - z = self.scheduler.step(noise_pred, t, z).prev_sample + latent_noise = self.scheduler.step(noise_pred, t, latent_noise).prev_sample if i == len(timesteps) - 1 or ( (i + 1) > num_warmup_steps and (i + 1) % self.scheduler.order == 0 ): pbar.update() - y = self.pipeline.decode_latents(z) + y = self.pipeline.decode_latents(latent_noise) imgs = self.pipeline.numpy_to_pil(y) return FabricPipelineOutpur(imgs) @@ -491,3 +493,4 @@ def image_to_tensor(image: Union[str, Image.Image]): image = np.array(image).astype(np.uint8) image = (image / 127.5 - 1.0).astype(np.float32) return torch.from_numpy(image).permute(2, 0, 1) + From 834ab800ef08d5b4cebce4fc644bcdc0fafa8fc3 Mon Sep 17 00:00:00 2001 From: shauray8 Date: Fri, 11 Aug 2023 12:48:27 +0530 Subject: [PATCH 14/98] device support --- .../pipelines/fabric/pipeline_fabric.py | 17 ++++++++++++----- 1 file changed, 12 insertions(+), 5 deletions(-) diff --git a/src/diffusers/pipelines/fabric/pipeline_fabric.py b/src/diffusers/pipelines/fabric/pipeline_fabric.py index 83e97b5284ad..dab5bd9cf6b5 100644 --- a/src/diffusers/pipelines/fabric/pipeline_fabric.py +++ b/src/diffusers/pipelines/fabric/pipeline_fabric.py @@ -142,7 +142,7 @@ def __init__( stable_diffusion_version: str = "1.5", scheduler: EulerAncestralDiscreteScheduler = EulerAncestralDiscreteScheduler, lora_weights: Optional[str] = None, - torch_dtype=torch.float32 + torch_dtype = None, ): super().__init__() @@ 
-174,13 +174,20 @@ def __init__( pipeline=pipe, unet_path=lora_weights ) + if self._execution_device is not "cuda": + torch_dtype = torch.float32 + else: + torch_dtype = torch_dtype if torch_dtype else torch.float16 + self.pipeline = pipe self.unet = pipe.unet self.vae = pipe.vae self.text_encoder = pipe.text_encoder self.tokenizer = pipe.tokenizer self.scheduler = scheduler + self.dtype = torch_dtype + self.device = self._execution_device #@property #def device(self): @@ -190,7 +197,7 @@ def __init__( # self.pipeline.to(device) # return super().to(device) - def initialize_prompts(self, prompts: List[str]): + def initialize_prompts(self, prompts: List[str], device): # Breaking into individual prompts feels memory efficient prompt_embed_list = [] for prompt in prompts: @@ -202,13 +209,13 @@ def initialize_prompts(self, prompts: List[str]): truncation=True, ) - attention_mask = prompt_tokens.attention_mask.to(self.device) if ( + attention_mask = prompt_tokens.attention_mask.to(device) if ( hasattr(self.text_encoder.config, "use_attention_mask") and self.text_encoder.config.use_attention_mask ) else None prompt_embd = self.text_encoder( - input_ids=prompt_tokens.input_ids.to(self.device), + input_ids=prompt_tokens.input_ids.to(device), attention_mask=attention_mask, ).last_hidden_state @@ -386,7 +393,7 @@ def __call__( assert len(negative_prompt) == n_images - (cond_prompt_embs, uncond_prompt_embs, null_prompt_emb) = self.initialize_prompts(prompt + negative_prompt + [""]).split([n_images, n_images, 1]) + (cond_prompt_embs, uncond_prompt_embs, null_prompt_emb) = self.initialize_prompts(prompt + negative_prompt + [""], device).split([n_images, n_images, 1]) batched_prompt_embd = torch.cat([cond_prompt_embs, uncond_prompt_embs], dim=0) From 39ea4f9f973e529404193297f95e159d5a7da7ed Mon Sep 17 00:00:00 2001 From: shauray8 Date: Fri, 11 Aug 2023 12:54:07 +0530 Subject: [PATCH 15/98] maybe --- src/diffusers/pipelines/fabric/pipeline_fabric.py | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/src/diffusers/pipelines/fabric/pipeline_fabric.py b/src/diffusers/pipelines/fabric/pipeline_fabric.py index dab5bd9cf6b5..4bd8b00d6095 100644 --- a/src/diffusers/pipelines/fabric/pipeline_fabric.py +++ b/src/diffusers/pipelines/fabric/pipeline_fabric.py @@ -159,6 +159,11 @@ def __init__( f"Unknown stable diffusion version: {stable_diffusion_version}. 
Version must be either '1.5' or '2.1'" ) + if self._execution_device is not "cuda": + torch_dtype = torch.float32 + else: + torch_dtype = torch_dtype if torch_dtype else torch.float16 + scheduler = EulerAncestralDiscreteScheduler.from_pretrained(model_name, subfolder="scheduler") pipe = StableDiffusionPipeline.from_pretrained( @@ -174,10 +179,6 @@ def __init__( pipeline=pipe, unet_path=lora_weights ) - if self._execution_device is not "cuda": - torch_dtype = torch.float32 - else: - torch_dtype = torch_dtype if torch_dtype else torch.float16 self.pipeline = pipe self.unet = pipe.unet From fb59e24d5195575da9a1743d741a7313538394e4 Mon Sep 17 00:00:00 2001 From: shauray8 Date: Fri, 11 Aug 2023 13:19:48 +0530 Subject: [PATCH 16/98] device malfunctions --- src/diffusers/pipelines/fabric/pipeline_fabric.py | 1 - 1 file changed, 1 deletion(-) diff --git a/src/diffusers/pipelines/fabric/pipeline_fabric.py b/src/diffusers/pipelines/fabric/pipeline_fabric.py index 4bd8b00d6095..04115ba1b752 100644 --- a/src/diffusers/pipelines/fabric/pipeline_fabric.py +++ b/src/diffusers/pipelines/fabric/pipeline_fabric.py @@ -188,7 +188,6 @@ def __init__( self.scheduler = scheduler self.dtype = torch_dtype - self.device = self._execution_device #@property #def device(self): From 1ac8004e9d42dd68861db757d4eae63614e21061 Mon Sep 17 00:00:00 2001 From: shauray8 Date: Fri, 11 Aug 2023 18:02:27 +0530 Subject: [PATCH 17/98] new new --- .../pipelines/fabric/pipeline_fabric.py | 28 +++++++++++++++++++ 1 file changed, 28 insertions(+) diff --git a/src/diffusers/pipelines/fabric/pipeline_fabric.py b/src/diffusers/pipelines/fabric/pipeline_fabric.py index 04115ba1b752..837c6a270323 100644 --- a/src/diffusers/pipelines/fabric/pipeline_fabric.py +++ b/src/diffusers/pipelines/fabric/pipeline_fabric.py @@ -246,6 +246,34 @@ def new_forward(self, hidden_states, *args, **kwargs): return cached_hidden_states + def enable_model_cpu_offload(self, gpu_id=0): + r""" + Offloads all models to CPU using accelerate, reducing memory usage with a low impact on performance. Compared + to `enable_sequential_cpu_offload`, this method moves one whole model at a time to the GPU when its `forward` + method is called, and the model remains in GPU until the next model runs. Memory savings are lower than with + `enable_sequential_cpu_offload`, but performance is much better due to the iterative execution of the `unet`. + """ + if is_accelerate_available() and is_accelerate_version(">=", "0.17.0.dev0"): + from accelerate import cpu_offload_with_hook + else: + raise ImportError("`enable_model_cpu_offload` requires `accelerate v0.17.0` or higher.") + + device = torch.device(f"cuda:{gpu_id}") + + if self.device.type != "cpu": + self.to("cpu", silence_dtype_warnings=True) + torch.cuda.empty_cache() # otherwise we don't see the memory savings (but they probably exist) + + hook = None + for cpu_offloaded_model in [self.text_encoder, self.unet, self.vae]: + _, hook = cpu_offload_with_hook(cpu_offloaded_model, device, prev_module_hook=hook) + + if self.safety_checker is not None: + _, hook = cpu_offload_with_hook(self.safety_checker, device, prev_module_hook=hook) + + # We'll offload the last model manually. 
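For reference, a minimal usage sketch of the offload helper being added in this hunk, assuming an already-constructed pipeline object `pipe` of this class and the standard accelerate-backed offload behaviour (the patch itself does not show a call site):

    pipe.enable_model_cpu_offload()   # sub-models are moved to the GPU only for their forward pass
    result = pipe("a scenic mountain lake at sunset", n_images=1)
    # afterwards the text encoder, UNet and VAE are returned to CPU by the offload hooks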
+ self.final_offload_hook = hook + def unet_forward_with_cached_hidden_states( self, z_all, From 4b3f1a6ce0a31c45990ebe69b9d73b1eb423effa Mon Sep 17 00:00:00 2001 From: shauray8 Date: Fri, 11 Aug 2023 18:13:20 +0530 Subject: [PATCH 18/98] register --- .../pipelines/fabric/pipeline_fabric.py | 41 ++++--------------- 1 file changed, 8 insertions(+), 33 deletions(-) diff --git a/src/diffusers/pipelines/fabric/pipeline_fabric.py b/src/diffusers/pipelines/fabric/pipeline_fabric.py index 837c6a270323..bdd7e422f8fd 100644 --- a/src/diffusers/pipelines/fabric/pipeline_fabric.py +++ b/src/diffusers/pipelines/fabric/pipeline_fabric.py @@ -159,11 +159,6 @@ def __init__( f"Unknown stable diffusion version: {stable_diffusion_version}. Version must be either '1.5' or '2.1'" ) - if self._execution_device is not "cuda": - torch_dtype = torch.float32 - else: - torch_dtype = torch_dtype if torch_dtype else torch.float16 - scheduler = EulerAncestralDiscreteScheduler.from_pretrained(model_name, subfolder="scheduler") pipe = StableDiffusionPipeline.from_pretrained( @@ -179,6 +174,14 @@ def __init__( pipeline=pipe, unet_path=lora_weights ) + + self.register_modules( + vae=pipe.vae, + text_encoder=pipe.text_encoder, + tokenizer=pipe.tokenizer, + unet=pipe.unet, + scheduler=scheduler, + ) self.pipeline = pipe self.unet = pipe.unet @@ -246,34 +249,6 @@ def new_forward(self, hidden_states, *args, **kwargs): return cached_hidden_states - def enable_model_cpu_offload(self, gpu_id=0): - r""" - Offloads all models to CPU using accelerate, reducing memory usage with a low impact on performance. Compared - to `enable_sequential_cpu_offload`, this method moves one whole model at a time to the GPU when its `forward` - method is called, and the model remains in GPU until the next model runs. Memory savings are lower than with - `enable_sequential_cpu_offload`, but performance is much better due to the iterative execution of the `unet`. - """ - if is_accelerate_available() and is_accelerate_version(">=", "0.17.0.dev0"): - from accelerate import cpu_offload_with_hook - else: - raise ImportError("`enable_model_cpu_offload` requires `accelerate v0.17.0` or higher.") - - device = torch.device(f"cuda:{gpu_id}") - - if self.device.type != "cpu": - self.to("cpu", silence_dtype_warnings=True) - torch.cuda.empty_cache() # otherwise we don't see the memory savings (but they probably exist) - - hook = None - for cpu_offloaded_model in [self.text_encoder, self.unet, self.vae]: - _, hook = cpu_offload_with_hook(cpu_offloaded_model, device, prev_module_hook=hook) - - if self.safety_checker is not None: - _, hook = cpu_offload_with_hook(self.safety_checker, device, prev_module_hook=hook) - - # We'll offload the last model manually. 
- self.final_offload_hook = hook - def unet_forward_with_cached_hidden_states( self, z_all, From ad988f2b3f5bdaedff59ff2c9936387cf668f36e Mon Sep 17 00:00:00 2001 From: shauray8 Date: Fri, 11 Aug 2023 18:41:37 +0530 Subject: [PATCH 19/98] testing --- .../pipelines/fabric/pipeline_fabric.py | 45 +++++++++++++------ 1 file changed, 32 insertions(+), 13 deletions(-) diff --git a/src/diffusers/pipelines/fabric/pipeline_fabric.py b/src/diffusers/pipelines/fabric/pipeline_fabric.py index bdd7e422f8fd..737b77f1b388 100644 --- a/src/diffusers/pipelines/fabric/pipeline_fabric.py +++ b/src/diffusers/pipelines/fabric/pipeline_fabric.py @@ -21,6 +21,22 @@ import numpy as np from tqdm import tqdm +from transformers import CLIPImageProcessor, CLIPTextModel, CLIPTokenizer + +from ...configuration_utils import FrozenDict +from ...image_processor import VaeImageProcessor +from ...loaders import FromSingleFileMixin, LoraLoaderMixin, TextualInversionLoaderMixin +from ...models import AutoencoderKL, UNet2DConditionModel +from ...schedulers import KarrasDiffusionSchedulers +from ...utils import ( + deprecate, + is_accelerate_available, + is_accelerate_version, + logging, + randn_tensor, + replace_example_docstring, +) + from ...configuration_utils import ConfigMixin, register_to_config from ...utils import BaseOutput, logging from ...models.cross_attention import LoRACrossAttnProcessor @@ -138,7 +154,17 @@ def attn_with_weights( class FabricPipeline(DiffusionPipeline): def __init__( self, - model_name: Optional[str] = None, + #model_name: Optional[str] = None, + + vae: AutoencoderKL, + text_encoder: CLIPTextModel, + tokenizer: CLIPTokenizer, + unet: UNet2DConditionModel, + #scheduler: KarrasDiffusionSchedulers, + safety_checker: StableDiffusionSafetyChecker, + feature_extractor: CLIPImageProcessor, + requires_safety_checker: bool = True, + stable_diffusion_version: str = "1.5", scheduler: EulerAncestralDiscreteScheduler = EulerAncestralDiscreteScheduler, lora_weights: Optional[str] = None, @@ -174,20 +200,13 @@ def __init__( pipeline=pipe, unet_path=lora_weights ) - - self.register_modules( - vae=pipe.vae, - text_encoder=pipe.text_encoder, - tokenizer=pipe.tokenizer, - unet=pipe.unet, - scheduler=scheduler, - ) + self.register_modules(unet=unet, vae=vae, text_encoder=text_encoder, tokenizer=tokenizer, scheduler=scheduler) self.pipeline = pipe - self.unet = pipe.unet - self.vae = pipe.vae - self.text_encoder = pipe.text_encoder - self.tokenizer = pipe.tokenizer + self.unet = unet + self.vae = vae + self.text_encoder = text_encoder + self.tokenizer = tokenizer self.scheduler = scheduler self.dtype = torch_dtype From a1cf609e9f69c0e58f1277b83ce083e68a7854b3 Mon Sep 17 00:00:00 2001 From: shauray8 Date: Fri, 11 Aug 2023 19:00:39 +0530 Subject: [PATCH 20/98] exec does not work --- .../pipelines/fabric/pipeline_fabric.py | 29 ++++++------------- 1 file changed, 9 insertions(+), 20 deletions(-) diff --git a/src/diffusers/pipelines/fabric/pipeline_fabric.py b/src/diffusers/pipelines/fabric/pipeline_fabric.py index 737b77f1b388..65c4d76176c4 100644 --- a/src/diffusers/pipelines/fabric/pipeline_fabric.py +++ b/src/diffusers/pipelines/fabric/pipeline_fabric.py @@ -154,17 +154,7 @@ def attn_with_weights( class FabricPipeline(DiffusionPipeline): def __init__( self, - #model_name: Optional[str] = None, - - vae: AutoencoderKL, - text_encoder: CLIPTextModel, - tokenizer: CLIPTokenizer, - unet: UNet2DConditionModel, - #scheduler: KarrasDiffusionSchedulers, - safety_checker: StableDiffusionSafetyChecker, - feature_extractor: 
CLIPImageProcessor, - requires_safety_checker: bool = True, - + model_name: Optional[str] = None, stable_diffusion_version: str = "1.5", scheduler: EulerAncestralDiscreteScheduler = EulerAncestralDiscreteScheduler, lora_weights: Optional[str] = None, @@ -192,7 +182,7 @@ def __init__( scheduler=scheduler, torch_dtype=torch_dtype, safety_checker=None, - ) + ).to("cuda") if lora_weights: print(f"Applying LoRA weights from {lora_weights}") @@ -200,13 +190,11 @@ def __init__( pipeline=pipe, unet_path=lora_weights ) - self.register_modules(unet=unet, vae=vae, text_encoder=text_encoder, tokenizer=tokenizer, scheduler=scheduler) - self.pipeline = pipe - self.unet = unet - self.vae = vae - self.text_encoder = text_encoder - self.tokenizer = tokenizer + self.unet = pipe.unet + self.vae = pipe.vae + self.text_encoder = pipe.text_encoder + self.tokenizer = pipe.tokenizer self.scheduler = scheduler self.dtype = torch_dtype @@ -397,7 +385,7 @@ def __call__( if random_seed is not None: torch.manual_seed(random_seed) - device = self._execution_device + device = torch.device("cuda") latent_noise = torch.randn(n_images, 4, 64, 64, device=device, dtype=self.dtype) @@ -507,7 +495,7 @@ def __call__( y = self.pipeline.decode_latents(latent_noise) imgs = self.pipeline.numpy_to_pil(y) - return FabricPipelineOutpur(imgs) + return FabricPipelineOutput(imgs,False) @staticmethod def image_to_tensor(image: Union[str, Image.Image]): @@ -523,3 +511,4 @@ def image_to_tensor(image: Union[str, Image.Image]): image = (image / 127.5 - 1.0).astype(np.float32) return torch.from_numpy(image).permute(2, 0, 1) + From 27bb344b9538f6ecc854b95ad832c2c696a452b3 Mon Sep 17 00:00:00 2001 From: shauray8 Date: Fri, 11 Aug 2023 20:17:12 +0530 Subject: [PATCH 21/98] float --- .../pipelines/fabric/pipeline_fabric.py | 23 ++++++++++--------- 1 file changed, 12 insertions(+), 11 deletions(-) diff --git a/src/diffusers/pipelines/fabric/pipeline_fabric.py b/src/diffusers/pipelines/fabric/pipeline_fabric.py index 65c4d76176c4..c499485640a6 100644 --- a/src/diffusers/pipelines/fabric/pipeline_fabric.py +++ b/src/diffusers/pipelines/fabric/pipeline_fabric.py @@ -270,19 +270,19 @@ def unet_forward_with_cached_hidden_states( return self.unet(z_all, t, encoder_hidden_states=prompt_embd) local_pos_weights = torch.linspace( - *pos_weights, steps=len(self.unet.down_blocks) + 1)[:-1] + *pos_weights, steps=len(self.unet.down_blocks) + 1)[:-1].tolist() local_neg_weights = torch.linspace( - *neg_weights, steps=len(self.unet.down_blocks) + 1)[:-1] + *neg_weights, steps=len(self.unet.down_blocks) + 1)[:-1].tolist() - def new_forward_caching(module, hidden_states, cached_hiddens, weight, is_positive): + def new_forward_caching(module, hidden_states, cond_hiddens, cached_hiddens, d_model, weight, is_positive): cached_hs = cached_hiddens.pop(0).to( hidden_states.device ) cond_hs = torch.cat( - [hidden_states, cached_hs], dim=1 + [cond_hiddens, cached_hs], dim=1 ) - weights = weights.clone().repeat( - 1, 1 + cached_pos_hs.shape[1] // d_model + weights = weight.clone().repeat( + 1, 1 + cached_hs.shape[1] // d_model ) weights = torch.full((cond_hs.size(0), cond_hs.size(1) // hidden_states.size(1)), weight, device=hidden_states.device) @@ -324,13 +324,13 @@ def new_forward( if cached_pos_hiddens is not None: out_pos = new_forward_caching( - self, hidden_states, cached_pos_hiddens, + self, hidden_states, cond_hiddens, cached_pos_hiddens, d_model, pos_weight, is_positive=True) if cached_neg_hiddens is not None: out_neg = new_forward_caching( - self, hidden_states, 
cached_neg_hiddens, + self, hidden_states, uncond_hiddens, cached_neg_hiddens, d_model, neg_weight, is_positive=False) out = torch.cat([out_pos, out_neg], dim=0) @@ -349,12 +349,12 @@ def new_forward( return out - def preprocess_feedback_images(images, vae, device) -> torch.tensor: + def preprocess_feedback_images(self, images, vae, device) -> torch.tensor: images_t = [self.image_to_tensor(img) for img in images] images_t = torch.stack(images_t).to(device, dtype=self.dtype) latents = ( vae.config.scaling_factor - * vae.encode(iamges_t).latent_dist.sample() + * vae.encode(images_t).latent_dist.sample() ) return latents @@ -446,7 +446,7 @@ def __call__( else: z_ref_noised = self.scheduler.add_noise(z_ref, noise, t) - ref_prompt_embd = torch.cat([null_prompt_emb] * (len(posotive_latents) + len(negative_latents)), dim=0) + ref_prompt_embd = torch.cat([null_prompt_emb] * (len(positive_latents) + len(negative_latents)), dim=0) cached_hidden_states = self.get_unet_hidden_states( z_ref_noised, t, ref_prompt_embd @@ -512,3 +512,4 @@ def image_to_tensor(image: Union[str, Image.Image]): return torch.from_numpy(image).permute(2, 0, 1) + From b827bf347dfee38baf2d3ff592049e5d352075e1 Mon Sep 17 00:00:00 2001 From: shauray8 Date: Sat, 12 Aug 2023 01:22:25 +0530 Subject: [PATCH 22/98] change info --- .../pipelines/fabric/pipeline_fabric.py | 20 +++++++++---------- 1 file changed, 9 insertions(+), 11 deletions(-) diff --git a/src/diffusers/pipelines/fabric/pipeline_fabric.py b/src/diffusers/pipelines/fabric/pipeline_fabric.py index c499485640a6..c3fee2f60bca 100644 --- a/src/diffusers/pipelines/fabric/pipeline_fabric.py +++ b/src/diffusers/pipelines/fabric/pipeline_fabric.py @@ -274,22 +274,20 @@ def unet_forward_with_cached_hidden_states( local_neg_weights = torch.linspace( *neg_weights, steps=len(self.unet.down_blocks) + 1)[:-1].tolist() - def new_forward_caching(module, hidden_states, cond_hiddens, cached_hiddens, d_model, weight, is_positive): + def new_forward_caching(module, hidden_states, cond_hiddens, cached_hiddens, weight, weights): cached_hs = cached_hiddens.pop(0).to( hidden_states.device ) cond_hs = torch.cat( [cond_hiddens, cached_hs], dim=1 ) - weights = weight.clone().repeat( - 1, 1 + cached_hs.shape[1] // d_model + weights = weights.clone().repeat( + 1, 1 + cached_hs.shape[1] // hidden_states.size(1) ) - weights = torch.full((cond_hs.size(0), cond_hs.size(1) // hidden_states.size(1)), - weight, device=hidden_states.device) - weights[:, hidden_states.size(1):] = 1.0 + weights[:, hidden_states.size(1):] = weight out = attn_with_weights( self, - hidden_states, + cond_hs, encoder_hidden_states=cond_hs, weights=weights, ) @@ -324,14 +322,14 @@ def new_forward( if cached_pos_hiddens is not None: out_pos = new_forward_caching( - self, hidden_states, cond_hiddens, cached_pos_hiddens, d_model, - pos_weight, is_positive=True) + self, hidden_states, cond_hiddens, cached_pos_hiddens, pos_weight, + weights) if cached_neg_hiddens is not None: out_neg = new_forward_caching( - self, hidden_states, uncond_hiddens, cached_neg_hiddens, d_model, - neg_weight, is_positive=False) + self, hidden_states, uncond_hiddens, cached_neg_hiddens, + neg_weight, weights) out = torch.cat([out_pos, out_neg], dim=0) return out From 4aa7aaa123ec445393fd7c9834d2370568e0b9e9 Mon Sep 17 00:00:00 2001 From: shauray8 Date: Sat, 12 Aug 2023 21:15:55 +0530 Subject: [PATCH 23/98] change of architecture --- .../pipelines/fabric/pipeline_fabric.py | 77 ++++++++++--------- 1 file changed, 40 insertions(+), 37 deletions(-) diff --git 
a/src/diffusers/pipelines/fabric/pipeline_fabric.py b/src/diffusers/pipelines/fabric/pipeline_fabric.py index c3fee2f60bca..8a4f4dacf6b1 100644 --- a/src/diffusers/pipelines/fabric/pipeline_fabric.py +++ b/src/diffusers/pipelines/fabric/pipeline_fabric.py @@ -1,3 +1,5 @@ +### I'm fucking wrong you dont have to initialize and load stable diffusion ditch that +### do it with raw unet, vae and stuff ' # Copyright 2023 The HuggingFace Team. All rights reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); @@ -154,35 +156,36 @@ def attn_with_weights( class FabricPipeline(DiffusionPipeline): def __init__( self, - model_name: Optional[str] = None, - stable_diffusion_version: str = "1.5", - scheduler: EulerAncestralDiscreteScheduler = EulerAncestralDiscreteScheduler, + vae: AutoencoderKL, + text_encoder: CLIPTextModel, + tokenizer: CLIPTokenizer, + unet: UNet2DConditionModel, + scheduler: EulerAncestralDiscreteScheduler, lora_weights: Optional[str] = None, - torch_dtype = None, ): super().__init__() - if stable_diffusion_version == "2.1": - warnings.warn("StableDiffusion v2.x is not supported and may give unexpected results.") + #if stable_diffusion_version == "2.1": + # warnings.warn("StableDiffusion v2.x is not supported and may give unexpected results.") - if model_name is None: - if stable_diffusion_version == "1.5": - model_name = "runwayml/stable-diffusion-v1-5" - elif stable_diffusion_version == "2.1": - model_name = "stabilityai/stable-diffusion-2-1" - else: - raise ValueError( - f"Unknown stable diffusion version: {stable_diffusion_version}. Version must be either '1.5' or '2.1'" - ) + #if model_name is None: + # if stable_diffusion_version == "1.5": + # model_name = "runwayml/stable-diffusion-v1-5" + # elif stable_diffusion_version == "2.1": + # model_name = "stabilityai/stable-diffusion-2-1" + # else: + # raise ValueError( + # f"Unknown stable diffusion version: {stable_diffusion_version}. 
Version must be either '1.5' or '2.1'" + # ) - scheduler = EulerAncestralDiscreteScheduler.from_pretrained(model_name, subfolder="scheduler") + #scheduler = EulerAncestralDiscreteScheduler.from_pretrained(model_name, subfolder="scheduler") - pipe = StableDiffusionPipeline.from_pretrained( - model_name, - scheduler=scheduler, - torch_dtype=torch_dtype, - safety_checker=None, - ).to("cuda") + # pipe = StableDiffusionPipeline.from_pretrained( + # model_name, + # scheduler=scheduler, + # torch_dtype=torch_dtype, + # safety_checker=None, + # ).to("cuda") if lora_weights: print(f"Applying LoRA weights from {lora_weights}") @@ -190,14 +193,13 @@ def __init__( pipeline=pipe, unet_path=lora_weights ) - self.pipeline = pipe - self.unet = pipe.unet - self.vae = pipe.vae - self.text_encoder = pipe.text_encoder - self.tokenizer = pipe.tokenizer - self.scheduler = scheduler - - self.dtype = torch_dtype + self.register_modules( + unet = unet, + vae = vae, + text_encoder = text_encoder, + tokenizer = tokenizer, + scheduler = scheduler, + ) #@property #def device(self): @@ -224,6 +226,8 @@ def initialize_prompts(self, prompts: List[str], device): and self.text_encoder.config.use_attention_mask ) else None + print("Asdfsdf",attention_mask) + prompt_embd = self.text_encoder( input_ids=prompt_tokens.input_ids.to(device), attention_mask=attention_mask, @@ -287,7 +291,7 @@ def new_forward_caching(module, hidden_states, cond_hiddens, cached_hiddens, wei weights[:, hidden_states.size(1):] = weight out = attn_with_weights( self, - cond_hs, + cond_hiddens, encoder_hidden_states=cond_hs, weights=weights, ) @@ -325,7 +329,6 @@ def new_forward( self, hidden_states, cond_hiddens, cached_pos_hiddens, pos_weight, weights) - if cached_neg_hiddens is not None: out_neg = new_forward_caching( self, hidden_states, uncond_hiddens, cached_neg_hiddens, @@ -349,7 +352,7 @@ def new_forward( def preprocess_feedback_images(self, images, vae, device) -> torch.tensor: images_t = [self.image_to_tensor(img) for img in images] - images_t = torch.stack(images_t).to(device, dtype=self.dtype) + images_t = torch.stack(images_t).to(device) latents = ( vae.config.scaling_factor * vae.encode(images_t).latent_dist.sample() @@ -383,13 +386,13 @@ def __call__( if random_seed is not None: torch.manual_seed(random_seed) - device = torch.device("cuda") + device = self._execution_device - latent_noise = torch.randn(n_images, 4, 64, 64, device=device, dtype=self.dtype) + latent_noise = torch.randn(n_images, 4, 64, 64, device=device) - positive_latents = self.preprocess_feedback_images(liked,self.vae,device) if liked and len(liked)>1 else torch.tensor([], device=device, dtype=self.dtype) + positive_latents = self.preprocess_feedback_images(liked,self.vae,device) if liked and len(liked)>1 else torch.tensor([], device=device) - negative_latents = self.preprocess_feedback_images(disliked,self.vae,device) if disliked and len(disliked)>0 else torch.tensor([], device=device, dtype=self.dtype) + negative_latents = self.preprocess_feedback_images(disliked,self.vae,device) if disliked and len(disliked)>0 else torch.tensor([], device=device) if isinstance(prompt, str): prompt = [prompt] * n_images From 6e7ab4e7d78f0d815d24e982482995e490743eb7 Mon Sep 17 00:00:00 2001 From: shauray8 Date: Sat, 12 Aug 2023 21:22:18 +0530 Subject: [PATCH 24/98] might work --- .../pipelines/fabric/pipeline_fabric.py | 19 +++++++++++++++++-- 1 file changed, 17 insertions(+), 2 deletions(-) diff --git a/src/diffusers/pipelines/fabric/pipeline_fabric.py 
b/src/diffusers/pipelines/fabric/pipeline_fabric.py index 8a4f4dacf6b1..cc0c63d68c9d 100644 --- a/src/diffusers/pipelines/fabric/pipeline_fabric.py +++ b/src/diffusers/pipelines/fabric/pipeline_fabric.py @@ -200,6 +200,8 @@ def __init__( tokenizer = tokenizer, scheduler = scheduler, ) + self.vae_scale_factor = 2 ** (len(self.vae.config.block_out_channels) - 1) + self.image_processor = VaeImageProcessor(vae_scale_factor=self.vae_scale_factor) #@property #def device(self): @@ -359,6 +361,19 @@ def preprocess_feedback_images(self, images, vae, device) -> torch.tensor: ) return latents + def decode_latents(self, latents): + warnings.warn( + "The decode_latents method is deprecated and will be removed in a future version. Plea se" + " use VaeImageProcessor instead", + FutureWarning, + ) + latents = 1 / self.vae.config.scaling_factor * latents + image = self.vae.decode(latents, return_dict=False)[0] + image = (image / 2 + 0.5).clamp(0, 1) + # we always cast to float32 as this does not cause significant overhead and is compatible with bfloat16 + image = image.cpu().permute(0, 2, 3, 1).float().numpy() + return image + @torch.no_grad() def __call__( @@ -493,8 +508,8 @@ def __call__( ): pbar.update() - y = self.pipeline.decode_latents(latent_noise) - imgs = self.pipeline.numpy_to_pil(y) + y = self.decode_latents(latent_noise) + imgs = self.image_processor.numpy_to_pil(y) return FabricPipelineOutput(imgs,False) From f9635d714ced61add72776451c610b40e17d5371 Mon Sep 17 00:00:00 2001 From: shauray8 Date: Sat, 12 Aug 2023 23:13:35 +0530 Subject: [PATCH 25/98] testing with colab --- .../pipelines/fabric/pipeline_fabric.py | 144 +++++++++--------- 1 file changed, 76 insertions(+), 68 deletions(-) diff --git a/src/diffusers/pipelines/fabric/pipeline_fabric.py b/src/diffusers/pipelines/fabric/pipeline_fabric.py index cc0c63d68c9d..ecec00e64759 100644 --- a/src/diffusers/pipelines/fabric/pipeline_fabric.py +++ b/src/diffusers/pipelines/fabric/pipeline_fabric.py @@ -21,6 +21,7 @@ from torch.nn import functional as F from PIL import Image import numpy as np +import warnings from tqdm import tqdm from transformers import CLIPImageProcessor, CLIPTextModel, CLIPTokenizer @@ -85,72 +86,77 @@ def apply_unet_lora_weights(pipeline, unet_path): unet.load_state_dict(model_weight, strict=False) -def attn_with_weights( - attn: nn.Module, - hidden_states, - encoder_hidden_states=None, - attention_mask=None, - weights=None, # shape: (batch_size, sequence_length) - lora_scale=1.0, -): - batch_size, sequence_length, _ = ( - hidden_states.shape - if encoder_hidden_states is None - else encoder_hidden_states.shape - ) - attention_mask = attn.prepare_attention_mask( - attention_mask, sequence_length, batch_size - ) +class CrossAttnStoreProcessor() + def __init__(self): + self.attntion_probs = None - if isinstance(attn.processor, LoRACrossAttnProcessor): - query = attn.to_q(hidden_states) + lora_scale * attn.processor.to_q_lora( - hidden_states + def __call__( + self, + attn, + hidden_states, + encoder_hidden_states=None, + attention_mask=None, + weights=None, # shape: (batch_size, sequence_length) + lora_scale=1.0, + ): + batch_size, sequence_length, _ = ( + hidden_states.shape + if encoder_hidden_states is None + else encoder_hidden_states.shape + ) + attention_mask = attn.prepare_attention_mask( + attention_mask, sequence_length, batch_size ) - else: - query = attn.to_q(hidden_states) - if encoder_hidden_states is None: - encoder_hidden_states = hidden_states - elif attn.norm_cross: - encoder_hidden_states = 
attn.norm_encoder_hidden_states(encoder_hidden_states) + if isinstance(attn.processor, LoRACrossAttnProcessor): + query = attn.to_q(hidden_states) + lora_scale * attn.processor.to_q_lora( + hidden_states + ) + else: + query = attn.to_q(hidden_states) - if isinstance(attn.processor, LoRACrossAttnProcessor): - key = attn.to_k(encoder_hidden_states) + lora_scale * attn.processor.to_k_lora( - encoder_hidden_states - ) - value = attn.to_v( - encoder_hidden_states - ) + lora_scale * attn.processor.to_v_lora(encoder_hidden_states) - else: - key = attn.to_k(encoder_hidden_states) - value = attn.to_v(encoder_hidden_states) + if encoder_hidden_states is None: + encoder_hidden_states = hidden_states + elif attn.norm_cross: + encoder_hidden_states = attn.norm_encoder_hidden_states(encoder_hidden_states) - query = attn.head_to_batch_dim(query) - key = attn.head_to_batch_dim(key) - value = attn.head_to_batch_dim(value) + if isinstance(attn.processor, LoRACrossAttnProcessor): + key = attn.to_k(encoder_hidden_states) + lora_scale * attn.processor.to_k_lora( + encoder_hidden_states + ) + value = attn.to_v( + encoder_hidden_states + ) + lora_scale * attn.processor.to_v_lora(encoder_hidden_states) + else: + key = attn.to_k(encoder_hidden_states) + value = attn.to_v(encoder_hidden_states) - attention_probs = attn.get_attention_scores(query, key, attention_mask) + query = attn.head_to_batch_dim(query) + key = attn.head_to_batch_dim(key) + value = attn.head_to_batch_dim(value) - if weights is not None: - if weights.shape[0] != 1: - weights = weights.repeat_interleave(attn.heads, dim=0) - attention_probs = attention_probs * weights[:, None] - attention_probs = attention_probs / attention_probs.sum(dim=-1, keepdim=True) + attention_probs = attn.get_attention_scores(query, key, attention_mask) - hidden_states = torch.bmm(attention_probs, value) - hidden_states = attn.batch_to_head_dim(hidden_states) + if weights is not None: + if weights.shape[0] != 1: + weights = weights.repeat_interleave(attn.heads, dim=0) + attention_probs = attention_probs * weights[:, None] + attention_probs = attention_probs / attention_probs.sum(dim=-1, keepdim=True) - # linear proj - if isinstance(attn.processor, LoRACrossAttnProcessor): - hidden_states = attn.to_out[0]( - hidden_states - ) + lora_scale * attn.processor.to_out_lora(hidden_states) - else: - hidden_states = attn.to_out[0](hidden_states) - # dropout - hidden_states = attn.to_out[1](hidden_states) + hidden_states = torch.bmm(attention_probs, value) + hidden_states = attn.batch_to_head_dim(hidden_states) + + # linear proj + if isinstance(attn.processor, LoRACrossAttnProcessor): + hidden_states = attn.to_out[0]( + hidden_states + ) + lora_scale * attn.processor.to_out_lora(hidden_states) + else: + hidden_states = attn.to_out[0](hidden_states) + # dropout + hidden_states = attn.to_out[1](hidden_states) - return hidden_states + return hidden_states class FabricPipeline(DiffusionPipeline): @@ -228,7 +234,6 @@ def initialize_prompts(self, prompts: List[str], device): and self.text_encoder.config.use_attention_mask ) else None - print("Asdfsdf",attention_mask) prompt_embd = self.text_encoder( input_ids=prompt_tokens.input_ids.to(device), @@ -291,6 +296,7 @@ def new_forward_caching(module, hidden_states, cond_hiddens, cached_hiddens, wei 1, 1 + cached_hs.shape[1] // hidden_states.size(1) ) weights[:, hidden_states.size(1):] = weight + attn_with_weights = CrossAttnStoreProcessor() out = attn_with_weights( self, cond_hiddens, @@ -352,8 +358,8 @@ def new_forward( return out - def 
preprocess_feedback_images(self, images, vae, device) -> torch.tensor: - images_t = [self.image_to_tensor(img) for img in images] + def preprocess_feedback_images(self, images, vae, device, dtype) -> torch.tensor: + images_t = [self.image_to_tensor(img,dtype) for img in images] images_t = torch.stack(images_t).to(device) latents = ( vae.config.scaling_factor @@ -362,7 +368,7 @@ def preprocess_feedback_images(self, images, vae, device) -> torch.tensor: return latents def decode_latents(self, latents): - warnings.warn( + warnings.warn( "The decode_latents method is deprecated and will be removed in a future version. Plea se" " use VaeImageProcessor instead", FutureWarning, @@ -402,12 +408,13 @@ def __call__( torch.manual_seed(random_seed) device = self._execution_device + dtype = self.text_encoder.dtype - latent_noise = torch.randn(n_images, 4, 64, 64, device=device) - - positive_latents = self.preprocess_feedback_images(liked,self.vae,device) if liked and len(liked)>1 else torch.tensor([], device=device) + latent_noise = torch.randn(n_images, 4, 64, 64, device=device, dtype=dtype) + + positive_latents = self.preprocess_feedback_images(liked,self.vae,device, dtype) if liked and len(liked)>0 else torch.tensor([], device=device, ) - negative_latents = self.preprocess_feedback_images(disliked,self.vae,device) if disliked and len(disliked)>0 else torch.tensor([], device=device) + negative_latents = self.preprocess_feedback_images(disliked,self.vae,device, dtype) if disliked and len(disliked)>0 else torch.tensor([], device=device) if isinstance(prompt, str): prompt = [prompt] * n_images @@ -458,12 +465,12 @@ def __call__( if isinstance(self.scheduler, EulerAncestralDiscreteScheduler): z_ref_noised = ( alpha_hat**0.5 * z_ref + (1 - alpha_hat) ** 0.5 * noise - ) + ).type(dtype) + print("here") else: z_ref_noised = self.scheduler.add_noise(z_ref, noise, t) ref_prompt_embd = torch.cat([null_prompt_emb] * (len(positive_latents) + len(negative_latents)), dim=0) - cached_hidden_states = self.get_unet_hidden_states( z_ref_noised, t, ref_prompt_embd ) @@ -514,7 +521,7 @@ def __call__( return FabricPipelineOutput(imgs,False) @staticmethod - def image_to_tensor(image: Union[str, Image.Image]): + def image_to_tensor(image: Union[str, Image.Image], dtype): """ Convert a PIL image to a torch tensor. 
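A quick standalone check of the pixel normalisation that `image_to_tensor` applies below (illustrative only, not part of the pipeline code):

    import numpy as np
    x = np.array([0.0, 127.5, 255.0], dtype=np.float32)
    print(x / 127.5 - 1.0)   # [-1.  0.  1.]  -> the [-1, 1] range the VAE encoder expects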
""" @@ -525,7 +532,8 @@ def image_to_tensor(image: Union[str, Image.Image]): image = image.resize((512, 512)) image = np.array(image).astype(np.uint8) image = (image / 127.5 - 1.0).astype(np.float32) - return torch.from_numpy(image).permute(2, 0, 1) + image = torch.from_numpy(image).permute(2, 0, 1) + return image.type(dtype) From cda85ed66c1928b61779897a19c73081028816a9 Mon Sep 17 00:00:00 2001 From: shauray8 Date: Sun, 13 Aug 2023 00:53:41 +0530 Subject: [PATCH 26/98] more attn atuff --- src/diffusers/pipelines/fabric/pipeline_fabric.py | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/src/diffusers/pipelines/fabric/pipeline_fabric.py b/src/diffusers/pipelines/fabric/pipeline_fabric.py index ecec00e64759..e0e9773c9cb2 100644 --- a/src/diffusers/pipelines/fabric/pipeline_fabric.py +++ b/src/diffusers/pipelines/fabric/pipeline_fabric.py @@ -86,7 +86,7 @@ def apply_unet_lora_weights(pipeline, unet_path): unet.load_state_dict(model_weight, strict=False) -class CrossAttnStoreProcessor() +class CrossAttnStoreProcessor(): def __init__(self): self.attntion_probs = None @@ -158,7 +158,6 @@ def __call__( return hidden_states - class FabricPipeline(DiffusionPipeline): def __init__( self, @@ -286,6 +285,7 @@ def unet_forward_with_cached_hidden_states( *neg_weights, steps=len(self.unet.down_blocks) + 1)[:-1].tolist() def new_forward_caching(module, hidden_states, cond_hiddens, cached_hiddens, weight, weights): + cached_hs = cached_hiddens.pop(0).to( hidden_states.device ) @@ -296,12 +296,12 @@ def new_forward_caching(module, hidden_states, cond_hiddens, cached_hiddens, wei 1, 1 + cached_hs.shape[1] // hidden_states.size(1) ) weights[:, hidden_states.size(1):] = weight + print(self) attn_with_weights = CrossAttnStoreProcessor() - out = attn_with_weights( - self, + self.unet.set_attn_processor(attn_with_weights) + out = self.unet( cond_hiddens, encoder_hidden_states=cond_hs, - weights=weights, ) return out @@ -537,3 +537,4 @@ def image_to_tensor(image: Union[str, Image.Image], dtype): + From 9eb64591990e0d8b22086dbfc4e440c4a6450046 Mon Sep 17 00:00:00 2001 From: shauray8 Date: Tue, 15 Aug 2023 23:01:00 +0530 Subject: [PATCH 27/98] stupid additions --- .../pipelines/fabric/pipeline_fabric.py | 88 +++++++++++++------ src/test.py | 17 ++++ 2 files changed, 78 insertions(+), 27 deletions(-) create mode 100644 src/test.py diff --git a/src/diffusers/pipelines/fabric/pipeline_fabric.py b/src/diffusers/pipelines/fabric/pipeline_fabric.py index e0e9773c9cb2..47f3efede0c1 100644 --- a/src/diffusers/pipelines/fabric/pipeline_fabric.py +++ b/src/diffusers/pipelines/fabric/pipeline_fabric.py @@ -86,7 +86,7 @@ def apply_unet_lora_weights(pipeline, unet_path): unet.load_state_dict(model_weight, strict=False) -class CrossAttnStoreProcessor(): +class CrossAttnProcessor(): def __init__(self): self.attntion_probs = None @@ -284,26 +284,6 @@ def unet_forward_with_cached_hidden_states( local_neg_weights = torch.linspace( *neg_weights, steps=len(self.unet.down_blocks) + 1)[:-1].tolist() - def new_forward_caching(module, hidden_states, cond_hiddens, cached_hiddens, weight, weights): - - cached_hs = cached_hiddens.pop(0).to( - hidden_states.device - ) - cond_hs = torch.cat( - [cond_hiddens, cached_hs], dim=1 - ) - weights = weights.clone().repeat( - 1, 1 + cached_hs.shape[1] // hidden_states.size(1) - ) - weights[:, hidden_states.size(1):] = weight - print(self) - attn_with_weights = CrossAttnStoreProcessor() - self.unet.set_attn_processor(attn_with_weights) - out = self.unet( - cond_hiddens, - 
encoder_hidden_states=cond_hs, - ) - return out for block, pos_weight, neg_weight in zip( @@ -332,15 +312,69 @@ def new_forward( out_pos = self.old_forward(hidden_states) out_neg = self.old_forward(hidden_states) + def new_forward_caching(self, hidden_states, cond_hiddens, cached_hiddens, weight, weights): + + cached_hs = cached_hiddens.pop(0).to( + hidden_states.device + ) + cond_hs = torch.cat( + [cond_hiddens, cached_hs], dim=1 + ) + weights = weights.clone().repeat( + 1, 1 + cached_hs.shape[1] // hidden_states.size(1) + ) + weights[:, hidden_states.size(1):] = weight + print(self) + attn_with_weights = CrossAttnStoreProcessor() + out = self.attn_with_weights( + self, + cond_hiddens, + encoder_hidden_states=cond_hs, + weights=weights + ) + return out + if cached_pos_hiddens is not None: - out_pos = new_forward_caching( - self, hidden_states, cond_hiddens, cached_pos_hiddens, pos_weight, - weights) + cached_pos_hs = cached_pos_hiddens.pop(0).to( + hidden_states.device + ) + cond_pos_hs = torch.cat( + [cond_hiddens, cached_pos_hs], dim=1 + ) + pos_weights = weights.clone().repeat( + 1, 1 + cached_pos_hs.shape[1] // d_model + ) + pos_weights[:, d_model:] = pos_weight + attn_with_weights = CrossAttnProcessor() + out_pos = attn_with_weights( + self, + cond_hiddens, + encoder_hidden_states=cond_pos_hs, + weights=pos_weights, + ) + else: + out_pos = self.old_forward(cond_hiddens) if cached_neg_hiddens is not None: - out_neg = new_forward_caching( - self, hidden_states, uncond_hiddens, cached_neg_hiddens, - neg_weight, weights) + cached_neg_hs = cached_neg_hiddens.pop(0).to( + hidden_states.device + ) + uncond_neg_hs = torch.cat( + [uncond_hiddens, cached_neg_hs], dim=1 + ) + neg_weights = weights.clone().repeat( + 1, 1 + cached_neg_hs.shape[1] // d_model + ) + neg_weights[:, d_model:] = neg_weight + attn_with_weights = CrossAttnProcessor() + out_neg = attn_with_weights( + self, + uncond_hiddens, + encoder_hidden_states=uncond_neg_hs, + weights=neg_weights, + ) + else: + out_neg = self.old_forward(uncond_hiddens) out = torch.cat([out_pos, out_neg], dim=0) return out diff --git a/src/test.py b/src/test.py new file mode 100644 index 000000000000..67943eaec8e4 --- /dev/null +++ b/src/test.py @@ -0,0 +1,17 @@ +from diffusers import FabricPipeline +import torch + + +model_id = "dreamlike-art/dreamlike-photoreal-2.0" +pipe = FabricPipeline.from_pretrained(model_id,torch_dtype=torch.float32) +#pipe = pipe.to("cuda") +prompt = "photo, naked women fingering in her ass, no cloths, big boobs" +neg_prompt = "lowres, bad anatomy, bad hands, cropped, worst quality" +liked = ["../../transformers/src/test.jpg"] +disliked = ["../../transformers/src/test.jpg"] +image = pipe(prompt, negative_prompt = neg_prompt, liked=liked, disliked=disliked) +for i, im in enumerate(image.images): + im.save(f"{time.time()}_{i}.jpg") + + + From ef34c1af5f0c3bf2503e5f64369dda53272b4dbc Mon Sep 17 00:00:00 2001 From: shauray8 Date: Tue, 15 Aug 2023 23:51:57 +0530 Subject: [PATCH 28/98] documenting and testing --- .../pipelines/fabric/pipeline_fabric.py | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/src/diffusers/pipelines/fabric/pipeline_fabric.py b/src/diffusers/pipelines/fabric/pipeline_fabric.py index 47f3efede0c1..9095c72616fc 100644 --- a/src/diffusers/pipelines/fabric/pipeline_fabric.py +++ b/src/diffusers/pipelines/fabric/pipeline_fabric.py @@ -419,17 +419,17 @@ def decode_latents(self, latents): def __call__( self, prompt: Optional[Union[str, List[str]]] = "", - negative_prompt: 
Optional[Union[str, List[str]]] = "", + negative_prompt: Optional[Union[str, List[str]]] = "lowres, bad anatomy, bad hands, cropped, worst quality", liked: Optional[List[Image.Image]] = [], disliked: Optional[List[Image.Image]] = [], - random_seed: int = 42, - n_images: int = 1, - guidance_scale: float = 8.0, + random_seed: int = 37, + n_images: int = 4, + guidance_scale: float = 7.0, denoising_steps: int = 20, feedback_start_ratio: float = 0.33, feedback_end_ratio: float = 0.66, - min_weight: float = 0.1, - max_weight: float = 1.0, + min_weight: float = 0.05, + max_weight: float = .8, neg_scale: float = 0.5, pos_bottleneck_scale: float = 1.0, neg_bottleneck_scale: float = 1.0, @@ -446,9 +446,9 @@ def __call__( latent_noise = torch.randn(n_images, 4, 64, 64, device=device, dtype=dtype) - positive_latents = self.preprocess_feedback_images(liked,self.vae,device, dtype) if liked and len(liked)>0 else torch.tensor([], device=device, ) + positive_latents = self.preprocess_feedback_images(liked,self.vae,device, dtype) if liked and len(liked)>0 else torch.tensor([], device=device, dtype=dtype) - negative_latents = self.preprocess_feedback_images(disliked,self.vae,device, dtype) if disliked and len(disliked)>0 else torch.tensor([], device=device) + negative_latents = self.preprocess_feedback_images(disliked,self.vae,device, dtype) if disliked and len(disliked)>0 else torch.tensor([], device=device, dtype=dtype) if isinstance(prompt, str): prompt = [prompt] * n_images From f0efafa431f1178c5790e30f64ffaa5d88e906b7 Mon Sep 17 00:00:00 2001 From: shauray8 Date: Wed, 16 Aug 2023 00:11:52 +0530 Subject: [PATCH 29/98] writing tests --- tests/pipelines/fabric/__init__.py | 0 tests/pipelines/fabric/test_fabric.py | 147 ++++++++++++++++++++++++++ 2 files changed, 147 insertions(+) create mode 100644 tests/pipelines/fabric/__init__.py create mode 100644 tests/pipelines/fabric/test_fabric.py diff --git a/tests/pipelines/fabric/__init__.py b/tests/pipelines/fabric/__init__.py new file mode 100644 index 000000000000..e69de29bb2d1 diff --git a/tests/pipelines/fabric/test_fabric.py b/tests/pipelines/fabric/test_fabric.py new file mode 100644 index 000000000000..06d9f53b6b6b --- /dev/null +++ b/tests/pipelines/fabric/test_fabric.py @@ -0,0 +1,147 @@ +# coding=utf-8 +# Copyright 2023 HuggingFace Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
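For orientation, a minimal end-to-end sketch of the API these tests exercise. It assumes `FabricPipeline` is exported from `diffusers` as this PR intends, keeps the `__call__` signature introduced above, and uses an example checkpoint name; treat it as a hedged illustration rather than a verified snippet:

    import torch
    from diffusers import FabricPipeline

    pipe = FabricPipeline.from_pretrained(
        "dreamlike-art/dreamlike-photoreal-2.0", torch_dtype=torch.float16
    ).to("cuda")

    prompt = "a photograph of an astronaut riding a horse"
    # First round: no feedback yet
    first = pipe(prompt, n_images=2, denoising_steps=20, random_seed=0)
    # Second round: steer generation with liked/disliked images from the first round
    second = pipe(
        prompt,
        liked=[first.images[0]],     # features to encourage
        disliked=[first.images[1]],  # features to discourage
        n_images=2,
        denoising_steps=20,
        random_seed=0,
    )

Roughly speaking, during the middle portion of the denoising schedule (between `feedback_start_ratio` and `feedback_end_ratio`) the cached hidden states of the reference images are mixed into the attention with `max_weight`, which is what pushes new samples toward liked and away from disliked examples.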
+ +import gc +import unittest + +import numpy as np +import torch + +from diffusers import AutoencoderKL, DDIMScheduler, DiTPipeline, DPMSolverMultistepScheduler, Transformer2DModel +from diffusers.utils import is_xformers_available, load_numpy, slow, torch_device +from diffusers.utils.testing_utils import enable_full_determinism, require_torch_gpu + +from ..pipeline_params import ( + CLASS_CONDITIONED_IMAGE_GENERATION_BATCH_PARAMS, + CLASS_CONDITIONED_IMAGE_GENERATION_PARAMS, +) +from ..test_pipelines_common import PipelineTesterMixin + + +enable_full_determinism() + + +class DiTPipelineFastTests(PipelineTesterMixin, unittest.TestCase): + pipeline_class = DiTPipeline + params = CLASS_CONDITIONED_IMAGE_GENERATION_PARAMS + required_optional_params = PipelineTesterMixin.required_optional_params - { + "latents", + "num_images_per_prompt", + "callback", + "callback_steps", + } + batch_params = CLASS_CONDITIONED_IMAGE_GENERATION_BATCH_PARAMS + + def get_dummy_components(self): + torch.manual_seed(0) + transformer = Transformer2DModel( + sample_size=16, + num_layers=2, + patch_size=4, + attention_head_dim=8, + num_attention_heads=2, + in_channels=4, + out_channels=8, + attention_bias=True, + activation_fn="gelu-approximate", + num_embeds_ada_norm=1000, + norm_type="ada_norm_zero", + norm_elementwise_affine=False, + ) + vae = AutoencoderKL() + scheduler = DDIMScheduler() + components = {"transformer": transformer.eval(), "vae": vae.eval(), "scheduler": scheduler} + return components + + def get_dummy_inputs(self, device, seed=0): + if str(device).startswith("mps"): + generator = torch.manual_seed(seed) + else: + generator = torch.Generator(device=device).manual_seed(seed) + inputs = { + "class_labels": [1], + "generator": generator, + "num_inference_steps": 2, + "output_type": "numpy", + } + return inputs + + def test_inference(self): + device = "cpu" + + components = self.get_dummy_components() + pipe = self.pipeline_class(**components) + pipe.to(device) + pipe.set_progress_bar_config(disable=None) + + inputs = self.get_dummy_inputs(device) + image = pipe(**inputs).images + image_slice = image[0, -3:, -3:, -1] + + self.assertEqual(image.shape, (1, 16, 16, 3)) + expected_slice = np.array([0.2946, 0.6601, 0.4329, 0.3296, 0.4144, 0.5319, 0.7273, 0.5013, 0.4457]) + max_diff = np.abs(image_slice.flatten() - expected_slice).max() + self.assertLessEqual(max_diff, 1e-3) + + def test_inference_batch_single_identical(self): + self._test_inference_batch_single_identical(relax_max_difference=True, expected_max_diff=1e-3) + + @unittest.skipIf( + torch_device != "cuda" or not is_xformers_available(), + reason="XFormers attention is only available with CUDA and `xformers` installed", + ) + def test_xformers_attention_forwardGenerator_pass(self): + self._test_xformers_attention_forwardGenerator_pass(expected_max_diff=1e-3) + + +@require_torch_gpu +@slow +class FABRICPipelineIntegrationTests(unittest.TestCase): + def tearDown(self): + super().tearDown() + gc.collect() + torch.cuda.empty_cache() + + def test_fabric(self): + generator = torch.manual_seed(0) + + pipe = DiTPipeline.from_pretrained("dreamlike-art/dreamlike-photoreal-2.0") + pipe.to("cuda") + + prompt = "white wolf holding an umbrella" + + images = pipe(prompt, random_seed=generator).images + + for word, image in zip(prompt, images): + expected_image = load_numpy( + f"https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main/dit/{word}.npy" + ) + assert np.abs((expected_image - image).max()) < 1e-2 + + def 
test_fabric_feedback(self): + generator = torch.manual_seed(0) + + pipe = DiTPipeline.from_pretrained("dreamlike-art/dreamlike-photoreal-2.0") + pipe.to("cuda") + + prompt = "white wolf holding an umbrella" + + images = pipe(prompt, random_seed=generator).images + + for word, image in zip(prompt, images): + expected_image = load_numpy( + f"https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main/dit/{word}.npy" + ) + assert np.abs((expected_image - image).max()) < 1e-2 From 88a5a9ac27e34ade4f74a070b0952609fabe83fa Mon Sep 17 00:00:00 2001 From: shauray8 Date: Wed, 16 Aug 2023 01:49:44 +0530 Subject: [PATCH 30/98] more docs --- .../pipelines/fabric/pipeline_fabric.py | 160 +++++++++++++----- 1 file changed, 117 insertions(+), 43 deletions(-) diff --git a/src/diffusers/pipelines/fabric/pipeline_fabric.py b/src/diffusers/pipelines/fabric/pipeline_fabric.py index 9095c72616fc..7ff9213c55cb 100644 --- a/src/diffusers/pipelines/fabric/pipeline_fabric.py +++ b/src/diffusers/pipelines/fabric/pipeline_fabric.py @@ -26,11 +26,9 @@ from transformers import CLIPImageProcessor, CLIPTextModel, CLIPTokenizer -from ...configuration_utils import FrozenDict from ...image_processor import VaeImageProcessor from ...loaders import FromSingleFileMixin, LoraLoaderMixin, TextualInversionLoaderMixin from ...models import AutoencoderKL, UNet2DConditionModel -from ...schedulers import KarrasDiffusionSchedulers from ...utils import ( deprecate, is_accelerate_available, @@ -38,17 +36,16 @@ logging, randn_tensor, replace_example_docstring, + BaseOutput, ) from ...configuration_utils import ConfigMixin, register_to_config -from ...utils import BaseOutput, logging from ...models.cross_attention import LoRACrossAttnProcessor from ...models.attention import BasicTransformerBlock -from ..stable_diffusion import StableDiffusionPipeline from ...schedulers import EulerAncestralDiscreteScheduler from . import FabricPipelineOutput -from ..pipeline_utils import DiffusionPipeline, ImagePipelineOutput +from ..pipeline_utils import DiffusionPipeline logger = logging.get_logger(__name__) # pylint: disable=invalid-name @@ -159,6 +156,31 @@ def __call__( return hidden_states class FabricPipeline(DiffusionPipeline): + r""" + Pipeline for text-to-image generation using Stable Diffusion and conditioning the results + using feedback images. + + This model inherits from [`DiffusionPipeline`]. Check the superclass documentation for the generic methods the + library implements for all the pipelines (such as downloading or saving, running on a particular device, etc.) + + Args: + vae ([`AutoencoderKL`]): + Variational Auto-Encoder (VAE) Model to encode and decode images to and from latent representations. + text_encoder ([`CLIPTextModel`]): + Frozen text-encoder. Stable Diffusion uses the text portion of + [CLIP](https://huggingface.co/docs/transformers/model_doc/clip#transformers.CLIPTextModel), specifically + the [clip-vit-large-patch14](https://huggingface.co/openai/clip-vit-large-patch14) variant. + tokenizer (`CLIPTokenizer`): + Tokenizer of class + [CLIPTokenizer](https://huggingface.co/docs/transformers/v4.21.0/en/model_doc/clip#transformers.CLIPTokenizer). + unet ([`UNet2DConditionModel`]): Conditional U-Net architecture to denoise the encoded image latents. + scheduler ([`EulerAncestralDiscreteScheduler`]): + A scheduler to be used in combination with `unet` to denoise the encoded image latents. Can be one of + [`DDIMScheduler`], [`LMSDiscreteScheduler`], or [`PNDMScheduler`]. 
+ safety_checker ([`StableDiffusionSafetyChecker`]): + Classification module that estimates whether generated images could be considered offensive or harmful. + Please, refer to the [model card](https://huggingface.co/runwayml/stable-diffusion-v1-5) for details. + """ def __init__( self, vae: AutoencoderKL, @@ -166,38 +188,58 @@ def __init__( tokenizer: CLIPTokenizer, unet: UNet2DConditionModel, scheduler: EulerAncestralDiscreteScheduler, - lora_weights: Optional[str] = None, + safety_checker: StableDiffusionSafetyChecker, + requires_safety_checker:bool = True, ): super().__init__() + if safety_checker is None and requires_safety_checker: + logger.warning( + f"You have disabled the safety checker for {self.__class__} by passing `safety_checker=None`. Ensure" + " that you abide to the conditions of the Stable Diffusion license and do not expose unfiltered" + " results in services or applications open to the public. Both the diffusers team and Hugging Face" + " strongly recommend to keep the safety filter enabled in all public facing circumstances, disabling" + " it only for use-cases that involve analyzing network behavior or auditing its results. For more" + " information, please have a look at https://github.com/huggingface/diffusers/pull/254 ." + ) + + if safety_checker is not None and feature_extractor is None: + raise ValueError( + "Make sure to define a feature extractor when loading {self.__class__} if you want + to use the safety" + " checker. If you do not want to use the safety checker, you can pass `'safety_che + cker=None'` instead." + ) - #if stable_diffusion_version == "2.1": - # warnings.warn("StableDiffusion v2.x is not supported and may give unexpected results.") - - #if model_name is None: - # if stable_diffusion_version == "1.5": - # model_name = "runwayml/stable-diffusion-v1-5" - # elif stable_diffusion_version == "2.1": - # model_name = "stabilityai/stable-diffusion-2-1" - # else: - # raise ValueError( - # f"Unknown stable diffusion version: {stable_diffusion_version}. Version must be either '1.5' or '2.1'" - # ) - - #scheduler = EulerAncestralDiscreteScheduler.from_pretrained(model_name, subfolder="scheduler") - - # pipe = StableDiffusionPipeline.from_pretrained( - # model_name, - # scheduler=scheduler, - # torch_dtype=torch_dtype, - # safety_checker=None, - # ).to("cuda") - - if lora_weights: - print(f"Applying LoRA weights from {lora_weights}") - apply_unet_lora_weights( - pipeline=pipe, unet_path=lora_weights + is_unet_version_less_0_9_0 = hasattr(unet.config, "_diffusers_version") and version.parse( + version.parse(unet.config._diffusers_version).base_version + ) < version.parse("0.9.0.dev0") + is_unet_sample_size_less_64 = hasattr(unet.config, "sample_size") and unet.config.sample_size < 64 + if is_unet_version_less_0_9_0 and is_unet_sample_size_less_64: + deprecation_message = ( + "The configuration file of the unet has set the default `sample_size` to smaller t + han" + " 64 which seems highly unlikely. If your checkpoint is a fine-tuned version of an + y of the" + " following: \n- CompVis/stable-diffusion-v1-4 \n- CompVis/stable-diffusion-v1-3 \ + n-" + " CompVis/stable-diffusion-v1-2 \n- CompVis/stable-diffusion-v1-1 \n- runwayml/sta + ble-diffusion-v1-5" + " \n- runwayml/stable-diffusion-inpainting \n you should change 'sample_size' to 6 + 4 in the" + " configuration file. Please make sure to update the config accordingly as leaving + `sample_size=32`" + " in the config might lead to incorrect results in future versions. 
If you have do + wnloaded this" + " checkpoint from the Hugging Face Hub, it would be very nice if you could open a + Pull request for" + " the `unet/config.json` file" ) + deprecate("sample_size<64", "1.0.0", deprecation_message, standard_warn=False) + new_config = dict(unet.config) + new_config["sample_size"] = 64 + unet._internal_dict = FrozenDict(new_config) + self.register_modules( unet = unet, vae = vae, @@ -208,13 +250,6 @@ def __init__( self.vae_scale_factor = 2 ** (len(self.vae.config.block_out_channels) - 1) self.image_processor = VaeImageProcessor(vae_scale_factor=self.vae_scale_factor) - #@property - #def device(self): - # return next(self.parameters()).device - - #def to(self, device): - # self.pipeline.to(device) - # return super().to(device) def initialize_prompts(self, prompts: List[str], device): # Breaking into individual prompts feels memory efficient @@ -420,8 +455,8 @@ def __call__( self, prompt: Optional[Union[str, List[str]]] = "", negative_prompt: Optional[Union[str, List[str]]] = "lowres, bad anatomy, bad hands, cropped, worst quality", - liked: Optional[List[Image.Image]] = [], - disliked: Optional[List[Image.Image]] = [], + liked: Optional[Union[List[str], List[Image.Image]]] = [], + disliked: Optional[Union[List[str], List[Image.Image]]] = [], random_seed: int = 37, n_images: int = 4, guidance_scale: float = 7.0, @@ -434,11 +469,50 @@ def __call__( pos_bottleneck_scale: float = 1.0, neg_bottleneck_scale: float = 1.0, ): + r""" + Function invoked when calling the pipeline for generation. + + Args: + prompt (`str` or `List[str]`, *optional*): + The prompt or prompts to guide the image generation. If not defined, one has to pass `prompt_embeds`. + instead. + negative_prompt (`str` or `List[str]`, *optional*): + The prompt or prompts not to guide the image generation. If not defined, one has to pass + `negative_prompt_embeds` instead. Ignored when not using guidance (i.e., ignored if `guidance_scale` is + less than `1`). + liked (`List[Image.Image]` or `List[str]`, *optional*): + Liked enables feedback through images, encourages images with liked features. + disliked (`List[Image.Image]` or `List[str]`, *optional*): + Disliked enables feedback through images, discourages images with disliked features. + random_seed (`torch.Generator` or `List[torch.Generator]` or `int`, *optional*): + One or a list of [torch generator(s)](https://pytorch.org/docs/stable/generated/torch.Generator.html), can be int. + to make generation deterministic. + n_images (`int`, *optional*, defaults to 1): + The number of images to generate per prompt. + guidance_scale (`float`, *optional*, defaults to 7.5): + Guidance scale as defined in [Classifier-Free Diffusion Guidance](https://arxiv.org/abs/2207.12598). + `guidance_scale` is defined as `w` of equation 2. of [Imagen + Paper](https://arxiv.org/pdf/2205.11487.pdf). Guidance scale is enabled by setting `guidance_scale > + 1`. Higher guidance scale encourages to generate images that are closely linked to the text `prompt`, + usually at the expense of lower image quality. + denoising_steps (`int`, *optional*, defaults to 50): + The number of denoising steps. More denoising steps usually lead to a higher quality image at the + expense of slower inference. + + Examples: + + Returns: + [`~pipelines.stable_diffusion.StableDiffusionPipelineOutput`] or `tuple`: + [`~pipelines.stable_diffusion.StableDiffusionPipelineOutput`] if `return_dict` is True, otherwise a `tuple. 
+ When returning a tuple, the first element is a list with the generated images, and the second element is a + list of `bool`s denoting whether the corresponding generated image likely represents "not-safe-for-work" + (nsfw) content, according to the `safety_checker`. + """ """ Generate a trajectory of images with binary feedback. The feedback can be given as a list of liked and disliked images. """ - if random_seed is not None: + if random_seed is not None and random_seed is not torch.Generator: torch.manual_seed(random_seed) device = self._execution_device @@ -557,7 +631,7 @@ def __call__( @staticmethod def image_to_tensor(image: Union[str, Image.Image], dtype): """ - Convert a PIL image to a torch tensor. + Convert latent PIL image to a torch tensor for further processing. """ if isinstance(image, str): image = Image.open(image) From 9fa434f0627ed942a603a441c3a63d93ff41c431 Mon Sep 17 00:00:00 2001 From: shauray8 Date: Wed, 16 Aug 2023 01:50:56 +0530 Subject: [PATCH 31/98] tests and docs --- .../pipelines/fabric/pipeline_fabric.py | 33 ---- tests/pipelines/fabric/test_fabric.py | 163 ++++++++++++------ 2 files changed, 111 insertions(+), 85 deletions(-) diff --git a/src/diffusers/pipelines/fabric/pipeline_fabric.py b/src/diffusers/pipelines/fabric/pipeline_fabric.py index 7ff9213c55cb..afc11c70d194 100644 --- a/src/diffusers/pipelines/fabric/pipeline_fabric.py +++ b/src/diffusers/pipelines/fabric/pipeline_fabric.py @@ -50,39 +50,6 @@ logger = logging.get_logger(__name__) # pylint: disable=invalid-name -def apply_unet_lora_weights(pipeline, unet_path): - model_weight = torch.load(unet_path, map_location="cpu") - unet = pipeline.unet - lora_attn_procs = {} - lora_rank = list( - set([v.size(0) for k, v in model_weight.items() if k.endswith("down.weight")]) - ) - assert len(lora_rank) == 1 - lora_rank = lora_rank[0] - for name in unet.attn_processors.keys(): - cross_attention_dim = ( - None - if name.endswith("attn1.processor") - else unet.config.cross_attention_dim - ) - if name.startswith("mid_block"): - hidden_size = unet.config.block_out_channels[-1] - elif name.startswith("up_blocks"): - block_id = int(name[len("up_blocks.")]) - hidden_size = list(reversed(unet.config.block_out_channels))[block_id] - elif name.startswith("down_blocks"): - block_id = int(name[len("down_blocks.")]) - hidden_size = unet.config.block_out_channels[block_id] - - lora_attn_procs[name] = LoRACrossAttnProcessor( - hidden_size=hidden_size, - cross_attention_dim=cross_attention_dim, - rank=lora_rank, - ).to(pipeline.device) - unet.set_attn_processor(lora_attn_procs) - unet.load_state_dict(model_weight, strict=False) - - class CrossAttnProcessor(): def __init__(self): self.attntion_probs = None diff --git a/tests/pipelines/fabric/test_fabric.py b/tests/pipelines/fabric/test_fabric.py index 06d9f53b6b6b..0783fa5b890d 100644 --- a/tests/pipelines/fabric/test_fabric.py +++ b/tests/pipelines/fabric/test_fabric.py @@ -33,36 +33,61 @@ enable_full_determinism() -class DiTPipelineFastTests(PipelineTesterMixin, unittest.TestCase): - pipeline_class = DiTPipeline - params = CLASS_CONDITIONED_IMAGE_GENERATION_PARAMS - required_optional_params = PipelineTesterMixin.required_optional_params - { - "latents", - "num_images_per_prompt", - "callback", - "callback_steps", - } - batch_params = CLASS_CONDITIONED_IMAGE_GENERATION_BATCH_PARAMS +class FabricPipelineFastTests( + PipelineLatentTesterMixin, PipelineKarrasSchedulerTesterMixin, PipelineTesterMixin, unittest.TestCase +): + pipeline_class = FabricDiffusionPipeline + params = 
TEXT_TO_IMAGE_PARAMS + batch_params = TEXT_TO_IMAGE_BATCH_PARAMS + image_params = TEXT_TO_IMAGE_IMAGE_PARAMS + image_latents_params = TEXT_TO_IMAGE_IMAGE_PARAMS def get_dummy_components(self): torch.manual_seed(0) - transformer = Transformer2DModel( - sample_size=16, - num_layers=2, - patch_size=4, - attention_head_dim=8, - num_attention_heads=2, + unet = UNet2DConditionModel( + block_out_channels=(32, 64), + layers_per_block=2, + sample_size=32, in_channels=4, - out_channels=8, - attention_bias=True, - activation_fn="gelu-approximate", - num_embeds_ada_norm=1000, - norm_type="ada_norm_zero", - norm_elementwise_affine=False, + out_channels=4, + down_block_types=("DownBlock2D", "CrossAttnDownBlock2D"), + up_block_types=("CrossAttnUpBlock2D", "UpBlock2D"), + cross_attention_dim=32, ) - vae = AutoencoderKL() - scheduler = DDIMScheduler() - components = {"transformer": transformer.eval(), "vae": vae.eval(), "scheduler": scheduler} + scheduler = EulerAncestralDiscreteScheduler() + torch.manual_seed(0) + vae = AutoencoderKL( + block_out_channels=[32, 64], + in_channels=3, + out_channels=3, + down_block_types=["DownEncoderBlock2D", "DownEncoderBlock2D"], + up_block_types=["UpDecoderBlock2D", "UpDecoderBlock2D"], + latent_channels=4, + ) + torch.manual_seed(0) + text_encoder_config = CLIPTextConfig( + bos_token_id=0, + eos_token_id=2, + hidden_size=32, + intermediate_size=37, + layer_norm_eps=1e-05, + num_attention_heads=4, + num_hidden_layers=5, + pad_token_id=1, + vocab_size=1000, + ) + text_encoder = CLIPTextModel(text_encoder_config) + tokenizer = CLIPTokenizer.from_pretrained("hf-internal-testing/tiny-random-clip") + + components = { + "unet": unet, + "scheduler": scheduler, + "vae": vae, + "text_encoder": text_encoder, + "tokenizer": tokenizer, + "safety_checker": None, + "feature_extractor": None, + } return components def get_dummy_inputs(self, device, seed=0): @@ -71,39 +96,71 @@ def get_dummy_inputs(self, device, seed=0): else: generator = torch.Generator(device=device).manual_seed(seed) inputs = { - "class_labels": [1], - "generator": generator, - "num_inference_steps": 2, - "output_type": "numpy", + "prompt": "A painting of a squirrel eating a burger", + "random_ssed": generator, + "num_images": 1, } return inputs - def test_inference(self): - device = "cpu" + def test_stable_diffusion_ddim(self): + device = "cpu" # ensure determinism for the device-dependent torch.Generator components = self.get_dummy_components() - pipe = self.pipeline_class(**components) - pipe.to(device) - pipe.set_progress_bar_config(disable=None) + sd_pipe = FabricPipeline(**components) + sd_pipe = sd_pipe.to(torch_device) + sd_pipe.set_progress_bar_config(disable=None) inputs = self.get_dummy_inputs(device) - image = pipe(**inputs).images + output = sd_pipe(**inputs) + image = output.images + image_slice = image[0, -3:, -3:, -1] - self.assertEqual(image.shape, (1, 16, 16, 3)) - expected_slice = np.array([0.2946, 0.6601, 0.4329, 0.3296, 0.4144, 0.5319, 0.7273, 0.5013, 0.4457]) - max_diff = np.abs(image_slice.flatten() - expected_slice).max() - self.assertLessEqual(max_diff, 1e-3) + assert image.shape == (1, 64, 64, 3) + expected_slice = np.array([0.5756, 0.6118, 0.5005, 0.5041, 0.5471, 0.4726, 0.4976, 0.4865, 0.4864]) - def test_inference_batch_single_identical(self): - self._test_inference_batch_single_identical(relax_max_difference=True, expected_max_diff=1e-3) + assert np.abs(image_slice.flatten() - expected_slice).max() < 1e-2 - @unittest.skipIf( - torch_device != "cuda" or not is_xformers_available(), - 
reason="XFormers attention is only available with CUDA and `xformers` installed", - ) - def test_xformers_attention_forwardGenerator_pass(self): - self._test_xformers_attention_forwardGenerator_pass(expected_max_diff=1e-3) + + def test_stable_diffusion_negative_prompt_embeds(self): + components = self.get_dummy_components() + sd_pipe = FabricPipeline(**components) + sd_pipe = sd_pipe.to(torch_device) + sd_pipe = sd_pipe.to(torch_device) + sd_pipe.set_progress_bar_config(disable=None) + + inputs = self.get_dummy_inputs(torch_device) + negative_prompt = 3 * ["this is a negative prompt"] + inputs["negative_prompt"] = negative_prompt + inputs["prompt"] = 3 * [inputs["prompt"]] + + # forward + output = sd_pipe(**inputs) + image_slice_1 = output.images[0, -3:, -3:, -1] + + inputs = self.get_dummy_inputs(torch_device) + prompt = 3 * [inputs.pop("prompt")] + + embeds = [] + for p in [prompt, negative_prompt]: + text_inputs = sd_pipe.tokenizer( + p, + padding="max_length", + max_length=sd_pipe.tokenizer.model_max_length, + truncation=True, + return_tensors="pt", + ) + text_inputs = text_inputs["input_ids"].to(torch_device) + + embeds.append(sd_pipe.text_encoder(text_inputs)[0]) + + inputs["prompt_embeds"], inputs["negative_prompt_embeds"] = embeds + + # forward + output = sd_pipe(**inputs) + image_slice_2 = output.images[0, -3:, -3:, -1] + + assert np.abs(image_slice_1.flatten() - image_slice_2.flatten()).max() < 1e-4 @require_torch_gpu @@ -120,13 +177,13 @@ def test_fabric(self): pipe = DiTPipeline.from_pretrained("dreamlike-art/dreamlike-photoreal-2.0") pipe.to("cuda") - prompt = "white wolf holding an umbrella" + prompt = "a photograph of an astronaut riding a horse" images = pipe(prompt, random_seed=generator).images for word, image in zip(prompt, images): expected_image = load_numpy( - f"https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main/dit/{word}.npy" + f"https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main/dit/fabric_wo_feedback.npy" ) assert np.abs((expected_image - image).max()) < 1e-2 @@ -136,12 +193,14 @@ def test_fabric_feedback(self): pipe = DiTPipeline.from_pretrained("dreamlike-art/dreamlike-photoreal-2.0") pipe.to("cuda") - prompt = "white wolf holding an umbrella" - + prompt = "a photograph of an astronaut riding a horse" images = pipe(prompt, random_seed=generator).images + liked = [images] + images = pipe(prompt, random_seed=generator, liked=liked).images + for word, image in zip(prompt, images): expected_image = load_numpy( - f"https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main/dit/{word}.npy" + f"https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main/dit/fabric_w_feedback.npy" ) assert np.abs((expected_image - image).max()) < 1e-2 From e3fc7cabc16331a771ae306406db861088c2225d Mon Sep 17 00:00:00 2001 From: shauray8 Date: Wed, 16 Aug 2023 01:51:14 +0530 Subject: [PATCH 32/98] remove test --- src/test.py | 17 ----------------- 1 file changed, 17 deletions(-) delete mode 100644 src/test.py diff --git a/src/test.py b/src/test.py deleted file mode 100644 index 67943eaec8e4..000000000000 --- a/src/test.py +++ /dev/null @@ -1,17 +0,0 @@ -from diffusers import FabricPipeline -import torch - - -model_id = "dreamlike-art/dreamlike-photoreal-2.0" -pipe = FabricPipeline.from_pretrained(model_id,torch_dtype=torch.float32) -#pipe = pipe.to("cuda") -prompt = "photo, naked women fingering in her ass, no cloths, big boobs" -neg_prompt = "lowres, bad anatomy, bad hands, cropped, 
worst quality" -liked = ["../../transformers/src/test.jpg"] -disliked = ["../../transformers/src/test.jpg"] -image = pipe(prompt, negative_prompt = neg_prompt, liked=liked, disliked=disliked) -for i, im in enumerate(image.images): - im.save(f"{time.time()}_{i}.jpg") - - - From d78fb6b77b2d43762b66de43e2b1a848829394c9 Mon Sep 17 00:00:00 2001 From: shauray8 Date: Sat, 22 Jul 2023 00:09:14 +0530 Subject: [PATCH 33/98] empty PR --- src/diffusers/pipelines/fabric/pipeline_fabric.py | 0 1 file changed, 0 insertions(+), 0 deletions(-) create mode 100644 src/diffusers/pipelines/fabric/pipeline_fabric.py diff --git a/src/diffusers/pipelines/fabric/pipeline_fabric.py b/src/diffusers/pipelines/fabric/pipeline_fabric.py new file mode 100644 index 000000000000..e69de29bb2d1 From 2fc96429f2f4ed94a899d06f2404f8feca6cb274 Mon Sep 17 00:00:00 2001 From: shauray8 Date: Sun, 30 Jul 2023 11:27:09 +0530 Subject: [PATCH 34/98] init --- src/diffusers/pipelines/fabric/__init__.py | 23 +++++++++++++++++++ .../pipelines/fabric/pipeline_fabric.py | 15 ++++++++++++ 2 files changed, 38 insertions(+) create mode 100644 src/diffusers/pipelines/fabric/__init__.py diff --git a/src/diffusers/pipelines/fabric/__init__.py b/src/diffusers/pipelines/fabric/__init__.py new file mode 100644 index 000000000000..d5f7eb6b4fcc --- /dev/null +++ b/src/diffusers/pipelines/fabric/__init__.py @@ -0,0 +1,23 @@ +from ...utils import ( + OptionalDependencyNotAvailable, + is_flax_available, + is_torch_available, + is_transformers_available, +) + + +try: + if not (is_transformers_available() and is_torch_available()): + raise OptionalDependencyNotAvailable() +except OptionalDependencyNotAvailable: + from ...utils.dummy_torch_and_transformers_objects import * # noqa F403 +else: + from .multicontrolnet import MultiControlNetModel + from .pipeline_controlnet import StableDiffusionControlNetPipeline + from .pipeline_controlnet_img2img import StableDiffusionControlNetImg2ImgPipeline + from .pipeline_controlnet_inpaint import StableDiffusionControlNetInpaintPipeline + from .pipeline_controlnet_sd_xl import StableDiffusionXLControlNetPipeline + + +if is_transformers_available() and is_flax_available(): + from .pipeline_flax_controlnet import FlaxStableDiffusionControlNetPipeline diff --git a/src/diffusers/pipelines/fabric/pipeline_fabric.py b/src/diffusers/pipelines/fabric/pipeline_fabric.py index e69de29bb2d1..0bae7300e5f3 100644 --- a/src/diffusers/pipelines/fabric/pipeline_fabric.py +++ b/src/diffusers/pipelines/fabric/pipeline_fabric.py @@ -0,0 +1,15 @@ +import warnings +from typing import List, Optional, Union + +import torch +import torch.nn as nn +import torch.nn.functional as F +import numpy as np +from PIL import Image +from tqdm import tqdm +from diffusers import ( + StableDiffusionPipeline, + EulerAncestralDiscreteScheduler, +) +from diffusers.models.attention import BasicTransformerBlock +from diffusers.models.cross_attention import LoRACrossAttnProcessor From fb525bf70d14959cdc07057f16d275f2ed21bf16 Mon Sep 17 00:00:00 2001 From: shauray8 Date: Tue, 1 Aug 2023 16:48:59 +0530 Subject: [PATCH 35/98] changes --- src/diffusers/__init__.py | 1 + src/diffusers/pipelines/__init__.py | 2 ++ src/diffusers/pipelines/fabric/__init__.py | 12 +----------- 3 files changed, 4 insertions(+), 11 deletions(-) diff --git a/src/diffusers/__init__.py b/src/diffusers/__init__.py index 9080fed4b81b..51cd863e898a 100644 --- a/src/diffusers/__init__.py +++ b/src/diffusers/__init__.py @@ -135,6 +135,7 @@ AltDiffusionPipeline, AudioLDMPipeline, 
CycleDiffusionPipeline, + FabricPipeline, IFImg2ImgPipeline, IFImg2ImgSuperResolutionPipeline, IFInpaintingPipeline, diff --git a/src/diffusers/pipelines/__init__.py b/src/diffusers/pipelines/__init__.py index 2e8cee9ce697..9618d6a4ca8a 100644 --- a/src/diffusers/pipelines/__init__.py +++ b/src/diffusers/pipelines/__init__.py @@ -52,6 +52,8 @@ StableDiffusionControlNetPipeline, StableDiffusionXLControlNetPipeline, ) + from .fabric import FabricPipeline + from .deepfloyd_if import ( IFImg2ImgPipeline, IFImg2ImgSuperResolutionPipeline, diff --git a/src/diffusers/pipelines/fabric/__init__.py b/src/diffusers/pipelines/fabric/__init__.py index d5f7eb6b4fcc..e681ff9f1aeb 100644 --- a/src/diffusers/pipelines/fabric/__init__.py +++ b/src/diffusers/pipelines/fabric/__init__.py @@ -1,23 +1,13 @@ from ...utils import ( OptionalDependencyNotAvailable, - is_flax_available, is_torch_available, - is_transformers_available, ) - try: if not (is_transformers_available() and is_torch_available()): raise OptionalDependencyNotAvailable() except OptionalDependencyNotAvailable: from ...utils.dummy_torch_and_transformers_objects import * # noqa F403 else: - from .multicontrolnet import MultiControlNetModel - from .pipeline_controlnet import StableDiffusionControlNetPipeline - from .pipeline_controlnet_img2img import StableDiffusionControlNetImg2ImgPipeline - from .pipeline_controlnet_inpaint import StableDiffusionControlNetInpaintPipeline - from .pipeline_controlnet_sd_xl import StableDiffusionXLControlNetPipeline - + from .pipeline_fabric import FabricPipeline -if is_transformers_available() and is_flax_available(): - from .pipeline_flax_controlnet import FlaxStableDiffusionControlNetPipeline From 2e6b75bc5c5024bf5168d4e31c673d02e5d59243 Mon Sep 17 00:00:00 2001 From: shauray8 Date: Wed, 2 Aug 2023 23:56:07 +0530 Subject: [PATCH 36/98] starting with the pipeline --- docs/source/en/_toctree.yml | 2 + .../api/pipelines/stable_diffusion/fabric.mdx | 50 +++++++++++++ .../pipelines/fabric/pipeline_fabric.py | 73 ++++++++++++++++--- 3 files changed, 113 insertions(+), 12 deletions(-) create mode 100644 docs/source/en/api/pipelines/stable_diffusion/fabric.mdx diff --git a/docs/source/en/_toctree.yml b/docs/source/en/_toctree.yml index ebde6cf97acd..9916af279bbe 100644 --- a/docs/source/en/_toctree.yml +++ b/docs/source/en/_toctree.yml @@ -210,6 +210,8 @@ title: DiffEdit - local: api/pipelines/dit title: DiT + - local: api/pipelines/fabric + title: FABRIC - local: api/pipelines/pix2pix title: InstructPix2Pix - local: api/pipelines/kandinsky diff --git a/docs/source/en/api/pipelines/stable_diffusion/fabric.mdx b/docs/source/en/api/pipelines/stable_diffusion/fabric.mdx new file mode 100644 index 000000000000..dc4996614ae0 --- /dev/null +++ b/docs/source/en/api/pipelines/stable_diffusion/fabric.mdx @@ -0,0 +1,50 @@ +## changes required + + +# Text-to-Image Generation + +## StableDiffusionPipeline + +The Stable Diffusion model was created by the researchers and engineers from [CompVis](https://github.com/CompVis), [Stability AI](https://stability.ai/), [runway](https://github.com/runwayml), and [LAION](https://laion.ai/). The [`StableDiffusionPipeline`] is capable of generating photo-realistic images given any text input using Stable Diffusion. 
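+
+A minimal usage sketch for the `FabricPipeline` introduced by this PR (the rest of this page is still a placeholder copied from the Stable Diffusion docs, as flagged by "changes required" above). The argument names shown here (`liked`, `disliked`, `random_seed`, `n_images`) follow the current draft of `pipeline_fabric.py` and may still change:
+
+```python
+import torch
+from diffusers import FabricPipeline
+
+pipe = FabricPipeline.from_pretrained(
+    "dreamlike-art/dreamlike-photoreal-2.0", torch_dtype=torch.float32
+)
+pipe = pipe.to("cuda")
+
+prompt = "a photograph of an astronaut riding a horse"
+
+# First round: generate candidates without any feedback.
+images = pipe(prompt, random_seed=37, n_images=4).images
+
+# Second round: pass earlier generations back as binary feedback
+# to steer the next batch toward liked and away from disliked features.
+liked = [images[0]]
+disliked = [images[1]]
+images = pipe(prompt, liked=liked, disliked=disliked, random_seed=37, n_images=4).images
+images[0].save("fabric.png")
+```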
+ +The original codebase can be found here: +- *Stable Diffusion V1*: [CompVis/stable-diffusion](https://github.com/CompVis/stable-diffusion) +- *Stable Diffusion v2*: [Stability-AI/stablediffusion](https://github.com/Stability-AI/stablediffusion) + +Available Checkpoints are: +- *stable-diffusion-v1-4 (512x512 resolution)* [CompVis/stable-diffusion-v1-4](https://huggingface.co/CompVis/stable-diffusion-v1-4) +- *stable-diffusion-v1-5 (512x512 resolution)* [runwayml/stable-diffusion-v1-5](https://huggingface.co/runwayml/stable-diffusion-v1-5) +- *stable-diffusion-2-base (512x512 resolution)*: [stabilityai/stable-diffusion-2-base](https://huggingface.co/stabilityai/stable-diffusion-2-base) +- *stable-diffusion-2 (768x768 resolution)*: [stabilityai/stable-diffusion-2](https://huggingface.co/stabilityai/stable-diffusion-2) +- *stable-diffusion-2-1-base (512x512 resolution)* [stabilityai/stable-diffusion-2-1-base](https://huggingface.co/stabilityai/stable-diffusion-2-1-base) +- *stable-diffusion-2-1 (768x768 resolution)*: [stabilityai/stable-diffusion-2-1](https://huggingface.co/stabilityai/stable-diffusion-2-1) + +[[autodoc]] StableDiffusionPipeline + - all + - __call__ + - enable_attention_slicing + - disable_attention_slicing + - enable_vae_slicing + - disable_vae_slicing + - enable_xformers_memory_efficient_attention + - disable_xformers_memory_efficient_attention + - enable_vae_tiling + - disable_vae_tiling + - load_textual_inversion + - from_single_file + - load_lora_weights + - save_lora_weights + +[[autodoc]] FlaxStableDiffusionPipeline + - all + - __call__ diff --git a/src/diffusers/pipelines/fabric/pipeline_fabric.py b/src/diffusers/pipelines/fabric/pipeline_fabric.py index 0bae7300e5f3..324c94bfcf12 100644 --- a/src/diffusers/pipelines/fabric/pipeline_fabric.py +++ b/src/diffusers/pipelines/fabric/pipeline_fabric.py @@ -1,15 +1,64 @@ -import warnings -from typing import List, Optional, Union +# Copyright 2023 The HuggingFace Team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
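+"""Draft of the FABRIC pipeline (feedback-conditioned Stable Diffusion).
+
+Rough summary of the mechanism built up over this patch series (class and argument
+names such as `FabricPipeline`, `liked`, and `disliked` refer to the current draft
+and may still change): feedback images are encoded to VAE latents and noised to the
+current timestep, the U-Net self-attention hidden states computed on them are cached,
+and during denoising the conditional branch additionally attends to the states cached
+from liked images while the unconditional branch attends to those cached from disliked
+images, steering generations toward liked and away from disliked features.
+"""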
+from dataclasses import dataclass +from typing import Any, Dict, List, Optional, Tuple, Union import torch -import torch.nn as nn -import torch.nn.functional as F -import numpy as np -from PIL import Image -from tqdm import tqdm -from diffusers import ( - StableDiffusionPipeline, - EulerAncestralDiscreteScheduler, +from torch import nn +from torch.nn import functional as F + +from ..configuration_utils import ConfigMixin, register_to_config +from ..utils import BaseOutput, logging +from .cross_attention import AttnProcessor +from .embeddings import TimestepEmbedding, Timesteps +from .modeling_utils import ModelMixin +from .unet_2d_blocks import ( + CrossAttnDownBlock2D, + DownBlock2D, + UNetMidBlock2DCrossAttn, + get_down_block, ) -from diffusers.models.attention import BasicTransformerBlock -from diffusers.models.cross_attention import LoRACrossAttnProcessor + + +logger = logging.get_logger(__name__) # pylint: disable=invalid-name + + +class AttentionBasedGenerator(nn.Module): + def __init__(self): + super().__init__() + pass + + def generate( + self, + prompt: Union[str, List[str]] = "a photo of an astronaut riding a horse on mars", + negative_prompt: Union[str, List[str]] = "", + liked: List[Image.Image] = [], + disliked: List[Image.Image] = [], + seed: int = 42, + n_images: int = 1, + guidance_scale: float = 8.0, + denoising_steps: int = 20, + feedback_start: float = 0.33, + feedback_end: float = 0.66, + min_weight: float = 0.1, + max_weight: float = 1.0, + neg_scale: float = 0.5, + pos_bottleneck_scale: float = 1.0, + neg_bottleneck_scale: float = 1.0, + ) + pass + + + + From caa4bfe238fc3c91a14612b17951fccbc22baa2f Mon Sep 17 00:00:00 2001 From: shauray8 Date: Thu, 3 Aug 2023 10:32:34 +0530 Subject: [PATCH 37/98] stable diff --- .../pipelines/fabric/pipeline_fabric.py | 69 +++++++++++++++++-- 1 file changed, 63 insertions(+), 6 deletions(-) diff --git a/src/diffusers/pipelines/fabric/pipeline_fabric.py b/src/diffusers/pipelines/fabric/pipeline_fabric.py index 324c94bfcf12..d29e352decdf 100644 --- a/src/diffusers/pipelines/fabric/pipeline_fabric.py +++ b/src/diffusers/pipelines/fabric/pipeline_fabric.py @@ -20,7 +20,10 @@ from ..configuration_utils import ConfigMixin, register_to_config from ..utils import BaseOutput, logging -from .cross_attention import AttnProcessor +from .cross_attention import LoRACrossAttnProcessor +from .attention import BasicTransformerBlock +from .pipelines import StableDiffusionPipeline +from .scheduler import EulerAncestralDiscreateScheduler from .embeddings import TimestepEmbedding, Timesteps from .modeling_utils import ModelMixin from .unet_2d_blocks import ( @@ -35,16 +38,66 @@ class AttentionBasedGenerator(nn.Module): - def __init__(self): + def __init__( + self, + model_name: Optional[str] = None, + model_ckpt: Optional[str] = None, + stable_diffusion_version: str = "1.5", + lora_weights: Optional[str] = None, + torch_dtype=torch.float32, + ): super().__init__() - pass + if stable_diffusion_version == "2.1": + warnings.warn("StableDiffusion v2.x is not supported and may give unexpected results.") + + if model_name is None: + if stable_diffusion_version == "1.5": + model_name = "runwayml/stable-diffusion-v1-5" + elif stable_diffusion_version == "2.1": + model_name = "stabilityai/stable-diffusion-2-1" + else: + raise ValueError( + f"Unknown stable diffusion version: {stable_diffusion_version}. 
Version must be either '1.5' or '2.1'" + ) + + scheduler = EulerAncestralDiscreteScheduler.from_pretrained(model_name, subfolder="scheduler") + + if model_ckpt is not None: + pipe = StableDiffusionPipeline.from_ckpt( + model_ckpt, + scheduler=scheduler, + torch_dtype=torch_dtype, + safety_checker=None, + ) + pipe.scheduler = scheduler + else: + pipe = StableDiffusionPipeline.from_pretrained( + model_name, + scheduler=scheduler, + torch_dtype=torch_dtype, + safety_checker=None, + ) + + if lora_weights: + print(f"Applying LoRA weights from {lora_weights}") + apply_unet_lora_weights( + pipeline=pipe, unet_path=lora_weights + ) + + self.pipeline = pipe + self.unet = pipe.unet + self.vae = pipe.vae + self.text_encoder = pipe.text_encoder + self.tokenizer = pipe.tokenizer + self.scheduler = scheduler + self.dtype = torch_dtype def generate( self, prompt: Union[str, List[str]] = "a photo of an astronaut riding a horse on mars", - negative_prompt: Union[str, List[str]] = "", - liked: List[Image.Image] = [], - disliked: List[Image.Image] = [], + negative_prompt: Optional[Union[str, List[str]]] = "", + liked: Optional[List[Image.Image]] = [], + disliked: Optional[List[Image.Image]] = [], seed: int = 42, n_images: int = 1, guidance_scale: float = 8.0, @@ -57,6 +110,10 @@ def generate( pos_bottleneck_scale: float = 1.0, neg_bottleneck_scale: float = 1.0, ) + + with tqdm(total=denoising_steps) as pbar: + for i, t in enumerate(timestamp): + pass From 87665a8fdc65bcd3bcbb5e3d7761efa2de794685 Mon Sep 17 00:00:00 2001 From: shauray8 Date: Thu, 3 Aug 2023 20:45:52 +0530 Subject: [PATCH 38/98] prev --- src/diffusers/pipelines/fabric/pipeline_fabric.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/src/diffusers/pipelines/fabric/pipeline_fabric.py b/src/diffusers/pipelines/fabric/pipeline_fabric.py index d29e352decdf..ca671475db87 100644 --- a/src/diffusers/pipelines/fabric/pipeline_fabric.py +++ b/src/diffusers/pipelines/fabric/pipeline_fabric.py @@ -37,7 +37,7 @@ logger = logging.get_logger(__name__) # pylint: disable=invalid-name -class AttentionBasedGenerator(nn.Module): +class FabricModel(nn.Module): def __init__( self, model_name: Optional[str] = None, @@ -47,6 +47,7 @@ def __init__( torch_dtype=torch.float32, ): super().__init__() + # Getting UNet from Stable diffusion if stable_diffusion_version == "2.1": warnings.warn("StableDiffusion v2.x is not supported and may give unexpected results.") @@ -92,7 +93,7 @@ def __init__( self.scheduler = scheduler self.dtype = torch_dtype - def generate( + def forward( self, prompt: Union[str, List[str]] = "a photo of an astronaut riding a horse on mars", negative_prompt: Optional[Union[str, List[str]]] = "", From 4f706a8d05f67a6fa8e5272a2b667af1b1c5b4f3 Mon Sep 17 00:00:00 2001 From: shauray8 Date: Sat, 5 Aug 2023 02:08:19 +0530 Subject: [PATCH 39/98] more things, getting started --- .../pipelines/fabric/pipeline_fabric.py | 23 +++++++++++++++++++ 1 file changed, 23 insertions(+) diff --git a/src/diffusers/pipelines/fabric/pipeline_fabric.py b/src/diffusers/pipelines/fabric/pipeline_fabric.py index ca671475db87..44afc884fd4d 100644 --- a/src/diffusers/pipelines/fabric/pipeline_fabric.py +++ b/src/diffusers/pipelines/fabric/pipeline_fabric.py @@ -111,6 +111,29 @@ def forward( pos_bottleneck_scale: float = 1.0, neg_bottleneck_scale: float = 1.0, ) + if seed is not None: + torch.manual_seed(seed) + + if liked and len(liked) > 0: + pass + else: + pos_latents = torch.tensor([], device=self.device, dtype=self.dtype) + + if disliked and 
len(disliked) > 0: + pass + else: + neg_latents = torch.Tensor([], device=self.device, dtype=self.dtype) + + if isinstance(prompt, str): + prompt = [prompt] * n_images + else: + assert len(prompts) == n_images + + if isinstance(negative_prompt, str): + negative_prompt = [negative_prompt] * n_images + else: + assert len(negative_prompts) == n_images + with tqdm(total=denoising_steps) as pbar: for i, t in enumerate(timestamp): From 697a0b0f80d2bbf18fac768263e3a8de348247ab Mon Sep 17 00:00:00 2001 From: shauray8 Date: Sat, 5 Aug 2023 11:30:47 +0530 Subject: [PATCH 40/98] more functions --- src/diffusers/pipelines/fabric/__init__.py | 2 +- .../pipelines/fabric/pipeline_fabric.py | 417 +++++++++++++++++- 2 files changed, 395 insertions(+), 24 deletions(-) diff --git a/src/diffusers/pipelines/fabric/__init__.py b/src/diffusers/pipelines/fabric/__init__.py index e681ff9f1aeb..db76db7bc0bf 100644 --- a/src/diffusers/pipelines/fabric/__init__.py +++ b/src/diffusers/pipelines/fabric/__init__.py @@ -4,7 +4,7 @@ ) try: - if not (is_transformers_available() and is_torch_available()): + if not is_torch_available(): raise OptionalDependencyNotAvailable() except OptionalDependencyNotAvailable: from ...utils.dummy_torch_and_transformers_objects import * # noqa F403 diff --git a/src/diffusers/pipelines/fabric/pipeline_fabric.py b/src/diffusers/pipelines/fabric/pipeline_fabric.py index 44afc884fd4d..017c7a1ccfc8 100644 --- a/src/diffusers/pipelines/fabric/pipeline_fabric.py +++ b/src/diffusers/pipelines/fabric/pipeline_fabric.py @@ -18,12 +18,12 @@ from torch import nn from torch.nn import functional as F -from ..configuration_utils import ConfigMixin, register_to_config -from ..utils import BaseOutput, logging -from .cross_attention import LoRACrossAttnProcessor -from .attention import BasicTransformerBlock -from .pipelines import StableDiffusionPipeline -from .scheduler import EulerAncestralDiscreateScheduler +from ...configuration_utils import ConfigMixin, register_to_config +from ...utils import BaseOutput, logging +from ...models.cross_attention import LoRACrossAttnProcessor +from ...models.attention import BasicTransformerBlock +from ..stable_diffusion import StableDiffusionPipeline +from ...schedulers import EulerAncestralDiscreateScheduler from .embeddings import TimestepEmbedding, Timesteps from .modeling_utils import ModelMixin from .unet_2d_blocks import ( @@ -36,18 +36,118 @@ logger = logging.get_logger(__name__) # pylint: disable=invalid-name +def apply_unet_lora_weights(pipeline, unet_path): + model_weight = torch.load(unet_path, map_location="cpu") + unet = pipeline.unet + lora_attn_procs = {} + lora_rank = list( + set([v.size(0) for k, v in model_weight.items() if k.endswith("down.weight")]) + ) + assert len(lora_rank) == 1 + lora_rank = lora_rank[0] + for name in unet.attn_processors.keys(): + cross_attention_dim = ( + None + if name.endswith("attn1.processor") + else unet.config.cross_attention_dim + ) + if name.startswith("mid_block"): + hidden_size = unet.config.block_out_channels[-1] + elif name.startswith("up_blocks"): + block_id = int(name[len("up_blocks.")]) + hidden_size = list(reversed(unet.config.block_out_channels))[block_id] + elif name.startswith("down_blocks"): + block_id = int(name[len("down_blocks.")]) + hidden_size = unet.config.block_out_channels[block_id] + + lora_attn_procs[name] = LoRACrossAttnProcessor( + hidden_size=hidden_size, + cross_attention_dim=cross_attention_dim, + rank=lora_rank, + ).to(pipeline.device) + unet.set_attn_processor(lora_attn_procs) + 
unet.load_state_dict(model_weight, strict=False) + + +def attn_with_weights( + attn: nn.Module, + hidden_states, + encoder_hidden_states=None, + attention_mask=None, + weights=None, # shape: (batch_size, sequence_length) + lora_scale=1.0, +): + batch_size, sequence_length, _ = ( + hidden_states.shape + if encoder_hidden_states is None + else encoder_hidden_states.shape + ) + attention_mask = attn.prepare_attention_mask( + attention_mask, sequence_length, batch_size + ) + + if isinstance(attn.processor, LoRACrossAttnProcessor): + query = attn.to_q(hidden_states) + lora_scale * attn.processor.to_q_lora( + hidden_states + ) + else: + query = attn.to_q(hidden_states) + + if encoder_hidden_states is None: + encoder_hidden_states = hidden_states + elif attn.norm_cross: + encoder_hidden_states = attn.norm_encoder_hidden_states(encoder_hidden_states) + + if isinstance(attn.processor, LoRACrossAttnProcessor): + key = attn.to_k(encoder_hidden_states) + lora_scale * attn.processor.to_k_lora( + encoder_hidden_states + ) + value = attn.to_v( + encoder_hidden_states + ) + lora_scale * attn.processor.to_v_lora(encoder_hidden_states) + else: + key = attn.to_k(encoder_hidden_states) + value = attn.to_v(encoder_hidden_states) + + query = attn.head_to_batch_dim(query) + key = attn.head_to_batch_dim(key) + value = attn.head_to_batch_dim(value) + + attention_probs = attn.get_attention_scores(query, key, attention_mask) + + if weights is not None: + if weights.shape[0] != 1: + weights = weights.repeat_interleave(attn.heads, dim=0) + attention_probs = attention_probs * weights[:, None] + attention_probs = attention_probs / attention_probs.sum(dim=-1, keepdim=True) + + hidden_states = torch.bmm(attention_probs, value) + hidden_states = attn.batch_to_head_dim(hidden_states) + + # linear proj + if isinstance(attn.processor, LoRACrossAttnProcessor): + hidden_states = attn.to_out[0]( + hidden_states + ) + lora_scale * attn.processor.to_out_lora(hidden_states) + else: + hidden_states = attn.to_out[0](hidden_states) + # dropout + hidden_states = attn.to_out[1](hidden_states) + + return hidden_states + -class FabricModel(nn.Module): +class Fabric(nn.Module): def __init__( self, model_name: Optional[str] = None, model_ckpt: Optional[str] = None, stable_diffusion_version: str = "1.5", lora_weights: Optional[str] = None, - torch_dtype=torch.float32, + torch_dtype=torch.float32 ): super().__init__() - # Getting UNet from Stable diffusion + if stable_diffusion_version == "2.1": warnings.warn("StableDiffusion v2.x is not supported and may give unexpected results.") @@ -93,12 +193,166 @@ def __init__( self.scheduler = scheduler self.dtype = torch_dtype + @property + def device(self): + return next(self.parameters()).device + + def to(self, device): + self.pipeline.to(device) + return super().to(device) + + def initialize_prompts(self, prompts: List[str]): + prompt_tokens = self.tokenizer( + prompts, + return_tensors="pt", + max_length=self.tokenizer.model_max_length, + padding="max_length", + truncation=True, + ) + + if ( + hasattr(self.text_encoder.config, "use_attention_mask") + and self.text_encoder.config.use_attention_mask + ): + attention_mask = prompt_tokens.attention_mask.to(self.device) + else: + attention_mask = None + + prompt_embd = self.text_encoder( + input_ids=prompt_tokens.input_ids.to(self.device), + attention_mask=attention_mask, + ).last_hidden_state + + return prompt_embd + + def get_unet_hidden_states(self, z_all, t, prompt_embd): + cached_hidden_states = [] + for module in self.unet.modules(): + if 
isinstance(module, BasicTransformerBlock): + + def new_forward(self, hidden_states, *args, **kwargs): + cached_hidden_states.append(hidden_states.clone().detach().cpu()) + return self.old_forward(hidden_states, *args, **kwargs) + + module.attn1.old_forward = module.attn1.forward + module.attn1.forward = new_forward.__get__(module.attn1) + + # run forward pass to cache hidden states, output can be discarded + _ = self.unet(z_all, t, encoder_hidden_states=prompt_embd) + + # restore original forward pass + for module in self.unet.modules(): + if isinstance(module, BasicTransformerBlock): + module.attn1.forward = module.attn1.old_forward + del module.attn1.old_forward + + return cached_hidden_states + + def unet_forward_with_cached_hidden_states( + self, + z_all, + t, + prompt_embd, + cached_pos_hiddens: Optional[List[torch.Tensor]] = None, + cached_neg_hiddens: Optional[List[torch.Tensor]] = None, + pos_weights=(0.8, 0.8), + neg_weights=(0.5, 0.5), + ): + if cached_pos_hiddens is None and cached_neg_hiddens is None: + return self.unet(z_all, t, encoder_hidden_states=prompt_embd) + + local_pos_weights = torch.linspace( + *pos_weights, steps=len(self.unet.down_blocks) + 1 + )[:-1].tolist() + local_neg_weights = torch.linspace( + *neg_weights, steps=len(self.unet.down_blocks) + 1 + )[:-1].tolist() + + for block, pos_weight, neg_weight in zip( + self.unet.down_blocks + [self.unet.mid_block] + self.unet.up_blocks, + local_pos_weights + [pos_weights[1]] + local_pos_weights[::-1], + local_neg_weights + [neg_weights[1]] + local_neg_weights[::-1], + ): + for module in block.modules(): + if isinstance(module, BasicTransformerBlock): + + def new_forward( + self, + hidden_states, + pos_weight=pos_weight, + neg_weight=neg_weight, + **kwargs, + ): + cond_hiddens, uncond_hiddens = hidden_states.chunk(2, dim=0) + batch_size, d_model = cond_hiddens.shape[:2] + device, dtype = hidden_states.device, hidden_states.dtype + + weights = torch.ones( + batch_size, d_model, device=device, dtype=dtype + ) + + if cached_pos_hiddens is not None: + cached_pos_hs = cached_pos_hiddens.pop(0).to( + hidden_states.device + ) + cond_pos_hs = torch.cat( + [cond_hiddens, cached_pos_hs], dim=1 + ) + pos_weights = weights.clone().repeat( + 1, 1 + cached_pos_hs.shape[1] // d_model + ) + pos_weights[:, d_model:] = pos_weight + out_pos = attn_with_weights( + self, + cond_hiddens, + encoder_hidden_states=cond_pos_hs, + weights=pos_weights, + ) + else: + out_pos = self.old_forward(cond_hiddens) + + if cached_neg_hiddens is not None: + cached_neg_hs = cached_neg_hiddens.pop(0).to( + hidden_states.device + ) + uncond_neg_hs = torch.cat( + [uncond_hiddens, cached_neg_hs], dim=1 + ) + neg_weights = weights.clone().repeat( + 1, 1 + cached_neg_hs.shape[1] // d_model + ) + neg_weights[:, d_model:] = neg_weight + out_neg = attn_with_weights( + self, + uncond_hiddens, + encoder_hidden_states=uncond_neg_hs, + weights=neg_weights, + ) + else: + out_neg = self.old_forward(uncond_hiddens) + + out = torch.cat([out_pos, out_neg], dim=0) + return out + + module.attn1.old_forward = module.attn1.forward + module.attn1.forward = new_forward.__get__(module.attn1) + + out = self.unet(z_all, t, encoder_hidden_states=prompt_embd) + + # restore original forward pass + for module in self.unet.modules(): + if isinstance(module, BasicTransformerBlock): + module.attn1.forward = module.attn1.old_forward + del module.attn1.old_forward + + return out + def forward( - self, + self, prompt: Union[str, List[str]] = "a photo of an astronaut riding a horse on mars", - 
negative_prompt: Optional[Union[str, List[str]]] = "", - liked: Optional[List[Image.Image]] = [], - disliked: Optional[List[Image.Image]] = [], + negative_prompt: Union[str, List[str]] = "", + liked: List[Image.Image] = [], + disliked: List[Image.Image] = [], seed: int = 42, n_images: int = 1, guidance_scale: float = 8.0, @@ -110,36 +364,153 @@ def forward( neg_scale: float = 0.5, pos_bottleneck_scale: float = 1.0, neg_bottleneck_scale: float = 1.0, - ) + ): + """ + Generate a trajectory of images with binary feedback. + The feedback can be given as a list of liked and disliked images. + """ if seed is not None: torch.manual_seed(seed) + z = torch.randn(n_images, 4, 64, 64, device=self.device, dtype=self.dtype) + if liked and len(liked) > 0: - pass + pos_images = [self.image_to_tensor(img) for img in liked] + pos_images = torch.stack(pos_images).to(self.device, dtype=self.dtype) + pos_latents = ( + self.vae.config.scaling_factor + * self.vae.encode(pos_images).latent_dist.sample() + ) else: pos_latents = torch.tensor([], device=self.device, dtype=self.dtype) if disliked and len(disliked) > 0: - pass + neg_images = [self.image_to_tensor(img) for img in disliked] + neg_images = torch.stack(neg_images).to(self.device, dtype=self.dtype) + neg_latents = ( + self.vae.config.scaling_factor + * self.vae.encode(neg_images).latent_dist.sample() + ) else: - neg_latents = torch.Tensor([], device=self.device, dtype=self.dtype) + neg_latents = torch.tensor([], device=self.device, dtype=self.dtype) if isinstance(prompt, str): prompt = [prompt] * n_images else: - assert len(prompts) == n_images - + assert len(prompt) == n_images if isinstance(negative_prompt, str): negative_prompt = [negative_prompt] * n_images else: - assert len(negative_prompts) == n_images + assert len(negative_prompt) == n_images + + ( + cond_prompt_embs, + uncond_prompt_embs, + null_prompt_emb, + ) = self.initialize_prompts(prompt + negative_prompt + [""]).split([n_images, n_images, 1]) + batched_prompt_embd = torch.cat([cond_prompt_embs, uncond_prompt_embs], dim=0) + self.scheduler.set_timesteps(denoising_steps, device=self.device) + timesteps = self.scheduler.timesteps + + z = z * self.scheduler.init_noise_sigma + + num_warmup_steps = len(timesteps) - denoising_steps * self.scheduler.order + + ref_start_idx = round(len(timesteps) * feedback_start) + ref_end_idx = round(len(timesteps) * feedback_end) with tqdm(total=denoising_steps) as pbar: - for i, t in enumerate(timestamp): - - pass + for i, t in enumerate(timesteps): + if hasattr(self.scheduler, "sigma_t"): + sigma = self.scheduler.sigma_t[t] + elif hasattr(self.scheduler, "sigmas"): + sigma = self.scheduler.sigmas[i] + else: + sigma = 0 + alpha_hat = 1 / (sigma**2 + 1) + + z_single = self.scheduler.scale_model_input(z, t) + z_all = torch.cat([z_single] * 2, dim=0) + z_ref = torch.cat([pos_latents, neg_latents], dim=0) + + if i >= ref_start_idx and i <= ref_end_idx: + weight = max_weight + else: + weight = min_weight + pos_ws = (weight, weight * pos_bottleneck_scale) + neg_ws = (weight * neg_scale, weight * neg_scale * neg_bottleneck_scale) + + if z_ref.size(0) > 0 and weight > 0: + noise = torch.randn_like(z_ref) + if isinstance(self.scheduler, EulerAncestralDiscreteScheduler): + z_ref_noised = ( + alpha_hat**0.5 * z_ref + (1 - alpha_hat) ** 0.5 * noise + ) + else: + z_ref_noised = self.scheduler.add_noise(z_ref, noise, t) + + ref_prompt_embd = torch.cat([null_prompt_emb] * (pos_latents.size(0) + neg_latents.size(0)), dim=0) + + cached_hidden_states = 
self.get_unet_hidden_states( + z_ref_noised, t, ref_prompt_embd + ) + + n_pos, n_neg = pos_latents.shape[0], neg_latents.shape[0] + cached_pos_hs, cached_neg_hs = [], [] + for hs in cached_hidden_states: + cached_pos, cached_neg = hs.split([n_pos, n_neg], dim=0) + cached_pos = cached_pos.view( + 1, -1, *cached_pos.shape[2:] + ).expand(n_images, -1, -1) + cached_neg = cached_neg.view( + 1, -1, *cached_neg.shape[2:] + ).expand(n_images, -1, -1) + cached_pos_hs.append(cached_pos) + cached_neg_hs.append(cached_neg) + + if n_pos == 0: + cached_pos_hs = None + if n_neg == 0: + cached_neg_hs = None + else: + cached_pos_hs, cached_neg_hs = None, None + + unet_out = self.unet_forward_with_cached_hidden_states( + z_all, + t, + prompt_embd=batched_prompt_embd, + cached_pos_hiddens=cached_pos_hs, + cached_neg_hiddens=cached_neg_hs, + pos_weights=pos_ws, + neg_weights=neg_ws, + ).sample + + noise_cond, noise_uncond = unet_out.chunk(2) + guidance = noise_cond - noise_uncond + noise_pred = noise_uncond + guidance_scale * guidance + z = self.scheduler.step(noise_pred, t, z).prev_sample + if i == len(timesteps) - 1 or ( + (i + 1) > num_warmup_steps and (i + 1) % self.scheduler.order == 0 + ): + pbar.update() + y = self.pipeline.decode_latents(z) + imgs = self.pipeline.numpy_to_pil(y) + return imgs + @staticmethod + def image_to_tensor(image: Union[str, Image.Image]): + """ + Convert a PIL image to a torch tensor. + """ + if isinstance(image, str): + image = Image.open(image) + if not image.mode == "RGB": + image = image.convert("RGB") + image = image.resize((512, 512)) + image = np.array(image).astype(np.uint8) + image = (image / 127.5 - 1.0).astype(np.float32) + return torch.from_numpy(image).permute(2, 0, 1) From bddf128bbbcb6b1ab7e567ffabb88977a416671d Mon Sep 17 00:00:00 2001 From: shauray8 Date: Wed, 9 Aug 2023 16:34:53 +0530 Subject: [PATCH 41/98] makeing it more readable --- .../pipelines/fabric/pipeline_fabric.py | 111 ++++++++---------- 1 file changed, 49 insertions(+), 62 deletions(-) diff --git a/src/diffusers/pipelines/fabric/pipeline_fabric.py b/src/diffusers/pipelines/fabric/pipeline_fabric.py index 017c7a1ccfc8..af02389c05eb 100644 --- a/src/diffusers/pipelines/fabric/pipeline_fabric.py +++ b/src/diffusers/pipelines/fabric/pipeline_fabric.py @@ -20,6 +20,7 @@ from ...configuration_utils import ConfigMixin, register_to_config from ...utils import BaseOutput, logging +from ...pipelines import DissusionPipeline from ...models.cross_attention import LoRACrossAttnProcessor from ...models.attention import BasicTransformerBlock from ..stable_diffusion import StableDiffusionPipeline @@ -137,12 +138,12 @@ def attn_with_weights( return hidden_states -class Fabric(nn.Module): +class Fabric(DiffusionPipeline): def __init__( self, model_name: Optional[str] = None, - model_ckpt: Optional[str] = None, stable_diffusion_version: str = "1.5", + scheduler: EulerAncestralDiscreteScheduler, lora_weights: Optional[str] = None, torch_dtype=torch.float32 ): @@ -163,21 +164,12 @@ def __init__( scheduler = EulerAncestralDiscreteScheduler.from_pretrained(model_name, subfolder="scheduler") - if model_ckpt is not None: - pipe = StableDiffusionPipeline.from_ckpt( - model_ckpt, - scheduler=scheduler, - torch_dtype=torch_dtype, - safety_checker=None, - ) - pipe.scheduler = scheduler - else: - pipe = StableDiffusionPipeline.from_pretrained( - model_name, - scheduler=scheduler, - torch_dtype=torch_dtype, - safety_checker=None, - ) + pipe = StableDiffusionPipeline.from_pretrained( + model_name, + scheduler=scheduler, + 
torch_dtype=torch_dtype, + safety_checker=None, + ) if lora_weights: print(f"Applying LoRA weights from {lora_weights}") @@ -347,18 +339,29 @@ def new_forward( return out - def forward( + def preprocess_feedback_images(images, vae) -> torch.tensor: + images_t = [self.image_to_tensor(img) for img in images] + images_t = torch.stack(images_t).to(self.device, dtype=self.dtype) + latents = ( + vae.config.scaling_factor + * vae.encode(iamges_t).latent_dist.sample() + ) + return latents + + @torch.no_grad() + + def __call__( self, - prompt: Union[str, List[str]] = "a photo of an astronaut riding a horse on mars", - negative_prompt: Union[str, List[str]] = "", - liked: List[Image.Image] = [], - disliked: List[Image.Image] = [], - seed: int = 42, + prompt: Optional[Union[str, List[str]]] = None, + negative_prompt: Optional[Union[str, List[str]]] = None, + liked: Optional[List[Image.Image]] = None, + disliked: Optional[List[Image.Image]] = None, + random_seed: int = 42, n_images: int = 1, guidance_scale: float = 8.0, denoising_steps: int = 20, - feedback_start: float = 0.33, - feedback_end: float = 0.66, + feedback_start_ratio: float = 0.33, + feedback_end_ratio: float = 0.66, min_weight: float = 0.1, max_weight: float = 1.0, neg_scale: float = 0.5, @@ -369,30 +372,15 @@ def forward( Generate a trajectory of images with binary feedback. The feedback can be given as a list of liked and disliked images. """ - if seed is not None: + + if random_seed is not None: torch.manual_seed(seed) - z = torch.randn(n_images, 4, 64, 64, device=self.device, dtype=self.dtype) + latent_noise = torch.randn(n_images, 4, 64, 64, device=self.device, dtype=self.dtype) - if liked and len(liked) > 0: - pos_images = [self.image_to_tensor(img) for img in liked] - pos_images = torch.stack(pos_images).to(self.device, dtype=self.dtype) - pos_latents = ( - self.vae.config.scaling_factor - * self.vae.encode(pos_images).latent_dist.sample() - ) - else: - pos_latents = torch.tensor([], device=self.device, dtype=self.dtype) - - if disliked and len(disliked) > 0: - neg_images = [self.image_to_tensor(img) for img in disliked] - neg_images = torch.stack(neg_images).to(self.device, dtype=self.dtype) - neg_latents = ( - self.vae.config.scaling_factor - * self.vae.encode(neg_images).latent_dist.sample() - ) - else: - neg_latents = torch.tensor([], device=self.device, dtype=self.dtype) + positive_letents = self.preprocess_feedback_images(liked,self.vae) if liked and len(liked)>0 else torch.tensor([], device=self.device, dtype=self.dtype) + + negative_letents = self.preprocess_feedback_images(disliked,self.vae) if disliked and len(disliked)>0 else torch.tensor([], device=self.device, dtype=self.dtype) if isinstance(prompt, str): prompt = [prompt] * n_images @@ -403,43 +391,42 @@ def forward( else: assert len(negative_prompt) == n_images - ( - cond_prompt_embs, - uncond_prompt_embs, - null_prompt_emb, - ) = self.initialize_prompts(prompt + negative_prompt + [""]).split([n_images, n_images, 1]) + + (cond_prompt_embs, uncond_prompt_embs, null_prompt_emb) = self.initialize_prompts(prompt + negative_prompt + [""]).split([n_images, n_images, 1]) + batched_prompt_embd = torch.cat([cond_prompt_embs, uncond_prompt_embs], dim=0) self.scheduler.set_timesteps(denoising_steps, device=self.device) timesteps = self.scheduler.timesteps - z = z * self.scheduler.init_noise_sigma + latent_noise = latent_noise * self.scheduler.init_noise_sigma num_warmup_steps = len(timesteps) - denoising_steps * self.scheduler.order - ref_start_idx = round(len(timesteps) * 
feedback_start) - ref_end_idx = round(len(timesteps) * feedback_end) + ref_start_idx = round(len(timesteps) * feedback_start_ratio) + ref_end_idx = round(len(timesteps) * feedback_end_ratio) with tqdm(total=denoising_steps) as pbar: for i, t in enumerate(timesteps): - if hasattr(self.scheduler, "sigma_t"): - sigma = self.scheduler.sigma_t[t] - elif hasattr(self.scheduler, "sigmas"): + sigma = self.scheduler.sigma_t[t] if hasattr(self.scheduler, 'sigma_t') else 0 + if hasattr(self.scheduler, "sigmas"): sigma = self.scheduler.sigmas[i] - else: - sigma = 0 + alpha_hat = 1 / (sigma**2 + 1) - z_single = self.scheduler.scale_model_input(z, t) + z_single = self.scheduler.scale_model_input(latent_noise, t) z_all = torch.cat([z_single] * 2, dim=0) z_ref = torch.cat([pos_latents, neg_latents], dim=0) + weight_factor = self.get_current_weight_factor(i, denoising_step, ref_start_idx, + ref_end_idx, min_weight, max_weight) if i >= ref_start_idx and i <= ref_end_idx: weight = max_weight else: weight = min_weight - pos_ws = (weight, weight * pos_bottleneck_scale) - neg_ws = (weight * neg_scale, weight * neg_scale * neg_bottleneck_scale) + + pos_ws = (weight_factor, weight_factor * pos_bottleneck_scale) + neg_ws = (weight_factor * neg_scale, weight_factor * neg_scale * neg_bottleneck_scale) if z_ref.size(0) > 0 and weight > 0: noise = torch.randn_like(z_ref) From 5ab690f3c1665220b4a0c4346b247c4ab93a9656 Mon Sep 17 00:00:00 2001 From: shauray8 Date: Fri, 11 Aug 2023 01:52:06 +0530 Subject: [PATCH 42/98] almost done testing --- src/diffusers/pipelines/fabric/__init__.py | 22 +++ .../pipelines/fabric/pipeline_fabric.py | 137 +++++++++--------- 2 files changed, 88 insertions(+), 71 deletions(-) diff --git a/src/diffusers/pipelines/fabric/__init__.py b/src/diffusers/pipelines/fabric/__init__.py index db76db7bc0bf..ca8e828d2ae6 100644 --- a/src/diffusers/pipelines/fabric/__init__.py +++ b/src/diffusers/pipelines/fabric/__init__.py @@ -1,7 +1,29 @@ +from dataclasses import dataclass from ...utils import ( + BaseOutput, OptionalDependencyNotAvailable, is_torch_available, ) +from typing import Union, Optional, List +import numpy as np +import PIL + +@dataclass +class FabricPipelineOutput(BaseOutput): + """ + Output class for Stable Diffusion pipelines. + + Args: + images (`List[PIL.Image.Image]` or `np.ndarray`) + List of denoised PIL images of length `batch_size` or numpy array of shape `(batch_size, height, width, + num_channels)`. PIL images or numpy array present the denoised images of the diffusion pipeline. + nsfw_content_detected (`Optional[List[bool]]`) + List of flags denoting whether the corresponding generated image likely represents "not-safe-for-work" + (nsfw) content, or `None` if safety checking could not be performed. + """ + + images: Union[List[PIL.Image.Image], np.ndarray] + nsfw_content_detected: Optional[List[bool]] try: if not is_torch_available(): diff --git a/src/diffusers/pipelines/fabric/pipeline_fabric.py b/src/diffusers/pipelines/fabric/pipeline_fabric.py index af02389c05eb..20da34bb1163 100644 --- a/src/diffusers/pipelines/fabric/pipeline_fabric.py +++ b/src/diffusers/pipelines/fabric/pipeline_fabric.py @@ -33,6 +33,7 @@ UNetMidBlock2DCrossAttn, get_down_block, ) +from . 
import FabricPipelineOutput logger = logging.get_logger(__name__) # pylint: disable=invalid-name @@ -143,7 +144,7 @@ def __init__( self, model_name: Optional[str] = None, stable_diffusion_version: str = "1.5", - scheduler: EulerAncestralDiscreteScheduler, + scheduler: EulerAncestralDiscreteScheduler = EulerAncestralDiscreteScheduler, lora_weights: Optional[str] = None, torch_dtype=torch.float32 ): @@ -194,28 +195,30 @@ def to(self, device): return super().to(device) def initialize_prompts(self, prompts: List[str]): - prompt_tokens = self.tokenizer( - prompts, - return_tensors="pt", - max_length=self.tokenizer.model_max_length, - padding="max_length", - truncation=True, - ) - - if ( - hasattr(self.text_encoder.config, "use_attention_mask") - and self.text_encoder.config.use_attention_mask - ): - attention_mask = prompt_tokens.attention_mask.to(self.device) - else: - attention_mask = None - - prompt_embd = self.text_encoder( - input_ids=prompt_tokens.input_ids.to(self.device), - attention_mask=attention_mask, - ).last_hidden_state - - return prompt_embd + # Breaking into individual prompts feels memory efficient + prompt_embed_list = [] + for prompt in prompts: + prompt_tokens = self.tokenizer( + prompt, + return_tensors="pt", + max_length=self.tokenizer.model_max_length, + padding="max_length", + truncation=True, + ) + + attention_mask = prompt_tokens.attention_mask.to(self.device) if ( + hasattr(self.text_encoder.config, "use_attention_mask") + and self.text_encoder.config.use_attention_mask + ) else None + + prompt_embd = self.text_encoder( + input_ids=prompt_tokens.input_ids.to(self.device), + attention_mask=attention_mask, + ).last_hidden_state + + prompt_embed_list.append(prompt_embd) + + return torch.cat(prompt_embed_list, dim=0) def get_unet_hidden_states(self, z_all, t, prompt_embd): cached_hidden_states = [] @@ -254,11 +257,31 @@ def unet_forward_with_cached_hidden_states( return self.unet(z_all, t, encoder_hidden_states=prompt_embd) local_pos_weights = torch.linspace( - *pos_weights, steps=len(self.unet.down_blocks) + 1 - )[:-1].tolist() + *pos_weights, steps=len(self.unet.down_blocks) + 1)[:-1] local_neg_weights = torch.linspace( - *neg_weights, steps=len(self.unet.down_blocks) + 1 - )[:-1].tolist() + *neg_weights, steps=len(self.unet.down_blocks) + 1)[:-1] + + def new_forward_caching(module, hidden_states, cached_hiddens, weight, is_positive): + cached_hs = cached_hiddens.pop(0).to( + hidden_states.device + ) + cond_hs = torch.cat( + [hidden_states, cached_hs], dim=1 + ) + weights = weights.clone().repeat( + 1, 1 + cached_pos_hs.shape[1] // d_model + ) + weights = torch.full((cond_hs.size(0), cond_hs.size(1) // hidden_states.size(1)), + weight, device=hidden_states.device) + weights[:, hidden_states.size(1):] = 1.0 + out = attn_with_weights( + self, + hidden_states, + encoder_hidden_states=cond_hs, + weights=weights, + ) + return out + for block, pos_weight, neg_weight in zip( self.unet.down_blocks + [self.unet.mid_block] + self.unet.up_blocks, @@ -283,45 +306,19 @@ def new_forward( batch_size, d_model, device=device, dtype=dtype ) + out_pos = self.old_forward(hidden_states) + out_neg = self.old_forward(hidden_states) + if cached_pos_hiddens is not None: - cached_pos_hs = cached_pos_hiddens.pop(0).to( - hidden_states.device - ) - cond_pos_hs = torch.cat( - [cond_hiddens, cached_pos_hs], dim=1 - ) - pos_weights = weights.clone().repeat( - 1, 1 + cached_pos_hs.shape[1] // d_model - ) - pos_weights[:, d_model:] = pos_weight - out_pos = attn_with_weights( - self, - cond_hiddens, - 
encoder_hidden_states=cond_pos_hs, - weights=pos_weights, - ) - else: - out_pos = self.old_forward(cond_hiddens) + out_pos = new_forward_caching( + self, hidden_states, cached_pos_hiddens, + pos_weight, is_positive=True) + if cached_neg_hiddens is not None: - cached_neg_hs = cached_neg_hiddens.pop(0).to( - hidden_states.device - ) - uncond_neg_hs = torch.cat( - [uncond_hiddens, cached_neg_hs], dim=1 - ) - neg_weights = weights.clone().repeat( - 1, 1 + cached_neg_hs.shape[1] // d_model - ) - neg_weights[:, d_model:] = neg_weight - out_neg = attn_with_weights( - self, - uncond_hiddens, - encoder_hidden_states=uncond_neg_hs, - weights=neg_weights, - ) - else: - out_neg = self.old_forward(uncond_hiddens) + out_neg = new_forward_caching( + self, hidden_states, cached_neg_hiddens, + neg_weight, is_positive=False) out = torch.cat([out_pos, out_neg], dim=0) return out @@ -418,17 +415,15 @@ def __call__( z_all = torch.cat([z_single] * 2, dim=0) z_ref = torch.cat([pos_latents, neg_latents], dim=0) - weight_factor = self.get_current_weight_factor(i, denoising_step, ref_start_idx, - ref_end_idx, min_weight, max_weight) if i >= ref_start_idx and i <= ref_end_idx: - weight = max_weight + weight_factor = max_weight else: - weight = min_weight + weight_factor = min_weight pos_ws = (weight_factor, weight_factor * pos_bottleneck_scale) neg_ws = (weight_factor * neg_scale, weight_factor * neg_scale * neg_bottleneck_scale) - if z_ref.size(0) > 0 and weight > 0: + if z_ref.size(0) > 0 and weight_factor > 0: noise = torch.randn_like(z_ref) if isinstance(self.scheduler, EulerAncestralDiscreteScheduler): z_ref_noised = ( @@ -437,13 +432,13 @@ def __call__( else: z_ref_noised = self.scheduler.add_noise(z_ref, noise, t) - ref_prompt_embd = torch.cat([null_prompt_emb] * (pos_latents.size(0) + neg_latents.size(0)), dim=0) + ref_prompt_embd = torch.cat([null_prompt_emb] * (len(posotive_latents) + len(negative_latents)), dim=0) cached_hidden_states = self.get_unet_hidden_states( z_ref_noised, t, ref_prompt_embd ) - n_pos, n_neg = pos_latents.shape[0], neg_latents.shape[0] + n_pos, n_neg = positive_latents.shape[0], negative_latents.shape[0] cached_pos_hs, cached_neg_hs = [], [] for hs in cached_hidden_states: cached_pos, cached_neg = hs.split([n_pos, n_neg], dim=0) @@ -486,7 +481,7 @@ def __call__( y = self.pipeline.decode_latents(z) imgs = self.pipeline.numpy_to_pil(y) - return imgs + return FabricPipelineOutpur(imgs) @staticmethod def image_to_tensor(image: Union[str, Image.Image]): From 4f8bc736e3db4065ab4cb7460f306aa57695598c Mon Sep 17 00:00:00 2001 From: shauray8 Date: Fri, 11 Aug 2023 02:02:59 +0530 Subject: [PATCH 43/98] var changes --- .../pipelines/fabric/pipeline_fabric.py | 17 ++++++----------- 1 file changed, 6 insertions(+), 11 deletions(-) diff --git a/src/diffusers/pipelines/fabric/pipeline_fabric.py b/src/diffusers/pipelines/fabric/pipeline_fabric.py index 20da34bb1163..dffecc13962e 100644 --- a/src/diffusers/pipelines/fabric/pipeline_fabric.py +++ b/src/diffusers/pipelines/fabric/pipeline_fabric.py @@ -17,24 +17,19 @@ import torch from torch import nn from torch.nn import functional as F +from PIL import Image +import numpy as np from ...configuration_utils import ConfigMixin, register_to_config from ...utils import BaseOutput, logging -from ...pipelines import DissusionPipeline from ...models.cross_attention import LoRACrossAttnProcessor from ...models.attention import BasicTransformerBlock from ..stable_diffusion import StableDiffusionPipeline -from ...schedulers import 
EulerAncestralDiscreateScheduler -from .embeddings import TimestepEmbedding, Timesteps -from .modeling_utils import ModelMixin -from .unet_2d_blocks import ( - CrossAttnDownBlock2D, - DownBlock2D, - UNetMidBlock2DCrossAttn, - get_down_block, -) +from ...schedulers import EulerAncestralDiscreteScheduler from . import FabricPipelineOutput +from ..pipeline_utils import DiffusionPipeline, ImagePipelineOutput + logger = logging.get_logger(__name__) # pylint: disable=invalid-name @@ -139,7 +134,7 @@ def attn_with_weights( return hidden_states -class Fabric(DiffusionPipeline): +class FabricPipeline(DiffusionPipeline): def __init__( self, model_name: Optional[str] = None, From 8476267b4b223d9b6b8bc48f564a1b91521c0449 Mon Sep 17 00:00:00 2001 From: shauray8 Date: Fri, 11 Aug 2023 02:46:29 +0530 Subject: [PATCH 44/98] testing --- src/diffusers/pipelines/fabric/pipeline_fabric.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/diffusers/pipelines/fabric/pipeline_fabric.py b/src/diffusers/pipelines/fabric/pipeline_fabric.py index dffecc13962e..efa84fd2e28e 100644 --- a/src/diffusers/pipelines/fabric/pipeline_fabric.py +++ b/src/diffusers/pipelines/fabric/pipeline_fabric.py @@ -366,7 +366,7 @@ def __call__( """ if random_seed is not None: - torch.manual_seed(seed) + torch.manual_seed(random_seed) latent_noise = torch.randn(n_images, 4, 64, 64, device=self.device, dtype=self.dtype) From 90e395bb41d474f80b830f7bf3db5b95783de62c Mon Sep 17 00:00:00 2001 From: shauray8 Date: Fri, 11 Aug 2023 11:44:33 +0530 Subject: [PATCH 45/98] device --- .../pipelines/fabric/pipeline_fabric.py | 43 ++++++++++--------- 1 file changed, 23 insertions(+), 20 deletions(-) diff --git a/src/diffusers/pipelines/fabric/pipeline_fabric.py b/src/diffusers/pipelines/fabric/pipeline_fabric.py index efa84fd2e28e..83e97b5284ad 100644 --- a/src/diffusers/pipelines/fabric/pipeline_fabric.py +++ b/src/diffusers/pipelines/fabric/pipeline_fabric.py @@ -19,6 +19,7 @@ from torch.nn import functional as F from PIL import Image import numpy as np +from tqdm import tqdm from ...configuration_utils import ConfigMixin, register_to_config from ...utils import BaseOutput, logging @@ -181,13 +182,13 @@ def __init__( self.scheduler = scheduler self.dtype = torch_dtype - @property - def device(self): - return next(self.parameters()).device + #@property + #def device(self): + # return next(self.parameters()).device - def to(self, device): - self.pipeline.to(device) - return super().to(device) + #def to(self, device): + # self.pipeline.to(device) + # return super().to(device) def initialize_prompts(self, prompts: List[str]): # Breaking into individual prompts feels memory efficient @@ -331,9 +332,9 @@ def new_forward( return out - def preprocess_feedback_images(images, vae) -> torch.tensor: + def preprocess_feedback_images(images, vae, device) -> torch.tensor: images_t = [self.image_to_tensor(img) for img in images] - images_t = torch.stack(images_t).to(self.device, dtype=self.dtype) + images_t = torch.stack(images_t).to(device, dtype=self.dtype) latents = ( vae.config.scaling_factor * vae.encode(iamges_t).latent_dist.sample() @@ -344,10 +345,10 @@ def preprocess_feedback_images(images, vae) -> torch.tensor: def __call__( self, - prompt: Optional[Union[str, List[str]]] = None, - negative_prompt: Optional[Union[str, List[str]]] = None, - liked: Optional[List[Image.Image]] = None, - disliked: Optional[List[Image.Image]] = None, + prompt: Optional[Union[str, List[str]]] = "", + negative_prompt: Optional[Union[str, List[str]]] = "", + 
liked: Optional[List[Image.Image]] = [], + disliked: Optional[List[Image.Image]] = [], random_seed: int = 42, n_images: int = 1, guidance_scale: float = 8.0, @@ -364,15 +365,16 @@ def __call__( Generate a trajectory of images with binary feedback. The feedback can be given as a list of liked and disliked images. """ - if random_seed is not None: torch.manual_seed(random_seed) + + device = self._execution_device - latent_noise = torch.randn(n_images, 4, 64, 64, device=self.device, dtype=self.dtype) + latent_noise = torch.randn(n_images, 4, 64, 64, device=device, dtype=self.dtype) - positive_letents = self.preprocess_feedback_images(liked,self.vae) if liked and len(liked)>0 else torch.tensor([], device=self.device, dtype=self.dtype) + positive_latents = self.preprocess_feedback_images(liked,self.vae,device) if liked and len(liked)>1 else torch.tensor([], device=device, dtype=self.dtype) - negative_letents = self.preprocess_feedback_images(disliked,self.vae) if disliked and len(disliked)>0 else torch.tensor([], device=self.device, dtype=self.dtype) + negative_latents = self.preprocess_feedback_images(disliked,self.vae,device) if disliked and len(disliked)>0 else torch.tensor([], device=device, dtype=self.dtype) if isinstance(prompt, str): prompt = [prompt] * n_images @@ -388,7 +390,7 @@ def __call__( batched_prompt_embd = torch.cat([cond_prompt_embs, uncond_prompt_embs], dim=0) - self.scheduler.set_timesteps(denoising_steps, device=self.device) + self.scheduler.set_timesteps(denoising_steps, device=device) timesteps = self.scheduler.timesteps latent_noise = latent_noise * self.scheduler.init_noise_sigma @@ -408,7 +410,7 @@ def __call__( z_single = self.scheduler.scale_model_input(latent_noise, t) z_all = torch.cat([z_single] * 2, dim=0) - z_ref = torch.cat([pos_latents, neg_latents], dim=0) + z_ref = torch.cat([positive_latents, negative_latents], dim=0) if i >= ref_start_idx and i <= ref_end_idx: weight_factor = max_weight @@ -466,14 +468,14 @@ def __call__( noise_cond, noise_uncond = unet_out.chunk(2) guidance = noise_cond - noise_uncond noise_pred = noise_uncond + guidance_scale * guidance - z = self.scheduler.step(noise_pred, t, z).prev_sample + latent_noise = self.scheduler.step(noise_pred, t, latent_noise).prev_sample if i == len(timesteps) - 1 or ( (i + 1) > num_warmup_steps and (i + 1) % self.scheduler.order == 0 ): pbar.update() - y = self.pipeline.decode_latents(z) + y = self.pipeline.decode_latents(latent_noise) imgs = self.pipeline.numpy_to_pil(y) return FabricPipelineOutpur(imgs) @@ -491,3 +493,4 @@ def image_to_tensor(image: Union[str, Image.Image]): image = np.array(image).astype(np.uint8) image = (image / 127.5 - 1.0).astype(np.float32) return torch.from_numpy(image).permute(2, 0, 1) + From 32a28b3df2f3a4d91235af72e72f93e4bf01965a Mon Sep 17 00:00:00 2001 From: shauray8 Date: Fri, 11 Aug 2023 12:48:27 +0530 Subject: [PATCH 46/98] device support --- .../pipelines/fabric/pipeline_fabric.py | 17 ++++++++++++----- 1 file changed, 12 insertions(+), 5 deletions(-) diff --git a/src/diffusers/pipelines/fabric/pipeline_fabric.py b/src/diffusers/pipelines/fabric/pipeline_fabric.py index 83e97b5284ad..dab5bd9cf6b5 100644 --- a/src/diffusers/pipelines/fabric/pipeline_fabric.py +++ b/src/diffusers/pipelines/fabric/pipeline_fabric.py @@ -142,7 +142,7 @@ def __init__( stable_diffusion_version: str = "1.5", scheduler: EulerAncestralDiscreteScheduler = EulerAncestralDiscreteScheduler, lora_weights: Optional[str] = None, - torch_dtype=torch.float32 + torch_dtype = None, ): super().__init__() @@ 
-174,13 +174,20 @@ def __init__( pipeline=pipe, unet_path=lora_weights ) + if self._execution_device is not "cuda": + torch_dtype = torch.float32 + else: + torch_dtype = torch_dtype if torch_dtype else torch.float16 + self.pipeline = pipe self.unet = pipe.unet self.vae = pipe.vae self.text_encoder = pipe.text_encoder self.tokenizer = pipe.tokenizer self.scheduler = scheduler + self.dtype = torch_dtype + self.device = self._execution_device #@property #def device(self): @@ -190,7 +197,7 @@ def __init__( # self.pipeline.to(device) # return super().to(device) - def initialize_prompts(self, prompts: List[str]): + def initialize_prompts(self, prompts: List[str], device): # Breaking into individual prompts feels memory efficient prompt_embed_list = [] for prompt in prompts: @@ -202,13 +209,13 @@ def initialize_prompts(self, prompts: List[str]): truncation=True, ) - attention_mask = prompt_tokens.attention_mask.to(self.device) if ( + attention_mask = prompt_tokens.attention_mask.to(device) if ( hasattr(self.text_encoder.config, "use_attention_mask") and self.text_encoder.config.use_attention_mask ) else None prompt_embd = self.text_encoder( - input_ids=prompt_tokens.input_ids.to(self.device), + input_ids=prompt_tokens.input_ids.to(device), attention_mask=attention_mask, ).last_hidden_state @@ -386,7 +393,7 @@ def __call__( assert len(negative_prompt) == n_images - (cond_prompt_embs, uncond_prompt_embs, null_prompt_emb) = self.initialize_prompts(prompt + negative_prompt + [""]).split([n_images, n_images, 1]) + (cond_prompt_embs, uncond_prompt_embs, null_prompt_emb) = self.initialize_prompts(prompt + negative_prompt + [""], device).split([n_images, n_images, 1]) batched_prompt_embd = torch.cat([cond_prompt_embs, uncond_prompt_embs], dim=0) From 12f27726137e5ffe54ac2f7ce4256bf9991c39ef Mon Sep 17 00:00:00 2001 From: shauray8 Date: Fri, 11 Aug 2023 12:54:07 +0530 Subject: [PATCH 47/98] maybe --- src/diffusers/pipelines/fabric/pipeline_fabric.py | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/src/diffusers/pipelines/fabric/pipeline_fabric.py b/src/diffusers/pipelines/fabric/pipeline_fabric.py index dab5bd9cf6b5..4bd8b00d6095 100644 --- a/src/diffusers/pipelines/fabric/pipeline_fabric.py +++ b/src/diffusers/pipelines/fabric/pipeline_fabric.py @@ -159,6 +159,11 @@ def __init__( f"Unknown stable diffusion version: {stable_diffusion_version}. 
Version must be either '1.5' or '2.1'" ) + if self._execution_device is not "cuda": + torch_dtype = torch.float32 + else: + torch_dtype = torch_dtype if torch_dtype else torch.float16 + scheduler = EulerAncestralDiscreteScheduler.from_pretrained(model_name, subfolder="scheduler") pipe = StableDiffusionPipeline.from_pretrained( @@ -174,10 +179,6 @@ def __init__( pipeline=pipe, unet_path=lora_weights ) - if self._execution_device is not "cuda": - torch_dtype = torch.float32 - else: - torch_dtype = torch_dtype if torch_dtype else torch.float16 self.pipeline = pipe self.unet = pipe.unet From cf18db86be8fcce19bf78263282c22ec57d769e0 Mon Sep 17 00:00:00 2001 From: shauray8 Date: Fri, 11 Aug 2023 13:19:48 +0530 Subject: [PATCH 48/98] device malfunctions --- src/diffusers/pipelines/fabric/pipeline_fabric.py | 1 - 1 file changed, 1 deletion(-) diff --git a/src/diffusers/pipelines/fabric/pipeline_fabric.py b/src/diffusers/pipelines/fabric/pipeline_fabric.py index 4bd8b00d6095..04115ba1b752 100644 --- a/src/diffusers/pipelines/fabric/pipeline_fabric.py +++ b/src/diffusers/pipelines/fabric/pipeline_fabric.py @@ -188,7 +188,6 @@ def __init__( self.scheduler = scheduler self.dtype = torch_dtype - self.device = self._execution_device #@property #def device(self): From 24802459e887888660f8a8fa6b07e2484bfd8a27 Mon Sep 17 00:00:00 2001 From: shauray8 Date: Fri, 11 Aug 2023 18:02:27 +0530 Subject: [PATCH 49/98] new new --- .../pipelines/fabric/pipeline_fabric.py | 28 +++++++++++++++++++ 1 file changed, 28 insertions(+) diff --git a/src/diffusers/pipelines/fabric/pipeline_fabric.py b/src/diffusers/pipelines/fabric/pipeline_fabric.py index 04115ba1b752..837c6a270323 100644 --- a/src/diffusers/pipelines/fabric/pipeline_fabric.py +++ b/src/diffusers/pipelines/fabric/pipeline_fabric.py @@ -246,6 +246,34 @@ def new_forward(self, hidden_states, *args, **kwargs): return cached_hidden_states + def enable_model_cpu_offload(self, gpu_id=0): + r""" + Offloads all models to CPU using accelerate, reducing memory usage with a low impact on performance. Compared + to `enable_sequential_cpu_offload`, this method moves one whole model at a time to the GPU when its `forward` + method is called, and the model remains in GPU until the next model runs. Memory savings are lower than with + `enable_sequential_cpu_offload`, but performance is much better due to the iterative execution of the `unet`. + """ + if is_accelerate_available() and is_accelerate_version(">=", "0.17.0.dev0"): + from accelerate import cpu_offload_with_hook + else: + raise ImportError("`enable_model_cpu_offload` requires `accelerate v0.17.0` or higher.") + + device = torch.device(f"cuda:{gpu_id}") + + if self.device.type != "cpu": + self.to("cpu", silence_dtype_warnings=True) + torch.cuda.empty_cache() # otherwise we don't see the memory savings (but they probably exist) + + hook = None + for cpu_offloaded_model in [self.text_encoder, self.unet, self.vae]: + _, hook = cpu_offload_with_hook(cpu_offloaded_model, device, prev_module_hook=hook) + + if self.safety_checker is not None: + _, hook = cpu_offload_with_hook(self.safety_checker, device, prev_module_hook=hook) + + # We'll offload the last model manually. 
+ self.final_offload_hook = hook + def unet_forward_with_cached_hidden_states( self, z_all, From be6ddf3e9e2fdd2abd92ba15779c7199ebafb36a Mon Sep 17 00:00:00 2001 From: shauray8 Date: Fri, 11 Aug 2023 18:13:20 +0530 Subject: [PATCH 50/98] register --- .../pipelines/fabric/pipeline_fabric.py | 41 ++++--------------- 1 file changed, 8 insertions(+), 33 deletions(-) diff --git a/src/diffusers/pipelines/fabric/pipeline_fabric.py b/src/diffusers/pipelines/fabric/pipeline_fabric.py index 837c6a270323..bdd7e422f8fd 100644 --- a/src/diffusers/pipelines/fabric/pipeline_fabric.py +++ b/src/diffusers/pipelines/fabric/pipeline_fabric.py @@ -159,11 +159,6 @@ def __init__( f"Unknown stable diffusion version: {stable_diffusion_version}. Version must be either '1.5' or '2.1'" ) - if self._execution_device is not "cuda": - torch_dtype = torch.float32 - else: - torch_dtype = torch_dtype if torch_dtype else torch.float16 - scheduler = EulerAncestralDiscreteScheduler.from_pretrained(model_name, subfolder="scheduler") pipe = StableDiffusionPipeline.from_pretrained( @@ -179,6 +174,14 @@ def __init__( pipeline=pipe, unet_path=lora_weights ) + + self.register_modules( + vae=pipe.vae, + text_encoder=pipe.text_encoder, + tokenizer=pipe.tokenizer, + unet=pipe.unet, + scheduler=scheduler, + ) self.pipeline = pipe self.unet = pipe.unet @@ -246,34 +249,6 @@ def new_forward(self, hidden_states, *args, **kwargs): return cached_hidden_states - def enable_model_cpu_offload(self, gpu_id=0): - r""" - Offloads all models to CPU using accelerate, reducing memory usage with a low impact on performance. Compared - to `enable_sequential_cpu_offload`, this method moves one whole model at a time to the GPU when its `forward` - method is called, and the model remains in GPU until the next model runs. Memory savings are lower than with - `enable_sequential_cpu_offload`, but performance is much better due to the iterative execution of the `unet`. - """ - if is_accelerate_available() and is_accelerate_version(">=", "0.17.0.dev0"): - from accelerate import cpu_offload_with_hook - else: - raise ImportError("`enable_model_cpu_offload` requires `accelerate v0.17.0` or higher.") - - device = torch.device(f"cuda:{gpu_id}") - - if self.device.type != "cpu": - self.to("cpu", silence_dtype_warnings=True) - torch.cuda.empty_cache() # otherwise we don't see the memory savings (but they probably exist) - - hook = None - for cpu_offloaded_model in [self.text_encoder, self.unet, self.vae]: - _, hook = cpu_offload_with_hook(cpu_offloaded_model, device, prev_module_hook=hook) - - if self.safety_checker is not None: - _, hook = cpu_offload_with_hook(self.safety_checker, device, prev_module_hook=hook) - - # We'll offload the last model manually. 
- self.final_offload_hook = hook - def unet_forward_with_cached_hidden_states( self, z_all, From 0463bd68db2055d20a90da6677a0096dbfa457bb Mon Sep 17 00:00:00 2001 From: shauray8 Date: Fri, 11 Aug 2023 18:41:37 +0530 Subject: [PATCH 51/98] testing --- .../pipelines/fabric/pipeline_fabric.py | 45 +++++++++++++------ 1 file changed, 32 insertions(+), 13 deletions(-) diff --git a/src/diffusers/pipelines/fabric/pipeline_fabric.py b/src/diffusers/pipelines/fabric/pipeline_fabric.py index bdd7e422f8fd..737b77f1b388 100644 --- a/src/diffusers/pipelines/fabric/pipeline_fabric.py +++ b/src/diffusers/pipelines/fabric/pipeline_fabric.py @@ -21,6 +21,22 @@ import numpy as np from tqdm import tqdm +from transformers import CLIPImageProcessor, CLIPTextModel, CLIPTokenizer + +from ...configuration_utils import FrozenDict +from ...image_processor import VaeImageProcessor +from ...loaders import FromSingleFileMixin, LoraLoaderMixin, TextualInversionLoaderMixin +from ...models import AutoencoderKL, UNet2DConditionModel +from ...schedulers import KarrasDiffusionSchedulers +from ...utils import ( + deprecate, + is_accelerate_available, + is_accelerate_version, + logging, + randn_tensor, + replace_example_docstring, +) + from ...configuration_utils import ConfigMixin, register_to_config from ...utils import BaseOutput, logging from ...models.cross_attention import LoRACrossAttnProcessor @@ -138,7 +154,17 @@ def attn_with_weights( class FabricPipeline(DiffusionPipeline): def __init__( self, - model_name: Optional[str] = None, + #model_name: Optional[str] = None, + + vae: AutoencoderKL, + text_encoder: CLIPTextModel, + tokenizer: CLIPTokenizer, + unet: UNet2DConditionModel, + #scheduler: KarrasDiffusionSchedulers, + safety_checker: StableDiffusionSafetyChecker, + feature_extractor: CLIPImageProcessor, + requires_safety_checker: bool = True, + stable_diffusion_version: str = "1.5", scheduler: EulerAncestralDiscreteScheduler = EulerAncestralDiscreteScheduler, lora_weights: Optional[str] = None, @@ -174,20 +200,13 @@ def __init__( pipeline=pipe, unet_path=lora_weights ) - - self.register_modules( - vae=pipe.vae, - text_encoder=pipe.text_encoder, - tokenizer=pipe.tokenizer, - unet=pipe.unet, - scheduler=scheduler, - ) + self.register_modules(unet=unet, vae=vae, text_encoder=text_encoder, tokenizer=tokenizer, scheduler=scheduler) self.pipeline = pipe - self.unet = pipe.unet - self.vae = pipe.vae - self.text_encoder = pipe.text_encoder - self.tokenizer = pipe.tokenizer + self.unet = unet + self.vae = vae + self.text_encoder = text_encoder + self.tokenizer = tokenizer self.scheduler = scheduler self.dtype = torch_dtype From 5b097843cecec23c2c6e600ebcbbc5da3425c9ca Mon Sep 17 00:00:00 2001 From: shauray8 Date: Fri, 11 Aug 2023 19:00:39 +0530 Subject: [PATCH 52/98] exec does not work --- .../pipelines/fabric/pipeline_fabric.py | 29 ++++++------------- 1 file changed, 9 insertions(+), 20 deletions(-) diff --git a/src/diffusers/pipelines/fabric/pipeline_fabric.py b/src/diffusers/pipelines/fabric/pipeline_fabric.py index 737b77f1b388..65c4d76176c4 100644 --- a/src/diffusers/pipelines/fabric/pipeline_fabric.py +++ b/src/diffusers/pipelines/fabric/pipeline_fabric.py @@ -154,17 +154,7 @@ def attn_with_weights( class FabricPipeline(DiffusionPipeline): def __init__( self, - #model_name: Optional[str] = None, - - vae: AutoencoderKL, - text_encoder: CLIPTextModel, - tokenizer: CLIPTokenizer, - unet: UNet2DConditionModel, - #scheduler: KarrasDiffusionSchedulers, - safety_checker: StableDiffusionSafetyChecker, - feature_extractor: 
CLIPImageProcessor, - requires_safety_checker: bool = True, - + model_name: Optional[str] = None, stable_diffusion_version: str = "1.5", scheduler: EulerAncestralDiscreteScheduler = EulerAncestralDiscreteScheduler, lora_weights: Optional[str] = None, @@ -192,7 +182,7 @@ def __init__( scheduler=scheduler, torch_dtype=torch_dtype, safety_checker=None, - ) + ).to("cuda") if lora_weights: print(f"Applying LoRA weights from {lora_weights}") @@ -200,13 +190,11 @@ def __init__( pipeline=pipe, unet_path=lora_weights ) - self.register_modules(unet=unet, vae=vae, text_encoder=text_encoder, tokenizer=tokenizer, scheduler=scheduler) - self.pipeline = pipe - self.unet = unet - self.vae = vae - self.text_encoder = text_encoder - self.tokenizer = tokenizer + self.unet = pipe.unet + self.vae = pipe.vae + self.text_encoder = pipe.text_encoder + self.tokenizer = pipe.tokenizer self.scheduler = scheduler self.dtype = torch_dtype @@ -397,7 +385,7 @@ def __call__( if random_seed is not None: torch.manual_seed(random_seed) - device = self._execution_device + device = torch.device("cuda") latent_noise = torch.randn(n_images, 4, 64, 64, device=device, dtype=self.dtype) @@ -507,7 +495,7 @@ def __call__( y = self.pipeline.decode_latents(latent_noise) imgs = self.pipeline.numpy_to_pil(y) - return FabricPipelineOutpur(imgs) + return FabricPipelineOutput(imgs,False) @staticmethod def image_to_tensor(image: Union[str, Image.Image]): @@ -523,3 +511,4 @@ def image_to_tensor(image: Union[str, Image.Image]): image = (image / 127.5 - 1.0).astype(np.float32) return torch.from_numpy(image).permute(2, 0, 1) + From 413d45079aee3814826bfae1f01d0115aa4fbbbe Mon Sep 17 00:00:00 2001 From: shauray8 Date: Fri, 11 Aug 2023 20:17:12 +0530 Subject: [PATCH 53/98] float --- .../pipelines/fabric/pipeline_fabric.py | 23 ++++++++++--------- 1 file changed, 12 insertions(+), 11 deletions(-) diff --git a/src/diffusers/pipelines/fabric/pipeline_fabric.py b/src/diffusers/pipelines/fabric/pipeline_fabric.py index 65c4d76176c4..c499485640a6 100644 --- a/src/diffusers/pipelines/fabric/pipeline_fabric.py +++ b/src/diffusers/pipelines/fabric/pipeline_fabric.py @@ -270,19 +270,19 @@ def unet_forward_with_cached_hidden_states( return self.unet(z_all, t, encoder_hidden_states=prompt_embd) local_pos_weights = torch.linspace( - *pos_weights, steps=len(self.unet.down_blocks) + 1)[:-1] + *pos_weights, steps=len(self.unet.down_blocks) + 1)[:-1].tolist() local_neg_weights = torch.linspace( - *neg_weights, steps=len(self.unet.down_blocks) + 1)[:-1] + *neg_weights, steps=len(self.unet.down_blocks) + 1)[:-1].tolist() - def new_forward_caching(module, hidden_states, cached_hiddens, weight, is_positive): + def new_forward_caching(module, hidden_states, cond_hiddens, cached_hiddens, d_model, weight, is_positive): cached_hs = cached_hiddens.pop(0).to( hidden_states.device ) cond_hs = torch.cat( - [hidden_states, cached_hs], dim=1 + [cond_hiddens, cached_hs], dim=1 ) - weights = weights.clone().repeat( - 1, 1 + cached_pos_hs.shape[1] // d_model + weights = weight.clone().repeat( + 1, 1 + cached_hs.shape[1] // d_model ) weights = torch.full((cond_hs.size(0), cond_hs.size(1) // hidden_states.size(1)), weight, device=hidden_states.device) @@ -324,13 +324,13 @@ def new_forward( if cached_pos_hiddens is not None: out_pos = new_forward_caching( - self, hidden_states, cached_pos_hiddens, + self, hidden_states, cond_hiddens, cached_pos_hiddens, d_model, pos_weight, is_positive=True) if cached_neg_hiddens is not None: out_neg = new_forward_caching( - self, hidden_states, 
cached_neg_hiddens, + self, hidden_states, uncond_hiddens, cached_neg_hiddens, d_model, neg_weight, is_positive=False) out = torch.cat([out_pos, out_neg], dim=0) @@ -349,12 +349,12 @@ def new_forward( return out - def preprocess_feedback_images(images, vae, device) -> torch.tensor: + def preprocess_feedback_images(self, images, vae, device) -> torch.tensor: images_t = [self.image_to_tensor(img) for img in images] images_t = torch.stack(images_t).to(device, dtype=self.dtype) latents = ( vae.config.scaling_factor - * vae.encode(iamges_t).latent_dist.sample() + * vae.encode(images_t).latent_dist.sample() ) return latents @@ -446,7 +446,7 @@ def __call__( else: z_ref_noised = self.scheduler.add_noise(z_ref, noise, t) - ref_prompt_embd = torch.cat([null_prompt_emb] * (len(posotive_latents) + len(negative_latents)), dim=0) + ref_prompt_embd = torch.cat([null_prompt_emb] * (len(positive_latents) + len(negative_latents)), dim=0) cached_hidden_states = self.get_unet_hidden_states( z_ref_noised, t, ref_prompt_embd @@ -512,3 +512,4 @@ def image_to_tensor(image: Union[str, Image.Image]): return torch.from_numpy(image).permute(2, 0, 1) + From e44e8ce1136849c5edf8c646c11e6338247359f3 Mon Sep 17 00:00:00 2001 From: shauray8 Date: Sat, 12 Aug 2023 01:22:25 +0530 Subject: [PATCH 54/98] change info --- .../pipelines/fabric/pipeline_fabric.py | 20 +++++++++---------- 1 file changed, 9 insertions(+), 11 deletions(-) diff --git a/src/diffusers/pipelines/fabric/pipeline_fabric.py b/src/diffusers/pipelines/fabric/pipeline_fabric.py index c499485640a6..c3fee2f60bca 100644 --- a/src/diffusers/pipelines/fabric/pipeline_fabric.py +++ b/src/diffusers/pipelines/fabric/pipeline_fabric.py @@ -274,22 +274,20 @@ def unet_forward_with_cached_hidden_states( local_neg_weights = torch.linspace( *neg_weights, steps=len(self.unet.down_blocks) + 1)[:-1].tolist() - def new_forward_caching(module, hidden_states, cond_hiddens, cached_hiddens, d_model, weight, is_positive): + def new_forward_caching(module, hidden_states, cond_hiddens, cached_hiddens, weight, weights): cached_hs = cached_hiddens.pop(0).to( hidden_states.device ) cond_hs = torch.cat( [cond_hiddens, cached_hs], dim=1 ) - weights = weight.clone().repeat( - 1, 1 + cached_hs.shape[1] // d_model + weights = weights.clone().repeat( + 1, 1 + cached_hs.shape[1] // hidden_states.size(1) ) - weights = torch.full((cond_hs.size(0), cond_hs.size(1) // hidden_states.size(1)), - weight, device=hidden_states.device) - weights[:, hidden_states.size(1):] = 1.0 + weights[:, hidden_states.size(1):] = weight out = attn_with_weights( self, - hidden_states, + cond_hs, encoder_hidden_states=cond_hs, weights=weights, ) @@ -324,14 +322,14 @@ def new_forward( if cached_pos_hiddens is not None: out_pos = new_forward_caching( - self, hidden_states, cond_hiddens, cached_pos_hiddens, d_model, - pos_weight, is_positive=True) + self, hidden_states, cond_hiddens, cached_pos_hiddens, pos_weight, + weights) if cached_neg_hiddens is not None: out_neg = new_forward_caching( - self, hidden_states, uncond_hiddens, cached_neg_hiddens, d_model, - neg_weight, is_positive=False) + self, hidden_states, uncond_hiddens, cached_neg_hiddens, + neg_weight, weights) out = torch.cat([out_pos, out_neg], dim=0) return out From 3e4965a176b2ce81aef71b036b783528eb746b79 Mon Sep 17 00:00:00 2001 From: shauray8 Date: Sat, 12 Aug 2023 21:15:55 +0530 Subject: [PATCH 55/98] change of architecture --- .../pipelines/fabric/pipeline_fabric.py | 77 ++++++++++--------- 1 file changed, 40 insertions(+), 37 deletions(-) diff --git 
a/src/diffusers/pipelines/fabric/pipeline_fabric.py b/src/diffusers/pipelines/fabric/pipeline_fabric.py index c3fee2f60bca..8a4f4dacf6b1 100644 --- a/src/diffusers/pipelines/fabric/pipeline_fabric.py +++ b/src/diffusers/pipelines/fabric/pipeline_fabric.py @@ -1,3 +1,5 @@ +### I'm fucking wrong you dont have to initialize and load stable diffusion ditch that +### do it with raw unet, vae and stuff ' # Copyright 2023 The HuggingFace Team. All rights reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); @@ -154,35 +156,36 @@ def attn_with_weights( class FabricPipeline(DiffusionPipeline): def __init__( self, - model_name: Optional[str] = None, - stable_diffusion_version: str = "1.5", - scheduler: EulerAncestralDiscreteScheduler = EulerAncestralDiscreteScheduler, + vae: AutoencoderKL, + text_encoder: CLIPTextModel, + tokenizer: CLIPTokenizer, + unet: UNet2DConditionModel, + scheduler: EulerAncestralDiscreteScheduler, lora_weights: Optional[str] = None, - torch_dtype = None, ): super().__init__() - if stable_diffusion_version == "2.1": - warnings.warn("StableDiffusion v2.x is not supported and may give unexpected results.") + #if stable_diffusion_version == "2.1": + # warnings.warn("StableDiffusion v2.x is not supported and may give unexpected results.") - if model_name is None: - if stable_diffusion_version == "1.5": - model_name = "runwayml/stable-diffusion-v1-5" - elif stable_diffusion_version == "2.1": - model_name = "stabilityai/stable-diffusion-2-1" - else: - raise ValueError( - f"Unknown stable diffusion version: {stable_diffusion_version}. Version must be either '1.5' or '2.1'" - ) + #if model_name is None: + # if stable_diffusion_version == "1.5": + # model_name = "runwayml/stable-diffusion-v1-5" + # elif stable_diffusion_version == "2.1": + # model_name = "stabilityai/stable-diffusion-2-1" + # else: + # raise ValueError( + # f"Unknown stable diffusion version: {stable_diffusion_version}. 
Version must be either '1.5' or '2.1'" + # ) - scheduler = EulerAncestralDiscreteScheduler.from_pretrained(model_name, subfolder="scheduler") + #scheduler = EulerAncestralDiscreteScheduler.from_pretrained(model_name, subfolder="scheduler") - pipe = StableDiffusionPipeline.from_pretrained( - model_name, - scheduler=scheduler, - torch_dtype=torch_dtype, - safety_checker=None, - ).to("cuda") + # pipe = StableDiffusionPipeline.from_pretrained( + # model_name, + # scheduler=scheduler, + # torch_dtype=torch_dtype, + # safety_checker=None, + # ).to("cuda") if lora_weights: print(f"Applying LoRA weights from {lora_weights}") @@ -190,14 +193,13 @@ def __init__( pipeline=pipe, unet_path=lora_weights ) - self.pipeline = pipe - self.unet = pipe.unet - self.vae = pipe.vae - self.text_encoder = pipe.text_encoder - self.tokenizer = pipe.tokenizer - self.scheduler = scheduler - - self.dtype = torch_dtype + self.register_modules( + unet = unet, + vae = vae, + text_encoder = text_encoder, + tokenizer = tokenizer, + scheduler = scheduler, + ) #@property #def device(self): @@ -224,6 +226,8 @@ def initialize_prompts(self, prompts: List[str], device): and self.text_encoder.config.use_attention_mask ) else None + print("Asdfsdf",attention_mask) + prompt_embd = self.text_encoder( input_ids=prompt_tokens.input_ids.to(device), attention_mask=attention_mask, @@ -287,7 +291,7 @@ def new_forward_caching(module, hidden_states, cond_hiddens, cached_hiddens, wei weights[:, hidden_states.size(1):] = weight out = attn_with_weights( self, - cond_hs, + cond_hiddens, encoder_hidden_states=cond_hs, weights=weights, ) @@ -325,7 +329,6 @@ def new_forward( self, hidden_states, cond_hiddens, cached_pos_hiddens, pos_weight, weights) - if cached_neg_hiddens is not None: out_neg = new_forward_caching( self, hidden_states, uncond_hiddens, cached_neg_hiddens, @@ -349,7 +352,7 @@ def new_forward( def preprocess_feedback_images(self, images, vae, device) -> torch.tensor: images_t = [self.image_to_tensor(img) for img in images] - images_t = torch.stack(images_t).to(device, dtype=self.dtype) + images_t = torch.stack(images_t).to(device) latents = ( vae.config.scaling_factor * vae.encode(images_t).latent_dist.sample() @@ -383,13 +386,13 @@ def __call__( if random_seed is not None: torch.manual_seed(random_seed) - device = torch.device("cuda") + device = self._execution_device - latent_noise = torch.randn(n_images, 4, 64, 64, device=device, dtype=self.dtype) + latent_noise = torch.randn(n_images, 4, 64, 64, device=device) - positive_latents = self.preprocess_feedback_images(liked,self.vae,device) if liked and len(liked)>1 else torch.tensor([], device=device, dtype=self.dtype) + positive_latents = self.preprocess_feedback_images(liked,self.vae,device) if liked and len(liked)>1 else torch.tensor([], device=device) - negative_latents = self.preprocess_feedback_images(disliked,self.vae,device) if disliked and len(disliked)>0 else torch.tensor([], device=device, dtype=self.dtype) + negative_latents = self.preprocess_feedback_images(disliked,self.vae,device) if disliked and len(disliked)>0 else torch.tensor([], device=device) if isinstance(prompt, str): prompt = [prompt] * n_images From 1ab807d02ef1d5cfc1ac3cdded2f6b475a5615c4 Mon Sep 17 00:00:00 2001 From: shauray8 Date: Sat, 12 Aug 2023 21:22:18 +0530 Subject: [PATCH 56/98] might work --- .../pipelines/fabric/pipeline_fabric.py | 19 +++++++++++++++++-- 1 file changed, 17 insertions(+), 2 deletions(-) diff --git a/src/diffusers/pipelines/fabric/pipeline_fabric.py 
b/src/diffusers/pipelines/fabric/pipeline_fabric.py index 8a4f4dacf6b1..cc0c63d68c9d 100644 --- a/src/diffusers/pipelines/fabric/pipeline_fabric.py +++ b/src/diffusers/pipelines/fabric/pipeline_fabric.py @@ -200,6 +200,8 @@ def __init__( tokenizer = tokenizer, scheduler = scheduler, ) + self.vae_scale_factor = 2 ** (len(self.vae.config.block_out_channels) - 1) + self.image_processor = VaeImageProcessor(vae_scale_factor=self.vae_scale_factor) #@property #def device(self): @@ -359,6 +361,19 @@ def preprocess_feedback_images(self, images, vae, device) -> torch.tensor: ) return latents + def decode_latents(self, latents): + warnings.warn( + "The decode_latents method is deprecated and will be removed in a future version. Plea se" + " use VaeImageProcessor instead", + FutureWarning, + ) + latents = 1 / self.vae.config.scaling_factor * latents + image = self.vae.decode(latents, return_dict=False)[0] + image = (image / 2 + 0.5).clamp(0, 1) + # we always cast to float32 as this does not cause significant overhead and is compatible with bfloat16 + image = image.cpu().permute(0, 2, 3, 1).float().numpy() + return image + @torch.no_grad() def __call__( @@ -493,8 +508,8 @@ def __call__( ): pbar.update() - y = self.pipeline.decode_latents(latent_noise) - imgs = self.pipeline.numpy_to_pil(y) + y = self.decode_latents(latent_noise) + imgs = self.image_processor.numpy_to_pil(y) return FabricPipelineOutput(imgs,False) From f603f654c551df0aef8d765d3d6ad70800e661ab Mon Sep 17 00:00:00 2001 From: shauray8 Date: Sat, 12 Aug 2023 23:13:35 +0530 Subject: [PATCH 57/98] testing with colab --- .../pipelines/fabric/pipeline_fabric.py | 144 +++++++++--------- 1 file changed, 76 insertions(+), 68 deletions(-) diff --git a/src/diffusers/pipelines/fabric/pipeline_fabric.py b/src/diffusers/pipelines/fabric/pipeline_fabric.py index cc0c63d68c9d..ecec00e64759 100644 --- a/src/diffusers/pipelines/fabric/pipeline_fabric.py +++ b/src/diffusers/pipelines/fabric/pipeline_fabric.py @@ -21,6 +21,7 @@ from torch.nn import functional as F from PIL import Image import numpy as np +import warnings from tqdm import tqdm from transformers import CLIPImageProcessor, CLIPTextModel, CLIPTokenizer @@ -85,72 +86,77 @@ def apply_unet_lora_weights(pipeline, unet_path): unet.load_state_dict(model_weight, strict=False) -def attn_with_weights( - attn: nn.Module, - hidden_states, - encoder_hidden_states=None, - attention_mask=None, - weights=None, # shape: (batch_size, sequence_length) - lora_scale=1.0, -): - batch_size, sequence_length, _ = ( - hidden_states.shape - if encoder_hidden_states is None - else encoder_hidden_states.shape - ) - attention_mask = attn.prepare_attention_mask( - attention_mask, sequence_length, batch_size - ) +class CrossAttnStoreProcessor() + def __init__(self): + self.attntion_probs = None - if isinstance(attn.processor, LoRACrossAttnProcessor): - query = attn.to_q(hidden_states) + lora_scale * attn.processor.to_q_lora( - hidden_states + def __call__( + self, + attn, + hidden_states, + encoder_hidden_states=None, + attention_mask=None, + weights=None, # shape: (batch_size, sequence_length) + lora_scale=1.0, + ): + batch_size, sequence_length, _ = ( + hidden_states.shape + if encoder_hidden_states is None + else encoder_hidden_states.shape + ) + attention_mask = attn.prepare_attention_mask( + attention_mask, sequence_length, batch_size ) - else: - query = attn.to_q(hidden_states) - if encoder_hidden_states is None: - encoder_hidden_states = hidden_states - elif attn.norm_cross: - encoder_hidden_states = 
attn.norm_encoder_hidden_states(encoder_hidden_states) + if isinstance(attn.processor, LoRACrossAttnProcessor): + query = attn.to_q(hidden_states) + lora_scale * attn.processor.to_q_lora( + hidden_states + ) + else: + query = attn.to_q(hidden_states) - if isinstance(attn.processor, LoRACrossAttnProcessor): - key = attn.to_k(encoder_hidden_states) + lora_scale * attn.processor.to_k_lora( - encoder_hidden_states - ) - value = attn.to_v( - encoder_hidden_states - ) + lora_scale * attn.processor.to_v_lora(encoder_hidden_states) - else: - key = attn.to_k(encoder_hidden_states) - value = attn.to_v(encoder_hidden_states) + if encoder_hidden_states is None: + encoder_hidden_states = hidden_states + elif attn.norm_cross: + encoder_hidden_states = attn.norm_encoder_hidden_states(encoder_hidden_states) - query = attn.head_to_batch_dim(query) - key = attn.head_to_batch_dim(key) - value = attn.head_to_batch_dim(value) + if isinstance(attn.processor, LoRACrossAttnProcessor): + key = attn.to_k(encoder_hidden_states) + lora_scale * attn.processor.to_k_lora( + encoder_hidden_states + ) + value = attn.to_v( + encoder_hidden_states + ) + lora_scale * attn.processor.to_v_lora(encoder_hidden_states) + else: + key = attn.to_k(encoder_hidden_states) + value = attn.to_v(encoder_hidden_states) - attention_probs = attn.get_attention_scores(query, key, attention_mask) + query = attn.head_to_batch_dim(query) + key = attn.head_to_batch_dim(key) + value = attn.head_to_batch_dim(value) - if weights is not None: - if weights.shape[0] != 1: - weights = weights.repeat_interleave(attn.heads, dim=0) - attention_probs = attention_probs * weights[:, None] - attention_probs = attention_probs / attention_probs.sum(dim=-1, keepdim=True) + attention_probs = attn.get_attention_scores(query, key, attention_mask) - hidden_states = torch.bmm(attention_probs, value) - hidden_states = attn.batch_to_head_dim(hidden_states) + if weights is not None: + if weights.shape[0] != 1: + weights = weights.repeat_interleave(attn.heads, dim=0) + attention_probs = attention_probs * weights[:, None] + attention_probs = attention_probs / attention_probs.sum(dim=-1, keepdim=True) - # linear proj - if isinstance(attn.processor, LoRACrossAttnProcessor): - hidden_states = attn.to_out[0]( - hidden_states - ) + lora_scale * attn.processor.to_out_lora(hidden_states) - else: - hidden_states = attn.to_out[0](hidden_states) - # dropout - hidden_states = attn.to_out[1](hidden_states) + hidden_states = torch.bmm(attention_probs, value) + hidden_states = attn.batch_to_head_dim(hidden_states) + + # linear proj + if isinstance(attn.processor, LoRACrossAttnProcessor): + hidden_states = attn.to_out[0]( + hidden_states + ) + lora_scale * attn.processor.to_out_lora(hidden_states) + else: + hidden_states = attn.to_out[0](hidden_states) + # dropout + hidden_states = attn.to_out[1](hidden_states) - return hidden_states + return hidden_states class FabricPipeline(DiffusionPipeline): @@ -228,7 +234,6 @@ def initialize_prompts(self, prompts: List[str], device): and self.text_encoder.config.use_attention_mask ) else None - print("Asdfsdf",attention_mask) prompt_embd = self.text_encoder( input_ids=prompt_tokens.input_ids.to(device), @@ -291,6 +296,7 @@ def new_forward_caching(module, hidden_states, cond_hiddens, cached_hiddens, wei 1, 1 + cached_hs.shape[1] // hidden_states.size(1) ) weights[:, hidden_states.size(1):] = weight + attn_with_weights = CrossAttnStoreProcessor() out = attn_with_weights( self, cond_hiddens, @@ -352,8 +358,8 @@ def new_forward( return out - def 
preprocess_feedback_images(self, images, vae, device) -> torch.tensor: - images_t = [self.image_to_tensor(img) for img in images] + def preprocess_feedback_images(self, images, vae, device, dtype) -> torch.tensor: + images_t = [self.image_to_tensor(img,dtype) for img in images] images_t = torch.stack(images_t).to(device) latents = ( vae.config.scaling_factor @@ -362,7 +368,7 @@ def preprocess_feedback_images(self, images, vae, device) -> torch.tensor: return latents def decode_latents(self, latents): - warnings.warn( + warnings.warn( "The decode_latents method is deprecated and will be removed in a future version. Plea se" " use VaeImageProcessor instead", FutureWarning, @@ -402,12 +408,13 @@ def __call__( torch.manual_seed(random_seed) device = self._execution_device + dtype = self.text_encoder.dtype - latent_noise = torch.randn(n_images, 4, 64, 64, device=device) - - positive_latents = self.preprocess_feedback_images(liked,self.vae,device) if liked and len(liked)>1 else torch.tensor([], device=device) + latent_noise = torch.randn(n_images, 4, 64, 64, device=device, dtype=dtype) + + positive_latents = self.preprocess_feedback_images(liked,self.vae,device, dtype) if liked and len(liked)>0 else torch.tensor([], device=device, ) - negative_latents = self.preprocess_feedback_images(disliked,self.vae,device) if disliked and len(disliked)>0 else torch.tensor([], device=device) + negative_latents = self.preprocess_feedback_images(disliked,self.vae,device, dtype) if disliked and len(disliked)>0 else torch.tensor([], device=device) if isinstance(prompt, str): prompt = [prompt] * n_images @@ -458,12 +465,12 @@ def __call__( if isinstance(self.scheduler, EulerAncestralDiscreteScheduler): z_ref_noised = ( alpha_hat**0.5 * z_ref + (1 - alpha_hat) ** 0.5 * noise - ) + ).type(dtype) + print("here") else: z_ref_noised = self.scheduler.add_noise(z_ref, noise, t) ref_prompt_embd = torch.cat([null_prompt_emb] * (len(positive_latents) + len(negative_latents)), dim=0) - cached_hidden_states = self.get_unet_hidden_states( z_ref_noised, t, ref_prompt_embd ) @@ -514,7 +521,7 @@ def __call__( return FabricPipelineOutput(imgs,False) @staticmethod - def image_to_tensor(image: Union[str, Image.Image]): + def image_to_tensor(image: Union[str, Image.Image], dtype): """ Convert a PIL image to a torch tensor. 
""" @@ -525,7 +532,8 @@ def image_to_tensor(image: Union[str, Image.Image]): image = image.resize((512, 512)) image = np.array(image).astype(np.uint8) image = (image / 127.5 - 1.0).astype(np.float32) - return torch.from_numpy(image).permute(2, 0, 1) + image = torch.from_numpy(image).permute(2, 0, 1) + return image.type(dtype) From 99ab1bf88276d4847e22b0f70e48b780fbe2989a Mon Sep 17 00:00:00 2001 From: shauray8 Date: Sun, 13 Aug 2023 00:53:41 +0530 Subject: [PATCH 58/98] more attn atuff --- src/diffusers/pipelines/fabric/pipeline_fabric.py | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/src/diffusers/pipelines/fabric/pipeline_fabric.py b/src/diffusers/pipelines/fabric/pipeline_fabric.py index ecec00e64759..e0e9773c9cb2 100644 --- a/src/diffusers/pipelines/fabric/pipeline_fabric.py +++ b/src/diffusers/pipelines/fabric/pipeline_fabric.py @@ -86,7 +86,7 @@ def apply_unet_lora_weights(pipeline, unet_path): unet.load_state_dict(model_weight, strict=False) -class CrossAttnStoreProcessor() +class CrossAttnStoreProcessor(): def __init__(self): self.attntion_probs = None @@ -158,7 +158,6 @@ def __call__( return hidden_states - class FabricPipeline(DiffusionPipeline): def __init__( self, @@ -286,6 +285,7 @@ def unet_forward_with_cached_hidden_states( *neg_weights, steps=len(self.unet.down_blocks) + 1)[:-1].tolist() def new_forward_caching(module, hidden_states, cond_hiddens, cached_hiddens, weight, weights): + cached_hs = cached_hiddens.pop(0).to( hidden_states.device ) @@ -296,12 +296,12 @@ def new_forward_caching(module, hidden_states, cond_hiddens, cached_hiddens, wei 1, 1 + cached_hs.shape[1] // hidden_states.size(1) ) weights[:, hidden_states.size(1):] = weight + print(self) attn_with_weights = CrossAttnStoreProcessor() - out = attn_with_weights( - self, + self.unet.set_attn_processor(attn_with_weights) + out = self.unet( cond_hiddens, encoder_hidden_states=cond_hs, - weights=weights, ) return out @@ -537,3 +537,4 @@ def image_to_tensor(image: Union[str, Image.Image], dtype): + From 9f9810ed8aff896f77daba5814679ad8634dcfeb Mon Sep 17 00:00:00 2001 From: shauray8 Date: Tue, 15 Aug 2023 23:01:00 +0530 Subject: [PATCH 59/98] stupid additions --- .../pipelines/fabric/pipeline_fabric.py | 88 +++++++++++++------ src/test.py | 17 ++++ 2 files changed, 78 insertions(+), 27 deletions(-) create mode 100644 src/test.py diff --git a/src/diffusers/pipelines/fabric/pipeline_fabric.py b/src/diffusers/pipelines/fabric/pipeline_fabric.py index e0e9773c9cb2..47f3efede0c1 100644 --- a/src/diffusers/pipelines/fabric/pipeline_fabric.py +++ b/src/diffusers/pipelines/fabric/pipeline_fabric.py @@ -86,7 +86,7 @@ def apply_unet_lora_weights(pipeline, unet_path): unet.load_state_dict(model_weight, strict=False) -class CrossAttnStoreProcessor(): +class CrossAttnProcessor(): def __init__(self): self.attntion_probs = None @@ -284,26 +284,6 @@ def unet_forward_with_cached_hidden_states( local_neg_weights = torch.linspace( *neg_weights, steps=len(self.unet.down_blocks) + 1)[:-1].tolist() - def new_forward_caching(module, hidden_states, cond_hiddens, cached_hiddens, weight, weights): - - cached_hs = cached_hiddens.pop(0).to( - hidden_states.device - ) - cond_hs = torch.cat( - [cond_hiddens, cached_hs], dim=1 - ) - weights = weights.clone().repeat( - 1, 1 + cached_hs.shape[1] // hidden_states.size(1) - ) - weights[:, hidden_states.size(1):] = weight - print(self) - attn_with_weights = CrossAttnStoreProcessor() - self.unet.set_attn_processor(attn_with_weights) - out = self.unet( - cond_hiddens, - 
encoder_hidden_states=cond_hs, - ) - return out for block, pos_weight, neg_weight in zip( @@ -332,15 +312,69 @@ def new_forward( out_pos = self.old_forward(hidden_states) out_neg = self.old_forward(hidden_states) + def new_forward_caching(self, hidden_states, cond_hiddens, cached_hiddens, weight, weights): + + cached_hs = cached_hiddens.pop(0).to( + hidden_states.device + ) + cond_hs = torch.cat( + [cond_hiddens, cached_hs], dim=1 + ) + weights = weights.clone().repeat( + 1, 1 + cached_hs.shape[1] // hidden_states.size(1) + ) + weights[:, hidden_states.size(1):] = weight + print(self) + attn_with_weights = CrossAttnStoreProcessor() + out = self.attn_with_weights( + self, + cond_hiddens, + encoder_hidden_states=cond_hs, + weights=weights + ) + return out + if cached_pos_hiddens is not None: - out_pos = new_forward_caching( - self, hidden_states, cond_hiddens, cached_pos_hiddens, pos_weight, - weights) + cached_pos_hs = cached_pos_hiddens.pop(0).to( + hidden_states.device + ) + cond_pos_hs = torch.cat( + [cond_hiddens, cached_pos_hs], dim=1 + ) + pos_weights = weights.clone().repeat( + 1, 1 + cached_pos_hs.shape[1] // d_model + ) + pos_weights[:, d_model:] = pos_weight + attn_with_weights = CrossAttnProcessor() + out_pos = attn_with_weights( + self, + cond_hiddens, + encoder_hidden_states=cond_pos_hs, + weights=pos_weights, + ) + else: + out_pos = self.old_forward(cond_hiddens) if cached_neg_hiddens is not None: - out_neg = new_forward_caching( - self, hidden_states, uncond_hiddens, cached_neg_hiddens, - neg_weight, weights) + cached_neg_hs = cached_neg_hiddens.pop(0).to( + hidden_states.device + ) + uncond_neg_hs = torch.cat( + [uncond_hiddens, cached_neg_hs], dim=1 + ) + neg_weights = weights.clone().repeat( + 1, 1 + cached_neg_hs.shape[1] // d_model + ) + neg_weights[:, d_model:] = neg_weight + attn_with_weights = CrossAttnProcessor() + out_neg = attn_with_weights( + self, + uncond_hiddens, + encoder_hidden_states=uncond_neg_hs, + weights=neg_weights, + ) + else: + out_neg = self.old_forward(uncond_hiddens) out = torch.cat([out_pos, out_neg], dim=0) return out diff --git a/src/test.py b/src/test.py new file mode 100644 index 000000000000..67943eaec8e4 --- /dev/null +++ b/src/test.py @@ -0,0 +1,17 @@ +from diffusers import FabricPipeline +import torch + + +model_id = "dreamlike-art/dreamlike-photoreal-2.0" +pipe = FabricPipeline.from_pretrained(model_id,torch_dtype=torch.float32) +#pipe = pipe.to("cuda") +prompt = "photo, naked women fingering in her ass, no cloths, big boobs" +neg_prompt = "lowres, bad anatomy, bad hands, cropped, worst quality" +liked = ["../../transformers/src/test.jpg"] +disliked = ["../../transformers/src/test.jpg"] +image = pipe(prompt, negative_prompt = neg_prompt, liked=liked, disliked=disliked) +for i, im in enumerate(image.images): + im.save(f"{time.time()}_{i}.jpg") + + + From 4ea0d5028ebba0d1111f887cd5d64f7c8cd59572 Mon Sep 17 00:00:00 2001 From: shauray8 Date: Tue, 15 Aug 2023 23:51:57 +0530 Subject: [PATCH 60/98] documenting and testing --- .../pipelines/fabric/pipeline_fabric.py | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/src/diffusers/pipelines/fabric/pipeline_fabric.py b/src/diffusers/pipelines/fabric/pipeline_fabric.py index 47f3efede0c1..9095c72616fc 100644 --- a/src/diffusers/pipelines/fabric/pipeline_fabric.py +++ b/src/diffusers/pipelines/fabric/pipeline_fabric.py @@ -419,17 +419,17 @@ def decode_latents(self, latents): def __call__( self, prompt: Optional[Union[str, List[str]]] = "", - negative_prompt: 
Optional[Union[str, List[str]]] = "", + negative_prompt: Optional[Union[str, List[str]]] = "lowres, bad anatomy, bad hands, cropped, worst quality", liked: Optional[List[Image.Image]] = [], disliked: Optional[List[Image.Image]] = [], - random_seed: int = 42, - n_images: int = 1, - guidance_scale: float = 8.0, + random_seed: int = 37, + n_images: int = 4, + guidance_scale: float = 7.0, denoising_steps: int = 20, feedback_start_ratio: float = 0.33, feedback_end_ratio: float = 0.66, - min_weight: float = 0.1, - max_weight: float = 1.0, + min_weight: float = 0.05, + max_weight: float = .8, neg_scale: float = 0.5, pos_bottleneck_scale: float = 1.0, neg_bottleneck_scale: float = 1.0, @@ -446,9 +446,9 @@ def __call__( latent_noise = torch.randn(n_images, 4, 64, 64, device=device, dtype=dtype) - positive_latents = self.preprocess_feedback_images(liked,self.vae,device, dtype) if liked and len(liked)>0 else torch.tensor([], device=device, ) + positive_latents = self.preprocess_feedback_images(liked,self.vae,device, dtype) if liked and len(liked)>0 else torch.tensor([], device=device, dtype=dtype) - negative_latents = self.preprocess_feedback_images(disliked,self.vae,device, dtype) if disliked and len(disliked)>0 else torch.tensor([], device=device) + negative_latents = self.preprocess_feedback_images(disliked,self.vae,device, dtype) if disliked and len(disliked)>0 else torch.tensor([], device=device, dtype=dtype) if isinstance(prompt, str): prompt = [prompt] * n_images From ed90191d91e22e112d45e76bde81ebf66527a8c7 Mon Sep 17 00:00:00 2001 From: shauray8 Date: Wed, 16 Aug 2023 00:11:52 +0530 Subject: [PATCH 61/98] writing tests --- tests/pipelines/fabric/__init__.py | 0 tests/pipelines/fabric/test_fabric.py | 147 ++++++++++++++++++++++++++ 2 files changed, 147 insertions(+) create mode 100644 tests/pipelines/fabric/__init__.py create mode 100644 tests/pipelines/fabric/test_fabric.py diff --git a/tests/pipelines/fabric/__init__.py b/tests/pipelines/fabric/__init__.py new file mode 100644 index 000000000000..e69de29bb2d1 diff --git a/tests/pipelines/fabric/test_fabric.py b/tests/pipelines/fabric/test_fabric.py new file mode 100644 index 000000000000..06d9f53b6b6b --- /dev/null +++ b/tests/pipelines/fabric/test_fabric.py @@ -0,0 +1,147 @@ +# coding=utf-8 +# Copyright 2023 HuggingFace Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +import gc +import unittest + +import numpy as np +import torch + +from diffusers import AutoencoderKL, DDIMScheduler, DiTPipeline, DPMSolverMultistepScheduler, Transformer2DModel +from diffusers.utils import is_xformers_available, load_numpy, slow, torch_device +from diffusers.utils.testing_utils import enable_full_determinism, require_torch_gpu + +from ..pipeline_params import ( + CLASS_CONDITIONED_IMAGE_GENERATION_BATCH_PARAMS, + CLASS_CONDITIONED_IMAGE_GENERATION_PARAMS, +) +from ..test_pipelines_common import PipelineTesterMixin + + +enable_full_determinism() + + +class DiTPipelineFastTests(PipelineTesterMixin, unittest.TestCase): + pipeline_class = DiTPipeline + params = CLASS_CONDITIONED_IMAGE_GENERATION_PARAMS + required_optional_params = PipelineTesterMixin.required_optional_params - { + "latents", + "num_images_per_prompt", + "callback", + "callback_steps", + } + batch_params = CLASS_CONDITIONED_IMAGE_GENERATION_BATCH_PARAMS + + def get_dummy_components(self): + torch.manual_seed(0) + transformer = Transformer2DModel( + sample_size=16, + num_layers=2, + patch_size=4, + attention_head_dim=8, + num_attention_heads=2, + in_channels=4, + out_channels=8, + attention_bias=True, + activation_fn="gelu-approximate", + num_embeds_ada_norm=1000, + norm_type="ada_norm_zero", + norm_elementwise_affine=False, + ) + vae = AutoencoderKL() + scheduler = DDIMScheduler() + components = {"transformer": transformer.eval(), "vae": vae.eval(), "scheduler": scheduler} + return components + + def get_dummy_inputs(self, device, seed=0): + if str(device).startswith("mps"): + generator = torch.manual_seed(seed) + else: + generator = torch.Generator(device=device).manual_seed(seed) + inputs = { + "class_labels": [1], + "generator": generator, + "num_inference_steps": 2, + "output_type": "numpy", + } + return inputs + + def test_inference(self): + device = "cpu" + + components = self.get_dummy_components() + pipe = self.pipeline_class(**components) + pipe.to(device) + pipe.set_progress_bar_config(disable=None) + + inputs = self.get_dummy_inputs(device) + image = pipe(**inputs).images + image_slice = image[0, -3:, -3:, -1] + + self.assertEqual(image.shape, (1, 16, 16, 3)) + expected_slice = np.array([0.2946, 0.6601, 0.4329, 0.3296, 0.4144, 0.5319, 0.7273, 0.5013, 0.4457]) + max_diff = np.abs(image_slice.flatten() - expected_slice).max() + self.assertLessEqual(max_diff, 1e-3) + + def test_inference_batch_single_identical(self): + self._test_inference_batch_single_identical(relax_max_difference=True, expected_max_diff=1e-3) + + @unittest.skipIf( + torch_device != "cuda" or not is_xformers_available(), + reason="XFormers attention is only available with CUDA and `xformers` installed", + ) + def test_xformers_attention_forwardGenerator_pass(self): + self._test_xformers_attention_forwardGenerator_pass(expected_max_diff=1e-3) + + +@require_torch_gpu +@slow +class FABRICPipelineIntegrationTests(unittest.TestCase): + def tearDown(self): + super().tearDown() + gc.collect() + torch.cuda.empty_cache() + + def test_fabric(self): + generator = torch.manual_seed(0) + + pipe = DiTPipeline.from_pretrained("dreamlike-art/dreamlike-photoreal-2.0") + pipe.to("cuda") + + prompt = "white wolf holding an umbrella" + + images = pipe(prompt, random_seed=generator).images + + for word, image in zip(prompt, images): + expected_image = load_numpy( + f"https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main/dit/{word}.npy" + ) + assert np.abs((expected_image - image).max()) < 1e-2 + + def 
test_fabric_feedback(self): + generator = torch.manual_seed(0) + + pipe = DiTPipeline.from_pretrained("dreamlike-art/dreamlike-photoreal-2.0") + pipe.to("cuda") + + prompt = "white wolf holding an umbrella" + + images = pipe(prompt, random_seed=generator).images + + for word, image in zip(prompt, images): + expected_image = load_numpy( + f"https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main/dit/{word}.npy" + ) + assert np.abs((expected_image - image).max()) < 1e-2 From 7513416c2cdd5794dc360698b3407dcf9881908b Mon Sep 17 00:00:00 2001 From: shauray8 Date: Wed, 16 Aug 2023 01:49:44 +0530 Subject: [PATCH 62/98] more docs --- .../pipelines/fabric/pipeline_fabric.py | 160 +++++++++++++----- 1 file changed, 117 insertions(+), 43 deletions(-) diff --git a/src/diffusers/pipelines/fabric/pipeline_fabric.py b/src/diffusers/pipelines/fabric/pipeline_fabric.py index 9095c72616fc..7ff9213c55cb 100644 --- a/src/diffusers/pipelines/fabric/pipeline_fabric.py +++ b/src/diffusers/pipelines/fabric/pipeline_fabric.py @@ -26,11 +26,9 @@ from transformers import CLIPImageProcessor, CLIPTextModel, CLIPTokenizer -from ...configuration_utils import FrozenDict from ...image_processor import VaeImageProcessor from ...loaders import FromSingleFileMixin, LoraLoaderMixin, TextualInversionLoaderMixin from ...models import AutoencoderKL, UNet2DConditionModel -from ...schedulers import KarrasDiffusionSchedulers from ...utils import ( deprecate, is_accelerate_available, @@ -38,17 +36,16 @@ logging, randn_tensor, replace_example_docstring, + BaseOutput, ) from ...configuration_utils import ConfigMixin, register_to_config -from ...utils import BaseOutput, logging from ...models.cross_attention import LoRACrossAttnProcessor from ...models.attention import BasicTransformerBlock -from ..stable_diffusion import StableDiffusionPipeline from ...schedulers import EulerAncestralDiscreteScheduler from . import FabricPipelineOutput -from ..pipeline_utils import DiffusionPipeline, ImagePipelineOutput +from ..pipeline_utils import DiffusionPipeline logger = logging.get_logger(__name__) # pylint: disable=invalid-name @@ -159,6 +156,31 @@ def __call__( return hidden_states class FabricPipeline(DiffusionPipeline): + r""" + Pipeline for text-to-image generation using Stable Diffusion and conditioning the results + using feedback images. + + This model inherits from [`DiffusionPipeline`]. Check the superclass documentation for the generic methods the + library implements for all the pipelines (such as downloading or saving, running on a particular device, etc.) + + Args: + vae ([`AutoencoderKL`]): + Variational Auto-Encoder (VAE) Model to encode and decode images to and from latent representations. + text_encoder ([`CLIPTextModel`]): + Frozen text-encoder. Stable Diffusion uses the text portion of + [CLIP](https://huggingface.co/docs/transformers/model_doc/clip#transformers.CLIPTextModel), specifically + the [clip-vit-large-patch14](https://huggingface.co/openai/clip-vit-large-patch14) variant. + tokenizer (`CLIPTokenizer`): + Tokenizer of class + [CLIPTokenizer](https://huggingface.co/docs/transformers/v4.21.0/en/model_doc/clip#transformers.CLIPTokenizer). + unet ([`UNet2DConditionModel`]): Conditional U-Net architecture to denoise the encoded image latents. + scheduler ([`EulerAncestralDiscreteScheduler`]): + A scheduler to be used in combination with `unet` to denoise the encoded image latents. Can be one of + [`DDIMScheduler`], [`LMSDiscreteScheduler`], or [`PNDMScheduler`]. 
+        safety_checker ([`StableDiffusionSafetyChecker`]):
+            Classification module that estimates whether generated images could be considered offensive or harmful.
+            Please, refer to the [model card](https://huggingface.co/runwayml/stable-diffusion-v1-5) for details.
+    """
     def __init__(
         self,
         vae: AutoencoderKL,
@@ -166,38 +188,58 @@ def __init__(
         tokenizer: CLIPTokenizer,
         unet: UNet2DConditionModel,
         scheduler: EulerAncestralDiscreteScheduler,
-        lora_weights: Optional[str] = None,
+        safety_checker: StableDiffusionSafetyChecker,
+        requires_safety_checker:bool = True,
     ):
         super().__init__()
+        if safety_checker is None and requires_safety_checker:
+            logger.warning(
+                f"You have disabled the safety checker for {self.__class__} by passing `safety_checker=None`. Ensure"
+                " that you abide to the conditions of the Stable Diffusion license and do not expose unfiltered"
+                " results in services or applications open to the public. Both the diffusers team and Hugging Face"
+                " strongly recommend to keep the safety filter enabled in all public facing circumstances, disabling"
+                " it only for use-cases that involve analyzing network behavior or auditing its results. For more"
+                " information, please have a look at https://github.com/huggingface/diffusers/pull/254 ."
+            )
+
+        if safety_checker is not None and feature_extractor is None:
+            raise ValueError(
+                "Make sure to define a feature extractor when loading {self.__class__} if you want to use the safety"
+                " checker. If you do not want to use the safety checker, you can pass `'safety_checker=None'` instead."
+            )
 
-        #if stable_diffusion_version == "2.1":
-        #    warnings.warn("StableDiffusion v2.x is not supported and may give unexpected results.")
+        is_unet_version_less_0_9_0 = hasattr(unet.config, "_diffusers_version") and version.parse(
+            version.parse(unet.config._diffusers_version).base_version
+        ) < version.parse("0.9.0.dev0")
+        is_unet_sample_size_less_64 = hasattr(unet.config, "sample_size") and unet.config.sample_size < 64
+        if is_unet_version_less_0_9_0 and is_unet_sample_size_less_64:
+            deprecation_message = (
+                "The configuration file of the unet has set the default `sample_size` to smaller than"
+                " 64 which seems highly unlikely. If your checkpoint is a fine-tuned version of any of the"
+                " following: \n- CompVis/stable-diffusion-v1-4 \n- CompVis/stable-diffusion-v1-3 \n-"
+                " CompVis/stable-diffusion-v1-2 \n- CompVis/stable-diffusion-v1-1 \n- runwayml/stable-diffusion-v1-5"
+                " \n- runwayml/stable-diffusion-inpainting \n you should change 'sample_size' to 64 in the"
+                " configuration file. Please make sure to update the config accordingly as leaving `sample_size=32`"
+                " in the config might lead to incorrect results in future versions. If you have downloaded this"
+                " checkpoint from the Hugging Face Hub, it would be very nice if you could open a Pull request for"
+                " the `unet/config.json` file"
+            )
+            deprecate("sample_size<64", "1.0.0", deprecation_message, standard_warn=False)
+            new_config = dict(unet.config)
+            new_config["sample_size"] = 64
+            unet._internal_dict = FrozenDict(new_config)
+
         self.register_modules(
             unet = unet,
             vae = vae,
@@ -208,13 +250,6 @@ def __init__(
         self.vae_scale_factor = 2 ** (len(self.vae.config.block_out_channels) - 1)
         self.image_processor = VaeImageProcessor(vae_scale_factor=self.vae_scale_factor)
 
-        #@property
-        #def device(self):
-        #    return next(self.parameters()).device
-
-        #def to(self, device):
-        #    self.pipeline.to(device)
-        #    return super().to(device)
 
     def initialize_prompts(self, prompts: List[str], device):
         # Breaking into individual prompts feels memory efficient
@@ -420,8 +455,8 @@ def __call__(
         self,
         prompt: Optional[Union[str, List[str]]] = "",
         negative_prompt: Optional[Union[str, List[str]]] = "lowres, bad anatomy, bad hands, cropped, worst quality",
-        liked: Optional[List[Image.Image]] = [],
-        disliked: Optional[List[Image.Image]] = [],
+        liked: Optional[Union[List[str], List[Image.Image]]] = [],
+        disliked: Optional[Union[List[str], List[Image.Image]]] = [],
         random_seed: int = 37,
         n_images: int = 4,
         guidance_scale: float = 7.0,
@@ -434,11 +469,50 @@ def __call__(
         pos_bottleneck_scale: float = 1.0,
         neg_bottleneck_scale: float = 1.0,
     ):
+        r"""
+        Function invoked when calling the pipeline for generation.
+
+        Args:
+            prompt (`str` or `List[str]`, *optional*):
+                The prompt or prompts to guide the image generation. If not defined, one has to pass `prompt_embeds`.
+                instead.
+            negative_prompt (`str` or `List[str]`, *optional*):
+                The prompt or prompts not to guide the image generation. If not defined, one has to pass
+                `negative_prompt_embeds` instead. Ignored when not using guidance (i.e., ignored if `guidance_scale` is
+                less than `1`).
+            liked (`List[Image.Image]` or `List[str]`, *optional*):
+                Liked enables feedback through images, encourages images with liked features.
+            disliked (`List[Image.Image]` or `List[str]`, *optional*):
+                Disliked enables feedback through images, discourages images with disliked features.
+            random_seed (`torch.Generator` or `List[torch.Generator]` or `int`, *optional*):
+                One or a list of [torch generator(s)](https://pytorch.org/docs/stable/generated/torch.Generator.html), can be int.
+                to make generation deterministic.
+            n_images (`int`, *optional*, defaults to 1):
+                The number of images to generate per prompt.
+            guidance_scale (`float`, *optional*, defaults to 7.5):
+                Guidance scale as defined in [Classifier-Free Diffusion Guidance](https://arxiv.org/abs/2207.12598).
+                `guidance_scale` is defined as `w` of equation 2. of [Imagen
+                Paper](https://arxiv.org/pdf/2205.11487.pdf). Guidance scale is enabled by setting `guidance_scale >
+                1`. Higher guidance scale encourages to generate images that are closely linked to the text `prompt`,
+                usually at the expense of lower image quality.
+            denoising_steps (`int`, *optional*, defaults to 50):
+                The number of denoising steps. More denoising steps usually lead to a higher quality image at the
+                expense of slower inference.
+
+        Examples:
+
+        Returns:
+            [`~pipelines.stable_diffusion.StableDiffusionPipelineOutput`] or `tuple`:
+            [`~pipelines.stable_diffusion.StableDiffusionPipelineOutput`] if `return_dict` is True, otherwise a `tuple. 
+ When returning a tuple, the first element is a list with the generated images, and the second element is a + list of `bool`s denoting whether the corresponding generated image likely represents "not-safe-for-work" + (nsfw) content, according to the `safety_checker`. + """ """ Generate a trajectory of images with binary feedback. The feedback can be given as a list of liked and disliked images. """ - if random_seed is not None: + if random_seed is not None and random_seed is not torch.Generator: torch.manual_seed(random_seed) device = self._execution_device @@ -557,7 +631,7 @@ def __call__( @staticmethod def image_to_tensor(image: Union[str, Image.Image], dtype): """ - Convert a PIL image to a torch tensor. + Convert latent PIL image to a torch tensor for further processing. """ if isinstance(image, str): image = Image.open(image) From 3a7a783589ffcfd22a40d81424f15d7b785c3a78 Mon Sep 17 00:00:00 2001 From: shauray8 Date: Wed, 16 Aug 2023 01:50:56 +0530 Subject: [PATCH 63/98] tests and docs --- .../pipelines/fabric/pipeline_fabric.py | 33 ---- tests/pipelines/fabric/test_fabric.py | 163 ++++++++++++------ 2 files changed, 111 insertions(+), 85 deletions(-) diff --git a/src/diffusers/pipelines/fabric/pipeline_fabric.py b/src/diffusers/pipelines/fabric/pipeline_fabric.py index 7ff9213c55cb..afc11c70d194 100644 --- a/src/diffusers/pipelines/fabric/pipeline_fabric.py +++ b/src/diffusers/pipelines/fabric/pipeline_fabric.py @@ -50,39 +50,6 @@ logger = logging.get_logger(__name__) # pylint: disable=invalid-name -def apply_unet_lora_weights(pipeline, unet_path): - model_weight = torch.load(unet_path, map_location="cpu") - unet = pipeline.unet - lora_attn_procs = {} - lora_rank = list( - set([v.size(0) for k, v in model_weight.items() if k.endswith("down.weight")]) - ) - assert len(lora_rank) == 1 - lora_rank = lora_rank[0] - for name in unet.attn_processors.keys(): - cross_attention_dim = ( - None - if name.endswith("attn1.processor") - else unet.config.cross_attention_dim - ) - if name.startswith("mid_block"): - hidden_size = unet.config.block_out_channels[-1] - elif name.startswith("up_blocks"): - block_id = int(name[len("up_blocks.")]) - hidden_size = list(reversed(unet.config.block_out_channels))[block_id] - elif name.startswith("down_blocks"): - block_id = int(name[len("down_blocks.")]) - hidden_size = unet.config.block_out_channels[block_id] - - lora_attn_procs[name] = LoRACrossAttnProcessor( - hidden_size=hidden_size, - cross_attention_dim=cross_attention_dim, - rank=lora_rank, - ).to(pipeline.device) - unet.set_attn_processor(lora_attn_procs) - unet.load_state_dict(model_weight, strict=False) - - class CrossAttnProcessor(): def __init__(self): self.attntion_probs = None diff --git a/tests/pipelines/fabric/test_fabric.py b/tests/pipelines/fabric/test_fabric.py index 06d9f53b6b6b..0783fa5b890d 100644 --- a/tests/pipelines/fabric/test_fabric.py +++ b/tests/pipelines/fabric/test_fabric.py @@ -33,36 +33,61 @@ enable_full_determinism() -class DiTPipelineFastTests(PipelineTesterMixin, unittest.TestCase): - pipeline_class = DiTPipeline - params = CLASS_CONDITIONED_IMAGE_GENERATION_PARAMS - required_optional_params = PipelineTesterMixin.required_optional_params - { - "latents", - "num_images_per_prompt", - "callback", - "callback_steps", - } - batch_params = CLASS_CONDITIONED_IMAGE_GENERATION_BATCH_PARAMS +class FabricPipelineFastTests( + PipelineLatentTesterMixin, PipelineKarrasSchedulerTesterMixin, PipelineTesterMixin, unittest.TestCase +): + pipeline_class = FabricDiffusionPipeline + params = 
TEXT_TO_IMAGE_PARAMS + batch_params = TEXT_TO_IMAGE_BATCH_PARAMS + image_params = TEXT_TO_IMAGE_IMAGE_PARAMS + image_latents_params = TEXT_TO_IMAGE_IMAGE_PARAMS def get_dummy_components(self): torch.manual_seed(0) - transformer = Transformer2DModel( - sample_size=16, - num_layers=2, - patch_size=4, - attention_head_dim=8, - num_attention_heads=2, + unet = UNet2DConditionModel( + block_out_channels=(32, 64), + layers_per_block=2, + sample_size=32, in_channels=4, - out_channels=8, - attention_bias=True, - activation_fn="gelu-approximate", - num_embeds_ada_norm=1000, - norm_type="ada_norm_zero", - norm_elementwise_affine=False, + out_channels=4, + down_block_types=("DownBlock2D", "CrossAttnDownBlock2D"), + up_block_types=("CrossAttnUpBlock2D", "UpBlock2D"), + cross_attention_dim=32, ) - vae = AutoencoderKL() - scheduler = DDIMScheduler() - components = {"transformer": transformer.eval(), "vae": vae.eval(), "scheduler": scheduler} + scheduler = EulerAncestralDiscreteScheduler() + torch.manual_seed(0) + vae = AutoencoderKL( + block_out_channels=[32, 64], + in_channels=3, + out_channels=3, + down_block_types=["DownEncoderBlock2D", "DownEncoderBlock2D"], + up_block_types=["UpDecoderBlock2D", "UpDecoderBlock2D"], + latent_channels=4, + ) + torch.manual_seed(0) + text_encoder_config = CLIPTextConfig( + bos_token_id=0, + eos_token_id=2, + hidden_size=32, + intermediate_size=37, + layer_norm_eps=1e-05, + num_attention_heads=4, + num_hidden_layers=5, + pad_token_id=1, + vocab_size=1000, + ) + text_encoder = CLIPTextModel(text_encoder_config) + tokenizer = CLIPTokenizer.from_pretrained("hf-internal-testing/tiny-random-clip") + + components = { + "unet": unet, + "scheduler": scheduler, + "vae": vae, + "text_encoder": text_encoder, + "tokenizer": tokenizer, + "safety_checker": None, + "feature_extractor": None, + } return components def get_dummy_inputs(self, device, seed=0): @@ -71,39 +96,71 @@ def get_dummy_inputs(self, device, seed=0): else: generator = torch.Generator(device=device).manual_seed(seed) inputs = { - "class_labels": [1], - "generator": generator, - "num_inference_steps": 2, - "output_type": "numpy", + "prompt": "A painting of a squirrel eating a burger", + "random_ssed": generator, + "num_images": 1, } return inputs - def test_inference(self): - device = "cpu" + def test_stable_diffusion_ddim(self): + device = "cpu" # ensure determinism for the device-dependent torch.Generator components = self.get_dummy_components() - pipe = self.pipeline_class(**components) - pipe.to(device) - pipe.set_progress_bar_config(disable=None) + sd_pipe = FabricPipeline(**components) + sd_pipe = sd_pipe.to(torch_device) + sd_pipe.set_progress_bar_config(disable=None) inputs = self.get_dummy_inputs(device) - image = pipe(**inputs).images + output = sd_pipe(**inputs) + image = output.images + image_slice = image[0, -3:, -3:, -1] - self.assertEqual(image.shape, (1, 16, 16, 3)) - expected_slice = np.array([0.2946, 0.6601, 0.4329, 0.3296, 0.4144, 0.5319, 0.7273, 0.5013, 0.4457]) - max_diff = np.abs(image_slice.flatten() - expected_slice).max() - self.assertLessEqual(max_diff, 1e-3) + assert image.shape == (1, 64, 64, 3) + expected_slice = np.array([0.5756, 0.6118, 0.5005, 0.5041, 0.5471, 0.4726, 0.4976, 0.4865, 0.4864]) - def test_inference_batch_single_identical(self): - self._test_inference_batch_single_identical(relax_max_difference=True, expected_max_diff=1e-3) + assert np.abs(image_slice.flatten() - expected_slice).max() < 1e-2 - @unittest.skipIf( - torch_device != "cuda" or not is_xformers_available(), - 
reason="XFormers attention is only available with CUDA and `xformers` installed", - ) - def test_xformers_attention_forwardGenerator_pass(self): - self._test_xformers_attention_forwardGenerator_pass(expected_max_diff=1e-3) + + def test_stable_diffusion_negative_prompt_embeds(self): + components = self.get_dummy_components() + sd_pipe = FabricPipeline(**components) + sd_pipe = sd_pipe.to(torch_device) + sd_pipe = sd_pipe.to(torch_device) + sd_pipe.set_progress_bar_config(disable=None) + + inputs = self.get_dummy_inputs(torch_device) + negative_prompt = 3 * ["this is a negative prompt"] + inputs["negative_prompt"] = negative_prompt + inputs["prompt"] = 3 * [inputs["prompt"]] + + # forward + output = sd_pipe(**inputs) + image_slice_1 = output.images[0, -3:, -3:, -1] + + inputs = self.get_dummy_inputs(torch_device) + prompt = 3 * [inputs.pop("prompt")] + + embeds = [] + for p in [prompt, negative_prompt]: + text_inputs = sd_pipe.tokenizer( + p, + padding="max_length", + max_length=sd_pipe.tokenizer.model_max_length, + truncation=True, + return_tensors="pt", + ) + text_inputs = text_inputs["input_ids"].to(torch_device) + + embeds.append(sd_pipe.text_encoder(text_inputs)[0]) + + inputs["prompt_embeds"], inputs["negative_prompt_embeds"] = embeds + + # forward + output = sd_pipe(**inputs) + image_slice_2 = output.images[0, -3:, -3:, -1] + + assert np.abs(image_slice_1.flatten() - image_slice_2.flatten()).max() < 1e-4 @require_torch_gpu @@ -120,13 +177,13 @@ def test_fabric(self): pipe = DiTPipeline.from_pretrained("dreamlike-art/dreamlike-photoreal-2.0") pipe.to("cuda") - prompt = "white wolf holding an umbrella" + prompt = "a photograph of an astronaut riding a horse" images = pipe(prompt, random_seed=generator).images for word, image in zip(prompt, images): expected_image = load_numpy( - f"https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main/dit/{word}.npy" + f"https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main/dit/fabric_wo_feedback.npy" ) assert np.abs((expected_image - image).max()) < 1e-2 @@ -136,12 +193,14 @@ def test_fabric_feedback(self): pipe = DiTPipeline.from_pretrained("dreamlike-art/dreamlike-photoreal-2.0") pipe.to("cuda") - prompt = "white wolf holding an umbrella" - + prompt = "a photograph of an astronaut riding a horse" images = pipe(prompt, random_seed=generator).images + liked = [images] + images = pipe(prompt, random_seed=generator, liked=liked).images + for word, image in zip(prompt, images): expected_image = load_numpy( - f"https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main/dit/{word}.npy" + f"https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main/dit/fabric_w_feedback.npy" ) assert np.abs((expected_image - image).max()) < 1e-2 From 897f0679e6c24a22217e48a95bd484b85a13de3e Mon Sep 17 00:00:00 2001 From: shauray8 Date: Wed, 16 Aug 2023 01:51:14 +0530 Subject: [PATCH 64/98] remove test --- src/test.py | 17 ----------------- 1 file changed, 17 deletions(-) delete mode 100644 src/test.py diff --git a/src/test.py b/src/test.py deleted file mode 100644 index 67943eaec8e4..000000000000 --- a/src/test.py +++ /dev/null @@ -1,17 +0,0 @@ -from diffusers import FabricPipeline -import torch - - -model_id = "dreamlike-art/dreamlike-photoreal-2.0" -pipe = FabricPipeline.from_pretrained(model_id,torch_dtype=torch.float32) -#pipe = pipe.to("cuda") -prompt = "photo, naked women fingering in her ass, no cloths, big boobs" -neg_prompt = "lowres, bad anatomy, bad hands, cropped, 
worst quality"
-liked = ["../../transformers/src/test.jpg"]
-disliked = ["../../transformers/src/test.jpg"]
-image = pipe(prompt, negative_prompt = neg_prompt, liked=liked, disliked=disliked)
-for i, im in enumerate(image.images):
-    im.save(f"{time.time()}_{i}.jpg")
-
-
-

From 3bc14e3e3255b30ba7184e0a7abba37a52f39d5d Mon Sep 17 00:00:00 2001
From: shauray8
Date: Wed, 16 Aug 2023 22:42:54 +0530
Subject: [PATCH 65/98] change cross attention

---
 .../pipeline_stable_diffusion.py | 1023 +++++++----------
 1 file changed, 443 insertions(+), 580 deletions(-)

diff --git a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion.py b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion.py
index 9bc2ad57fdcc..b707a48789e8 100644
--- a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion.py
+++ b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion.py
@@ -1,3 +1,5 @@
+### NOTE: wrapping a full StableDiffusionPipeline here is unnecessary; build the
+### pipeline directly from the raw unet, vae, text encoder, tokenizer, and scheduler.
 # Copyright 2023 The HuggingFace Team. All rights reserved.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
@@ -11,20 +13,23 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
+from dataclasses import dataclass
+from typing import Any, Dict, List, Optional, Tuple, Union
+from packaging import version
 
-import inspect
+import torch
+from torch import nn
+from torch.nn import functional as F
+from PIL import Image
+import numpy as np
 import warnings
-from typing import Any, Callable, Dict, List, Optional, Union
+from tqdm import tqdm
 
-import torch
-from packaging import version
 from transformers import CLIPImageProcessor, CLIPTextModel, CLIPTokenizer
 
-from ...configuration_utils import FrozenDict
 from ...image_processor import VaeImageProcessor
 from ...loaders import FromSingleFileMixin, LoraLoaderMixin, TextualInversionLoaderMixin
 from ...models import AutoencoderKL, UNet2DConditionModel
-from ...schedulers import KarrasDiffusionSchedulers
 from ...utils import (
     deprecate,
     is_accelerate_available,
@@ -32,133 +37,129 @@
     logging,
     randn_tensor,
     replace_example_docstring,
+    BaseOutput,
 )
+
+from ...configuration_utils import ConfigMixin, register_to_config
+from ...models.attention_processor import LoRAAttnProcessor
+from ...models.attention import BasicTransformerBlock
+from ...schedulers import EulerAncestralDiscreteScheduler
+from . import FabricPipelineOutput
+
 from ..pipeline_utils import DiffusionPipeline
-from . 
import StableDiffusionPipelineOutput -from .safety_checker import StableDiffusionSafetyChecker logger = logging.get_logger(__name__) # pylint: disable=invalid-name -EXAMPLE_DOC_STRING = """ - Examples: - ```py - >>> import torch - >>> from diffusers import StableDiffusionPipeline +class CrossAttnProcessor(): + def __init__(self): + self.attntion_probs = None - >>> pipe = StableDiffusionPipeline.from_pretrained("runwayml/stable-diffusion-v1-5", torch_dtype=torch.float16) - >>> pipe = pipe.to("cuda") + def __call__( + self, + attn, + hidden_states, + encoder_hidden_states=None, + attention_mask=None, + weights=None, # shape: (batch_size, sequence_length) + lora_scale=1.0, + ): + batch_size, sequence_length, _ = ( + hidden_states.shape + if encoder_hidden_states is None + else encoder_hidden_states.shape + ) + attention_mask = attn.prepare_attention_mask( + attention_mask, sequence_length, batch_size + ) - >>> prompt = "a photo of an astronaut riding a horse on mars" - >>> image = pipe(prompt).images[0] - ``` -""" + if isinstance(attn.processor, LoRAAttnProcessor): + query = attn.to_q(hidden_states) + lora_scale * attn.processor.to_q_lora( + hidden_states + ) + else: + query = attn.to_q(hidden_states) + if encoder_hidden_states is None: + encoder_hidden_states = hidden_states + elif attn.norm_cross: + encoder_hidden_states = attn.norm_encoder_hidden_states(encoder_hidden_states) -def rescale_noise_cfg(noise_cfg, noise_pred_text, guidance_rescale=0.0): - """ - Rescale `noise_cfg` according to `guidance_rescale`. Based on findings of [Common Diffusion Noise Schedules and - Sample Steps are Flawed](https://arxiv.org/pdf/2305.08891.pdf). See Section 3.4 - """ - std_text = noise_pred_text.std(dim=list(range(1, noise_pred_text.ndim)), keepdim=True) - std_cfg = noise_cfg.std(dim=list(range(1, noise_cfg.ndim)), keepdim=True) - # rescale the results from guidance (fixes overexposure) - noise_pred_rescaled = noise_cfg * (std_text / std_cfg) - # mix with the original results from guidance by factor guidance_rescale to avoid "plain looking" images - noise_cfg = guidance_rescale * noise_pred_rescaled + (1 - guidance_rescale) * noise_cfg - return noise_cfg + if isinstance(attn.processor, LoRAAttnProcessor): + key = attn.to_k(encoder_hidden_states) + lora_scale * attn.processor.to_k_lora( + encoder_hidden_states + ) + value = attn.to_v( + encoder_hidden_states + ) + lora_scale * attn.processor.to_v_lora(encoder_hidden_states) + else: + key = attn.to_k(encoder_hidden_states) + value = attn.to_v(encoder_hidden_states) + query = attn.head_to_batch_dim(query) + key = attn.head_to_batch_dim(key) + value = attn.head_to_batch_dim(value) -class StableDiffusionPipeline(DiffusionPipeline, TextualInversionLoaderMixin, LoraLoaderMixin, FromSingleFileMixin): - r""" - Pipeline for text-to-image generation using Stable Diffusion. + attention_probs = attn.get_attention_scores(query, key, attention_mask) + + if weights is not None: + if weights.shape[0] != 1: + weights = weights.repeat_interleave(attn.heads, dim=0) + attention_probs = attention_probs * weights[:, None] + attention_probs = attention_probs / attention_probs.sum(dim=-1, keepdim=True) + + hidden_states = torch.bmm(attention_probs, value) + hidden_states = attn.batch_to_head_dim(hidden_states) - This model inherits from [`DiffusionPipeline`]. Check the superclass documentation for the generic methods - implemented for all pipelines (downloading, saving, running on a particular device, etc.). 
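# A standalone sketch of the re-weighting step inside the CrossAttnProcessor added above:
# attention rows over the concatenated keys are rescaled by per-key feedback weights and
# renormalized so they still sum to one. Shapes and the toy numbers are assumptions.
import torch

def reweight_attention(attention_probs: torch.Tensor, weights: torch.Tensor) -> torch.Tensor:
    # attention_probs: (batch * heads, query_len, key_len); weights: (batch * heads, key_len)
    attention_probs = attention_probs * weights[:, None]
    return attention_probs / attention_probs.sum(dim=-1, keepdim=True)

# toy check: 2 heads, 3 queries, 4 keys; the last two keys come from a feedback image
probs = torch.softmax(torch.randn(2, 3, 4), dim=-1)
weights = torch.tensor([[1.0, 1.0, 0.8, 0.8], [1.0, 1.0, 0.8, 0.8]])
print(reweight_attention(probs, weights).sum(dim=-1))  # each row still sums to 1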
+ # linear proj + if isinstance(attn.processor, LoRAAttnProcessor): + hidden_states = attn.to_out[0]( + hidden_states + ) + lora_scale * attn.processor.to_out_lora(hidden_states) + else: + hidden_states = attn.to_out[0](hidden_states) + # dropout + hidden_states = attn.to_out[1](hidden_states) + + return hidden_states + +class FabricPipeline(DiffusionPipeline): + r""" + Pipeline for text-to-image generation using Stable Diffusion and conditioning the results + using feedback images. - The pipeline also inherits the following loading methods: - - [`~loaders.TextualInversionLoaderMixin.load_textual_inversion`] for loading textual inversion embeddings - - [`~loaders.LoraLoaderMixin.load_lora_weights`] for loading LoRA weights - - [`~loaders.LoraLoaderMixin.save_lora_weights`] for saving LoRA weights - - [`~loaders.FromSingleFileMixin.from_single_file`] for loading `.ckpt` files + This model inherits from [`DiffusionPipeline`]. Check the superclass documentation for the generic methods the + library implements for all the pipelines (such as downloading or saving, running on a particular device, etc.) Args: vae ([`AutoencoderKL`]): - Variational Auto-Encoder (VAE) model to encode and decode images to and from latent representations. - text_encoder ([`~transformers.CLIPTextModel`]): - Frozen text-encoder ([clip-vit-large-patch14](https://huggingface.co/openai/clip-vit-large-patch14)). - tokenizer ([`~transformers.CLIPTokenizer`]): - A `CLIPTokenizer` to tokenize text. - unet ([`UNet2DConditionModel`]): - A `UNet2DConditionModel` to denoise the encoded image latents. - scheduler ([`SchedulerMixin`]): + Variational Auto-Encoder (VAE) Model to encode and decode images to and from latent representations. + text_encoder ([`CLIPTextModel`]): + Frozen text-encoder. Stable Diffusion uses the text portion of + [CLIP](https://huggingface.co/docs/transformers/model_doc/clip#transformers.CLIPTextModel), specifically + the [clip-vit-large-patch14](https://huggingface.co/openai/clip-vit-large-patch14) variant. + tokenizer (`CLIPTokenizer`): + Tokenizer of class + [CLIPTokenizer](https://huggingface.co/docs/transformers/v4.21.0/en/model_doc/clip#transformers.CLIPTokenizer). + unet ([`UNet2DConditionModel`]): Conditional U-Net architecture to denoise the encoded image latents. + scheduler ([`EulerAncestralDiscreteScheduler`]): A scheduler to be used in combination with `unet` to denoise the encoded image latents. Can be one of [`DDIMScheduler`], [`LMSDiscreteScheduler`], or [`PNDMScheduler`]. safety_checker ([`StableDiffusionSafetyChecker`]): Classification module that estimates whether generated images could be considered offensive or harmful. - Please refer to the [model card](https://huggingface.co/runwayml/stable-diffusion-v1-5) for more details - about a model's potential harms. - feature_extractor ([`~transformers.CLIPImageProcessor`]): - A `CLIPImageProcessor` to extract features from generated images; used as inputs to the `safety_checker`. + Please, refer to the [model card](https://huggingface.co/runwayml/stable-diffusion-v1-5) for details. 
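# A hedged sketch of assembling the components listed in the Args block above by hand,
# instead of wrapping a full StableDiffusionPipeline. The subfolder layout follows standard
# Stable Diffusion checkpoints; the exact FabricPipeline constructor signature (with or
# without `safety_checker`) changes across this patch series, so adjust accordingly.
from transformers import CLIPTextModel, CLIPTokenizer
from diffusers import AutoencoderKL, EulerAncestralDiscreteScheduler, UNet2DConditionModel
from diffusers import FabricPipeline

repo = "runwayml/stable-diffusion-v1-5"
vae = AutoencoderKL.from_pretrained(repo, subfolder="vae")
text_encoder = CLIPTextModel.from_pretrained(repo, subfolder="text_encoder")
tokenizer = CLIPTokenizer.from_pretrained(repo, subfolder="tokenizer")
unet = UNet2DConditionModel.from_pretrained(repo, subfolder="unet")
scheduler = EulerAncestralDiscreteScheduler.from_pretrained(repo, subfolder="scheduler")

pipe = FabricPipeline(
    vae=vae, text_encoder=text_encoder, tokenizer=tokenizer, unet=unet, scheduler=scheduler
)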
""" - _optional_components = ["safety_checker", "feature_extractor"] - def __init__( self, vae: AutoencoderKL, text_encoder: CLIPTextModel, tokenizer: CLIPTokenizer, unet: UNet2DConditionModel, - scheduler: KarrasDiffusionSchedulers, - safety_checker: StableDiffusionSafetyChecker, - feature_extractor: CLIPImageProcessor, - requires_safety_checker: bool = True, + scheduler: EulerAncestralDiscreteScheduler, + requires_safety_checker:bool = True, ): super().__init__() - if hasattr(scheduler.config, "steps_offset") and scheduler.config.steps_offset != 1: - deprecation_message = ( - f"The configuration file of this scheduler: {scheduler} is outdated. `steps_offset`" - f" should be set to 1 instead of {scheduler.config.steps_offset}. Please make sure " - "to update the config accordingly as leaving `steps_offset` might led to incorrect results" - " in future versions. If you have downloaded this checkpoint from the Hugging Face Hub," - " it would be very nice if you could open a Pull request for the `scheduler/scheduler_config.json`" - " file" - ) - deprecate("steps_offset!=1", "1.0.0", deprecation_message, standard_warn=False) - new_config = dict(scheduler.config) - new_config["steps_offset"] = 1 - scheduler._internal_dict = FrozenDict(new_config) - - if hasattr(scheduler.config, "clip_sample") and scheduler.config.clip_sample is True: - deprecation_message = ( - f"The configuration file of this scheduler: {scheduler} has not set the configuration `clip_sample`." - " `clip_sample` should be set to False in the configuration file. Please make sure to update the" - " config accordingly as not setting `clip_sample` in the config might lead to incorrect results in" - " future versions. If you have downloaded this checkpoint from the Hugging Face Hub, it would be very" - " nice if you could open a Pull request for the `scheduler/scheduler_config.json` file" - ) - deprecate("clip_sample not set", "1.0.0", deprecation_message, standard_warn=False) - new_config = dict(scheduler.config) - new_config["clip_sample"] = False - scheduler._internal_dict = FrozenDict(new_config) - - if safety_checker is None and requires_safety_checker: - logger.warning( - f"You have disabled the safety checker for {self.__class__} by passing `safety_checker=None`. Ensure" - " that you abide to the conditions of the Stable Diffusion license and do not expose unfiltered" - " results in services or applications open to the public. Both the diffusers team and Hugging Face" - " strongly recommend to keep the safety filter enabled in all public facing circumstances, disabling" - " it only for use-cases that involve analyzing network behavior or auditing its results. For more" - " information, please have a look at https://github.com/huggingface/diffusers/pull/254 ." - ) - - if safety_checker is not None and feature_extractor is None: - raise ValueError( - "Make sure to define a feature extractor when loading {self.__class__} if you want to use the safety" - " checker. If you do not want to use the safety checker, you can pass `'safety_checker=None'` instead." 
- ) - is_unet_version_less_0_9_0 = hasattr(unet.config, "_diffusers_version") and version.parse( version.parse(unet.config._diffusers_version).base_version ) < version.parse("0.9.0.dev0") @@ -175,255 +176,207 @@ def __init__( " checkpoint from the Hugging Face Hub, it would be very nice if you could open a Pull request for" " the `unet/config.json` file" ) + deprecate("sample_size<64", "1.0.0", deprecation_message, standard_warn=False) new_config = dict(unet.config) new_config["sample_size"] = 64 unet._internal_dict = FrozenDict(new_config) self.register_modules( - vae=vae, - text_encoder=text_encoder, - tokenizer=tokenizer, - unet=unet, - scheduler=scheduler, - safety_checker=safety_checker, - feature_extractor=feature_extractor, + unet = unet, + vae = vae, + text_encoder = text_encoder, + tokenizer = tokenizer, + scheduler = scheduler, ) self.vae_scale_factor = 2 ** (len(self.vae.config.block_out_channels) - 1) self.image_processor = VaeImageProcessor(vae_scale_factor=self.vae_scale_factor) - self.register_to_config(requires_safety_checker=requires_safety_checker) - def enable_vae_slicing(self): - r""" - Enable sliced VAE decoding. When this option is enabled, the VAE will split the input tensor in slices to - compute decoding in several steps. This is useful to save some memory and allow larger batch sizes. - """ - self.vae.enable_slicing() - def disable_vae_slicing(self): - r""" - Disable sliced VAE decoding. If `enable_vae_slicing` was previously enabled, this method will go back to - computing decoding in one step. - """ - self.vae.disable_slicing() + def initialize_prompts(self, prompts: List[str], device): + # Breaking into individual prompts feels memory efficient + prompt_embed_list = [] + for prompt in prompts: + prompt_tokens = self.tokenizer( + prompt, + return_tensors="pt", + max_length=self.tokenizer.model_max_length, + padding="max_length", + truncation=True, + ) - def enable_vae_tiling(self): - r""" - Enable tiled VAE decoding. When this option is enabled, the VAE will split the input tensor into tiles to - compute decoding and encoding in several steps. This is useful for saving a large amount of memory and to allow - processing larger images. - """ - self.vae.enable_tiling() + attention_mask = prompt_tokens.attention_mask.to(device) if ( + hasattr(self.text_encoder.config, "use_attention_mask") + and self.text_encoder.config.use_attention_mask + ) else None - def disable_vae_tiling(self): - r""" - Disable tiled VAE decoding. If `enable_vae_tiling` was previously enabled, this method will go back to - computing decoding in one step. - """ - self.vae.disable_tiling() - def enable_model_cpu_offload(self, gpu_id=0): - r""" - Offload all models to CPU to reduce memory usage with a low impact on performance. Moves one whole model at a - time to the GPU when its `forward` method is called, and the model remains in GPU until the next model runs. - Memory savings are lower than using `enable_sequential_cpu_offload`, but performance is much better due to the - iterative execution of the `unet`. 
- """ - if is_accelerate_available() and is_accelerate_version(">=", "0.17.0.dev0"): - from accelerate import cpu_offload_with_hook - else: - raise ImportError("`enable_model_cpu_offload` requires `accelerate v0.17.0` or higher.") + prompt_embd = self.text_encoder( + input_ids=prompt_tokens.input_ids.to(device), + attention_mask=attention_mask, + ).last_hidden_state - device = torch.device(f"cuda:{gpu_id}") + prompt_embed_list.append(prompt_embd) - if self.device.type != "cpu": - self.to("cpu", silence_dtype_warnings=True) - torch.cuda.empty_cache() # otherwise we don't see the memory savings (but they probably exist) + return torch.cat(prompt_embed_list, dim=0) - hook = None - for cpu_offloaded_model in [self.text_encoder, self.unet, self.vae]: - _, hook = cpu_offload_with_hook(cpu_offloaded_model, device, prev_module_hook=hook) + def get_unet_hidden_states(self, z_all, t, prompt_embd): + cached_hidden_states = [] + for module in self.unet.modules(): + if isinstance(module, BasicTransformerBlock): - if self.safety_checker is not None: - _, hook = cpu_offload_with_hook(self.safety_checker, device, prev_module_hook=hook) + def new_forward(self, hidden_states, *args, **kwargs): + cached_hidden_states.append(hidden_states.clone().detach().cpu()) + return self.old_forward(hidden_states, *args, **kwargs) - # We'll offload the last model manually. - self.final_offload_hook = hook + module.attn1.old_forward = module.attn1.forward + module.attn1.forward = new_forward.__get__(module.attn1) - def _encode_prompt( - self, - prompt, - device, - num_images_per_prompt, - do_classifier_free_guidance, - negative_prompt=None, - prompt_embeds: Optional[torch.FloatTensor] = None, - negative_prompt_embeds: Optional[torch.FloatTensor] = None, - lora_scale: Optional[float] = None, - ): - r""" - Encodes the prompt into text encoder hidden states. + # run forward pass to cache hidden states, output can be discarded + _ = self.unet(z_all, t, encoder_hidden_states=prompt_embd) - Args: - prompt (`str` or `List[str]`, *optional*): - prompt to be encoded - device: (`torch.device`): - torch device - num_images_per_prompt (`int`): - number of images that should be generated per prompt - do_classifier_free_guidance (`bool`): - whether to use classifier free guidance or not - negative_prompt (`str` or `List[str]`, *optional*): - The prompt or prompts not to guide the image generation. If not defined, one has to pass - `negative_prompt_embeds` instead. Ignored when not using guidance (i.e., ignored if `guidance_scale` is - less than `1`). - prompt_embeds (`torch.FloatTensor`, *optional*): - Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not - provided, text embeddings will be generated from `prompt` input argument. - negative_prompt_embeds (`torch.FloatTensor`, *optional*): - Pre-generated negative text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt - weighting. If not provided, negative_prompt_embeds will be generated from `negative_prompt` input - argument. - lora_scale (`float`, *optional*): - A lora scale that will be applied to all LoRA layers of the text encoder if LoRA layers are loaded. 
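# `get_unet_hidden_states` above records the self-attention inputs of every
# BasicTransformerBlock by temporarily monkey-patching `attn1.forward` during one U-Net
# pass. A generic, self-contained sketch of that caching trick (names are illustrative,
# not diffusers API):
import torch
from torch import nn

def capture_inputs(model: nn.Module, target_type: type, run):
    """Run `run(model)` once and collect the inputs seen by every `target_type` submodule."""
    cached, patched_modules = [], []
    for module in model.modules():
        if isinstance(module, target_type):
            original = module.forward

            def patched(hidden_states, *args, _orig=original, **kwargs):
                cached.append(hidden_states.detach().cpu().clone())
                return _orig(hidden_states, *args, **kwargs)

            module.forward = patched
            patched_modules.append((module, original))
    try:
        run(model)
    finally:
        for module, original in patched_modules:  # always restore the original forwards
            module.forward = original
    return cached

net = nn.Sequential(nn.Linear(8, 8), nn.ReLU(), nn.Linear(8, 8))
hidden = capture_inputs(net, nn.Linear, lambda m: m(torch.randn(2, 8)))
print([h.shape for h in hidden])  # inputs to both Linear layers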
- """ - # set lora scale so that monkey patched LoRA - # function of text encoder can correctly access it - if lora_scale is not None and isinstance(self, LoraLoaderMixin): - self._lora_scale = lora_scale - - if prompt is not None and isinstance(prompt, str): - batch_size = 1 - elif prompt is not None and isinstance(prompt, list): - batch_size = len(prompt) - else: - batch_size = prompt_embeds.shape[0] - - if prompt_embeds is None: - # textual inversion: procecss multi-vector tokens if necessary - if isinstance(self, TextualInversionLoaderMixin): - prompt = self.maybe_convert_prompt(prompt, self.tokenizer) - - text_inputs = self.tokenizer( - prompt, - padding="max_length", - max_length=self.tokenizer.model_max_length, - truncation=True, - return_tensors="pt", - ) - text_input_ids = text_inputs.input_ids - untruncated_ids = self.tokenizer(prompt, padding="longest", return_tensors="pt").input_ids - - if untruncated_ids.shape[-1] >= text_input_ids.shape[-1] and not torch.equal( - text_input_ids, untruncated_ids - ): - removed_text = self.tokenizer.batch_decode( - untruncated_ids[:, self.tokenizer.model_max_length - 1 : -1] - ) - logger.warning( - "The following part of your input was truncated because CLIP can only handle sequences up to" - f" {self.tokenizer.model_max_length} tokens: {removed_text}" - ) - - if hasattr(self.text_encoder.config, "use_attention_mask") and self.text_encoder.config.use_attention_mask: - attention_mask = text_inputs.attention_mask.to(device) - else: - attention_mask = None - - prompt_embeds = self.text_encoder( - text_input_ids.to(device), - attention_mask=attention_mask, - ) - prompt_embeds = prompt_embeds[0] + # restore original forward pass + for module in self.unet.modules(): + if isinstance(module, BasicTransformerBlock): + module.attn1.forward = module.attn1.old_forward + del module.attn1.old_forward - if self.text_encoder is not None: - prompt_embeds_dtype = self.text_encoder.dtype - elif self.unet is not None: - prompt_embeds_dtype = self.unet.dtype - else: - prompt_embeds_dtype = prompt_embeds.dtype - - prompt_embeds = prompt_embeds.to(dtype=prompt_embeds_dtype, device=device) - - bs_embed, seq_len, _ = prompt_embeds.shape - # duplicate text embeddings for each generation per prompt, using mps friendly method - prompt_embeds = prompt_embeds.repeat(1, num_images_per_prompt, 1) - prompt_embeds = prompt_embeds.view(bs_embed * num_images_per_prompt, seq_len, -1) - - # get unconditional embeddings for classifier free guidance - if do_classifier_free_guidance and negative_prompt_embeds is None: - uncond_tokens: List[str] - if negative_prompt is None: - uncond_tokens = [""] * batch_size - elif prompt is not None and type(prompt) is not type(negative_prompt): - raise TypeError( - f"`negative_prompt` should be the same type to `prompt`, but got {type(negative_prompt)} !=" - f" {type(prompt)}." - ) - elif isinstance(negative_prompt, str): - uncond_tokens = [negative_prompt] - elif batch_size != len(negative_prompt): - raise ValueError( - f"`negative_prompt`: {negative_prompt} has batch size {len(negative_prompt)}, but `prompt`:" - f" {prompt} has batch size {batch_size}. Please make sure that passed `negative_prompt` matches" - " the batch size of `prompt`." 
- ) - else: - uncond_tokens = negative_prompt - - # textual inversion: procecss multi-vector tokens if necessary - if isinstance(self, TextualInversionLoaderMixin): - uncond_tokens = self.maybe_convert_prompt(uncond_tokens, self.tokenizer) - - max_length = prompt_embeds.shape[1] - uncond_input = self.tokenizer( - uncond_tokens, - padding="max_length", - max_length=max_length, - truncation=True, - return_tensors="pt", - ) - - if hasattr(self.text_encoder.config, "use_attention_mask") and self.text_encoder.config.use_attention_mask: - attention_mask = uncond_input.attention_mask.to(device) - else: - attention_mask = None + return cached_hidden_states - negative_prompt_embeds = self.text_encoder( - uncond_input.input_ids.to(device), - attention_mask=attention_mask, - ) - negative_prompt_embeds = negative_prompt_embeds[0] - - if do_classifier_free_guidance: - # duplicate unconditional embeddings for each generation per prompt, using mps friendly method - seq_len = negative_prompt_embeds.shape[1] - - negative_prompt_embeds = negative_prompt_embeds.to(dtype=prompt_embeds_dtype, device=device) + def unet_forward_with_cached_hidden_states( + self, + z_all, + t, + prompt_embd, + cached_pos_hiddens: Optional[List[torch.Tensor]] = None, + cached_neg_hiddens: Optional[List[torch.Tensor]] = None, + pos_weights=(0.8, 0.8), + neg_weights=(0.5, 0.5), + ): + if cached_pos_hiddens is None and cached_neg_hiddens is None: + return self.unet(z_all, t, encoder_hidden_states=prompt_embd) - negative_prompt_embeds = negative_prompt_embeds.repeat(1, num_images_per_prompt, 1) - negative_prompt_embeds = negative_prompt_embeds.view(batch_size * num_images_per_prompt, seq_len, -1) + local_pos_weights = torch.linspace( + *pos_weights, steps=len(self.unet.down_blocks) + 1)[:-1].tolist() + local_neg_weights = torch.linspace( + *neg_weights, steps=len(self.unet.down_blocks) + 1)[:-1].tolist() - # For classifier free guidance, we need to do two forward passes. 
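# A small numeric sketch of how `unet_forward_with_cached_hidden_states` above spreads the
# feedback weights over the U-Net: a linspace over the down blocks, the bottleneck weight
# for the mid block, and the same ramp reversed for the up blocks. Four down blocks is an
# assumption that matches SD 1.x U-Nets.
import torch

def per_block_weights(weights=(0.5, 0.8), n_down_blocks=4):
    local = torch.linspace(*weights, steps=n_down_blocks + 1)[:-1].tolist()
    return local + [weights[1]] + local[::-1]

print(per_block_weights())
# roughly [0.5, 0.575, 0.65, 0.725, 0.8, 0.725, 0.65, 0.575, 0.5]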
- # Here we concatenate the unconditional and text embeddings into a single batch - # to avoid doing two forward passes - prompt_embeds = torch.cat([negative_prompt_embeds, prompt_embeds]) - return prompt_embeds - def run_safety_checker(self, image, device, dtype): - if self.safety_checker is None: - has_nsfw_concept = None - else: - if torch.is_tensor(image): - feature_extractor_input = self.image_processor.postprocess(image, output_type="pil") - else: - feature_extractor_input = self.image_processor.numpy_to_pil(image) - safety_checker_input = self.feature_extractor(feature_extractor_input, return_tensors="pt").to(device) - image, has_nsfw_concept = self.safety_checker( - images=image, clip_input=safety_checker_input.pixel_values.to(dtype) - ) - return image, has_nsfw_concept + for block, pos_weight, neg_weight in zip( + self.unet.down_blocks + [self.unet.mid_block] + self.unet.up_blocks, + local_pos_weights + [pos_weights[1]] + local_pos_weights[::-1], + local_neg_weights + [neg_weights[1]] + local_neg_weights[::-1], + ): + for module in block.modules(): + if isinstance(module, BasicTransformerBlock): + + def new_forward( + self, + hidden_states, + pos_weight=pos_weight, + neg_weight=neg_weight, + **kwargs, + ): + cond_hiddens, uncond_hiddens = hidden_states.chunk(2, dim=0) + batch_size, d_model = cond_hiddens.shape[:2] + device, dtype = hidden_states.device, hidden_states.dtype + + weights = torch.ones( + batch_size, d_model, device=device, dtype=dtype + ) + + out_pos = self.old_forward(hidden_states) + out_neg = self.old_forward(hidden_states) + + def new_forward_caching(self, hidden_states, cond_hiddens, cached_hiddens, weight, weights): + + cached_hs = cached_hiddens.pop(0).to( + hidden_states.device + ) + cond_hs = torch.cat( + [cond_hiddens, cached_hs], dim=1 + ) + weights = weights.clone().repeat( + 1, 1 + cached_hs.shape[1] // hidden_states.size(1) + ) + weights[:, hidden_states.size(1):] = weight + print(self) + attn_with_weights = CrossAttnStoreProcessor() + out = self.attn_with_weights( + self, + cond_hiddens, + encoder_hidden_states=cond_hs, + weights=weights + ) + return out + + if cached_pos_hiddens is not None: + cached_pos_hs = cached_pos_hiddens.pop(0).to( + hidden_states.device + ) + cond_pos_hs = torch.cat( + [cond_hiddens, cached_pos_hs], dim=1 + ) + pos_weights = weights.clone().repeat( + 1, 1 + cached_pos_hs.shape[1] // d_model + ) + pos_weights[:, d_model:] = pos_weight + attn_with_weights = CrossAttnProcessor() + out_pos = attn_with_weights( + self, + cond_hiddens, + encoder_hidden_states=cond_pos_hs, + weights=pos_weights, + ) + else: + out_pos = self.old_forward(cond_hiddens) + + if cached_neg_hiddens is not None: + cached_neg_hs = cached_neg_hiddens.pop(0).to( + hidden_states.device + ) + uncond_neg_hs = torch.cat( + [uncond_hiddens, cached_neg_hs], dim=1 + ) + neg_weights = weights.clone().repeat( + 1, 1 + cached_neg_hs.shape[1] // d_model + ) + neg_weights[:, d_model:] = neg_weight + attn_with_weights = CrossAttnProcessor() + out_neg = attn_with_weights( + self, + uncond_hiddens, + encoder_hidden_states=uncond_neg_hs, + weights=neg_weights, + ) + else: + out_neg = self.old_forward(uncond_hiddens) + + out = torch.cat([out_pos, out_neg], dim=0) + return out + + module.attn1.old_forward = module.attn1.forward + module.attn1.forward = new_forward.__get__(module.attn1) + + out = self.unet(z_all, t, encoder_hidden_states=prompt_embd) + + # restore original forward pass + for module in self.unet.modules(): + if isinstance(module, BasicTransformerBlock): + 
module.attn1.forward = module.attn1.old_forward + del module.attn1.old_forward + + return out + + def preprocess_feedback_images(self, images, vae, device, dtype) -> torch.tensor: + images_t = [self.image_to_tensor(img,dtype) for img in images] + images_t = torch.stack(images_t).to(device) + latents = ( + vae.config.scaling_factor + * vae.encode(images_t).latent_dist.sample() + ) + return latents def decode_latents(self, latents): warnings.warn( @@ -434,291 +387,201 @@ def decode_latents(self, latents): latents = 1 / self.vae.config.scaling_factor * latents image = self.vae.decode(latents, return_dict=False)[0] image = (image / 2 + 0.5).clamp(0, 1) - # we always cast to float32 as this does not cause significant overhead and is compatible with bfloat16 + # we always cast to float32 as this does not cause significant overhead and is compatible with bfloat16 image = image.cpu().permute(0, 2, 3, 1).float().numpy() return image - def prepare_extra_step_kwargs(self, generator, eta): - # prepare extra kwargs for the scheduler step, since not all schedulers have the same signature - # eta (η) is only used with the DDIMScheduler, it will be ignored for other schedulers. - # eta corresponds to η in DDIM paper: https://arxiv.org/abs/2010.02502 - # and should be between [0, 1] - - accepts_eta = "eta" in set(inspect.signature(self.scheduler.step).parameters.keys()) - extra_step_kwargs = {} - if accepts_eta: - extra_step_kwargs["eta"] = eta - - # check if the scheduler accepts generator - accepts_generator = "generator" in set(inspect.signature(self.scheduler.step).parameters.keys()) - if accepts_generator: - extra_step_kwargs["generator"] = generator - return extra_step_kwargs - - def check_inputs( - self, - prompt, - height, - width, - callback_steps, - negative_prompt=None, - prompt_embeds=None, - negative_prompt_embeds=None, - ): - if height % 8 != 0 or width % 8 != 0: - raise ValueError(f"`height` and `width` have to be divisible by 8 but are {height} and {width}.") - - if (callback_steps is None) or ( - callback_steps is not None and (not isinstance(callback_steps, int) or callback_steps <= 0) - ): - raise ValueError( - f"`callback_steps` has to be a positive integer but is {callback_steps} of type" - f" {type(callback_steps)}." - ) - - if prompt is not None and prompt_embeds is not None: - raise ValueError( - f"Cannot forward both `prompt`: {prompt} and `prompt_embeds`: {prompt_embeds}. Please make sure to" - " only forward one of the two." - ) - elif prompt is None and prompt_embeds is None: - raise ValueError( - "Provide either `prompt` or `prompt_embeds`. Cannot leave both `prompt` and `prompt_embeds` undefined." - ) - elif prompt is not None and (not isinstance(prompt, str) and not isinstance(prompt, list)): - raise ValueError(f"`prompt` has to be of type `str` or `list` but is {type(prompt)}") - - if negative_prompt is not None and negative_prompt_embeds is not None: - raise ValueError( - f"Cannot forward both `negative_prompt`: {negative_prompt} and `negative_prompt_embeds`:" - f" {negative_prompt_embeds}. Please make sure to only forward one of the two." - ) - - if prompt_embeds is not None and negative_prompt_embeds is not None: - if prompt_embeds.shape != negative_prompt_embeds.shape: - raise ValueError( - "`prompt_embeds` and `negative_prompt_embeds` must have the same shape when passed directly, but" - f" got: `prompt_embeds` {prompt_embeds.shape} != `negative_prompt_embeds`" - f" {negative_prompt_embeds.shape}." 
- ) - - def prepare_latents(self, batch_size, num_channels_latents, height, width, dtype, device, generator, latents=None): - shape = (batch_size, num_channels_latents, height // self.vae_scale_factor, width // self.vae_scale_factor) - if isinstance(generator, list) and len(generator) != batch_size: - raise ValueError( - f"You have passed a list of generators of length {len(generator)}, but requested an effective batch" - f" size of {batch_size}. Make sure the batch size matches the length of the generators." - ) - - if latents is None: - latents = randn_tensor(shape, generator=generator, device=device, dtype=dtype) - else: - latents = latents.to(device) - - # scale the initial noise by the standard deviation required by the scheduler - latents = latents * self.scheduler.init_noise_sigma - return latents - @torch.no_grad() - @replace_example_docstring(EXAMPLE_DOC_STRING) + def __call__( self, - prompt: Union[str, List[str]] = None, - height: Optional[int] = None, - width: Optional[int] = None, - num_inference_steps: int = 50, - guidance_scale: float = 7.5, - negative_prompt: Optional[Union[str, List[str]]] = None, - num_images_per_prompt: Optional[int] = 1, - eta: float = 0.0, - generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None, - latents: Optional[torch.FloatTensor] = None, - prompt_embeds: Optional[torch.FloatTensor] = None, - negative_prompt_embeds: Optional[torch.FloatTensor] = None, - output_type: Optional[str] = "pil", - return_dict: bool = True, - callback: Optional[Callable[[int, int, torch.FloatTensor], None]] = None, - callback_steps: int = 1, - cross_attention_kwargs: Optional[Dict[str, Any]] = None, - guidance_rescale: float = 0.0, + prompt: Optional[Union[str, List[str]]] = "", + negative_prompt: Optional[Union[str, List[str]]] = "lowres, bad anatomy, bad hands, cropped, worst quality", + liked: Optional[Union[List[str], List[Image.Image]]] = [], + disliked: Optional[Union[List[str], List[Image.Image]]] = [], + random_seed: int = 37, + n_images: int = 4, + guidance_scale: float = 7.0, + denoising_steps: int = 20, + feedback_start_ratio: float = 0.33, + feedback_end_ratio: float = 0.66, + min_weight: float = 0.05, + max_weight: float = .8, + neg_scale: float = 0.5, + pos_bottleneck_scale: float = 1.0, + neg_bottleneck_scale: float = 1.0, ): r""" - The call function to the pipeline for generation. + Function invoked when calling the pipeline for generation. Args: prompt (`str` or `List[str]`, *optional*): - The prompt or prompts to guide image generation. If not defined, you need to pass `prompt_embeds`. - height (`int`, *optional*, defaults to `self.unet.config.sample_size * self.vae_scale_factor`): - The height in pixels of the generated image. - width (`int`, *optional*, defaults to `self.unet.config.sample_size * self.vae_scale_factor`): - The width in pixels of the generated image. - num_inference_steps (`int`, *optional*, defaults to 50): - The number of denoising steps. More denoising steps usually lead to a higher quality image at the - expense of slower inference. - guidance_scale (`float`, *optional*, defaults to 7.5): - A higher guidance scale value encourages the model to generate images closely linked to the text - `prompt` at the expense of lower image quality. Guidance scale is enabled when `guidance_scale > 1`. + The prompt or prompts to guide the image generation. If not defined, one has to pass `prompt_embeds`. + instead. 
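# A plain-Python sketch of the schedule implied by `feedback_start_ratio`,
# `feedback_end_ratio`, `min_weight` and `max_weight` above: feedback attention is only
# strong during a middle band of the denoising trajectory.
def feedback_weight(step, total_steps, start_ratio=0.33, end_ratio=0.66,
                    min_weight=0.05, max_weight=0.8):
    start, end = round(total_steps * start_ratio), round(total_steps * end_ratio)
    return max_weight if start <= step <= end else min_weight

print([feedback_weight(i, 20) for i in range(20)])
# steps 0-6 and 14-19 use 0.05; steps 7-13 (the middle third) use 0.8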
negative_prompt (`str` or `List[str]`, *optional*): - The prompt or prompts to guide what to not include in image generation. If not defined, you need to - pass `negative_prompt_embeds` instead. Ignored when not using guidance (`guidance_scale < 1`). - num_images_per_prompt (`int`, *optional*, defaults to 1): + The prompt or prompts not to guide the image generation. If not defined, one has to pass + `negative_prompt_embeds` instead. Ignored when not using guidance (i.e., ignored if `guidance_scale` is + less than `1`). + liked (`List[Image.Image]` or `List[str]`, *optional*): + Liked enables feedback through images, encourages images with liked features. + disliked (`List[Image.Image]` or `List[str]`, *optional*): + Disliked enables feedback through images, discourages images with disliked features. + random_seed (`torch.Generator` or `List[torch.Generator]` or `int`, *optional*): + One or a list of [torch generator(s)](https://pytorch.org/docs/stable/generated/torch.Generator.html), can be int. + to make generation deterministic. + n_images (`int`, *optional*, defaults to 1): The number of images to generate per prompt. - eta (`float`, *optional*, defaults to 0.0): - Corresponds to parameter eta (η) from the [DDIM](https://arxiv.org/abs/2010.02502) paper. Only applies - to the [`~schedulers.DDIMScheduler`], and is ignored in other schedulers. - generator (`torch.Generator` or `List[torch.Generator]`, *optional*): - A [`torch.Generator`](https://pytorch.org/docs/stable/generated/torch.Generator.html) to make - generation deterministic. - latents (`torch.FloatTensor`, *optional*): - Pre-generated noisy latents sampled from a Gaussian distribution, to be used as inputs for image - generation. Can be used to tweak the same generation with different prompts. If not provided, a latents - tensor is generated by sampling using the supplied random `generator`. - prompt_embeds (`torch.FloatTensor`, *optional*): - Pre-generated text embeddings. Can be used to easily tweak text inputs (prompt weighting). If not - provided, text embeddings are generated from the `prompt` input argument. - negative_prompt_embeds (`torch.FloatTensor`, *optional*): - Pre-generated negative text embeddings. Can be used to easily tweak text inputs (prompt weighting). If - not provided, `negative_prompt_embeds` are generated from the `negative_prompt` input argument. - output_type (`str`, *optional*, defaults to `"pil"`): - The output format of the generated image. Choose between `PIL.Image` or `np.array`. - return_dict (`bool`, *optional*, defaults to `True`): - Whether or not to return a [`~pipelines.stable_diffusion.StableDiffusionPipelineOutput`] instead of a - plain tuple. - callback (`Callable`, *optional*): - A function that calls every `callback_steps` steps during inference. The function is called with the - following arguments: `callback(step: int, timestep: int, latents: torch.FloatTensor)`. - callback_steps (`int`, *optional*, defaults to 1): - The frequency at which the `callback` function is called. If not specified, the callback is called at - every step. - cross_attention_kwargs (`dict`, *optional*): - A kwargs dictionary that if specified is passed along to the [`AttentionProcessor`] as defined in - [`self.processor`](https://github.com/huggingface/diffusers/blob/main/src/diffusers/models/attention_processor.py). - guidance_rescale (`float`, *optional*, defaults to 0.7): - Guidance rescale factor from [Common Diffusion Noise Schedules and Sample Steps are - Flawed](https://arxiv.org/pdf/2305.08891.pdf). 
Guidance rescale factor should fix overexposure when - using zero terminal SNR. + guidance_scale (`float`, *optional*, defaults to 7.5): + Guidance scale as defined in [Classifier-Free Diffusion Guidance](https://arxiv.org/abs/2207.12598). + `guidance_scale` is defined as `w` of equation 2. of [Imagen + Paper](https://arxiv.org/pdf/2205.11487.pdf). Guidance scale is enabled by setting `guidance_scale > + 1`. Higher guidance scale encourages to generate images that are closely linked to the text `prompt`, + usually at the expense of lower image quality. + denoising_steps (`int`, *optional*, defaults to 50): + The number of denoising steps. More denoising steps usually lead to a higher quality image at the + expense of slower inference. Examples: Returns: [`~pipelines.stable_diffusion.StableDiffusionPipelineOutput`] or `tuple`: - If `return_dict` is `True`, [`~pipelines.stable_diffusion.StableDiffusionPipelineOutput`] is returned, - otherwise a `tuple` is returned where the first element is a list with the generated images and the - second element is a list of `bool`s indicating whether the corresponding generated image contains - "not-safe-for-work" (nsfw) content. + [`~pipelines.stable_diffusion.StableDiffusionPipelineOutput`] if `return_dict` is True, otherwise a `tuple. + When returning a tuple, the first element is a list with the generated images, and the second element is a + list of `bool`s denoting whether the corresponding generated image likely represents "not-safe-for-work" + (nsfw) content, according to the `safety_checker`. """ - # 0. Default height and width to unet - height = height or self.unet.config.sample_size * self.vae_scale_factor - width = width or self.unet.config.sample_size * self.vae_scale_factor + """ + Generate a trajectory of images with binary feedback. + The feedback can be given as a list of liked and disliked images. + """ + if random_seed is not None and random_seed is not torch.Generator: + torch.manual_seed(random_seed) - # 1. Check inputs. Raise error if not correct - self.check_inputs( - prompt, height, width, callback_steps, negative_prompt, prompt_embeds, negative_prompt_embeds - ) + device = self._execution_device + dtype = self.text_encoder.dtype - # 2. Define call parameters - if prompt is not None and isinstance(prompt, str): - batch_size = 1 - elif prompt is not None and isinstance(prompt, list): - batch_size = len(prompt) - else: - batch_size = prompt_embeds.shape[0] + latent_noise = torch.randn(n_images, 4, 64, 64, device=device, dtype=dtype) - device = self._execution_device - # here `guidance_scale` is defined analog to the guidance weight `w` of equation (2) - # of the Imagen paper: https://arxiv.org/pdf/2205.11487.pdf . `guidance_scale = 1` - # corresponds to doing no classifier free guidance. - do_classifier_free_guidance = guidance_scale > 1.0 - - # 3. Encode input prompt - text_encoder_lora_scale = ( - cross_attention_kwargs.get("scale", None) if cross_attention_kwargs is not None else None - ) - prompt_embeds = self._encode_prompt( - prompt, - device, - num_images_per_prompt, - do_classifier_free_guidance, - negative_prompt, - prompt_embeds=prompt_embeds, - negative_prompt_embeds=negative_prompt_embeds, - lora_scale=text_encoder_lora_scale, - ) + positive_latents = self.preprocess_feedback_images(liked,self.vae,device, dtype) if liked and len(liked)>0 else torch.tensor([], device=device, dtype=dtype) - # 4. 
Prepare timesteps - self.scheduler.set_timesteps(num_inference_steps, device=device) - timesteps = self.scheduler.timesteps + negative_latents = self.preprocess_feedback_images(disliked,self.vae,device, dtype) if disliked and len(disliked)>0 else torch.tensor([], device=device, dtype=dtype) - # 5. Prepare latent variables - num_channels_latents = self.unet.config.in_channels - latents = self.prepare_latents( - batch_size * num_images_per_prompt, - num_channels_latents, - height, - width, - prompt_embeds.dtype, - device, - generator, - latents, - ) + if isinstance(prompt, str): + prompt = [prompt] * n_images + else: + assert len(prompt) == n_images + if isinstance(negative_prompt, str): + negative_prompt = [negative_prompt] * n_images + else: + assert len(negative_prompt) == n_images - # 6. Prepare extra step kwargs. TODO: Logic should ideally just be moved out of the pipeline - extra_step_kwargs = self.prepare_extra_step_kwargs(generator, eta) - # 7. Denoising loop - num_warmup_steps = len(timesteps) - num_inference_steps * self.scheduler.order - with self.progress_bar(total=num_inference_steps) as progress_bar: - for i, t in enumerate(timesteps): - # expand the latents if we are doing classifier free guidance - latent_model_input = torch.cat([latents] * 2) if do_classifier_free_guidance else latents - latent_model_input = self.scheduler.scale_model_input(latent_model_input, t) + (cond_prompt_embs, uncond_prompt_embs, null_prompt_emb) = self.initialize_prompts(prompt + negative_prompt + [""], device).split([n_images, n_images, 1]) - # predict the noise residual - noise_pred = self.unet( - latent_model_input, - t, - encoder_hidden_states=prompt_embeds, - cross_attention_kwargs=cross_attention_kwargs, - return_dict=False, - )[0] - - # perform guidance - if do_classifier_free_guidance: - noise_pred_uncond, noise_pred_text = noise_pred.chunk(2) - noise_pred = noise_pred_uncond + guidance_scale * (noise_pred_text - noise_pred_uncond) - - if do_classifier_free_guidance and guidance_rescale > 0.0: - # Based on 3.4. 
in https://arxiv.org/pdf/2305.08891.pdf - noise_pred = rescale_noise_cfg(noise_pred, noise_pred_text, guidance_rescale=guidance_rescale) - - # compute the previous noisy sample x_t -> x_t-1 - latents = self.scheduler.step(noise_pred, t, latents, **extra_step_kwargs, return_dict=False)[0] - - # call the callback, if provided - if i == len(timesteps) - 1 or ((i + 1) > num_warmup_steps and (i + 1) % self.scheduler.order == 0): - progress_bar.update() - if callback is not None and i % callback_steps == 0: - callback(i, t, latents) - - if not output_type == "latent": - image = self.vae.decode(latents / self.vae.config.scaling_factor, return_dict=False)[0] - image, has_nsfw_concept = self.run_safety_checker(image, device, prompt_embeds.dtype) - else: - image = latents - has_nsfw_concept = None + batched_prompt_embd = torch.cat([cond_prompt_embs, uncond_prompt_embs], dim=0) - if has_nsfw_concept is None: - do_denormalize = [True] * image.shape[0] - else: - do_denormalize = [not has_nsfw for has_nsfw in has_nsfw_concept] + self.scheduler.set_timesteps(denoising_steps, device=device) + timesteps = self.scheduler.timesteps - image = self.image_processor.postprocess(image, output_type=output_type, do_denormalize=do_denormalize) + latent_noise = latent_noise * self.scheduler.init_noise_sigma - # Offload last model to CPU - if hasattr(self, "final_offload_hook") and self.final_offload_hook is not None: - self.final_offload_hook.offload() + num_warmup_steps = len(timesteps) - denoising_steps * self.scheduler.order - if not return_dict: - return (image, has_nsfw_concept) + ref_start_idx = round(len(timesteps) * feedback_start_ratio) + ref_end_idx = round(len(timesteps) * feedback_end_ratio) + + with tqdm(total=denoising_steps) as pbar: + for i, t in enumerate(timesteps): + sigma = self.scheduler.sigma_t[t] if hasattr(self.scheduler, 'sigma_t') else 0 + if hasattr(self.scheduler, "sigmas"): + sigma = self.scheduler.sigmas[i] + + alpha_hat = 1 / (sigma**2 + 1) + + z_single = self.scheduler.scale_model_input(latent_noise, t) + z_all = torch.cat([z_single] * 2, dim=0) + z_ref = torch.cat([positive_latents, negative_latents], dim=0) + + if i >= ref_start_idx and i <= ref_end_idx: + weight_factor = max_weight + else: + weight_factor = min_weight + + pos_ws = (weight_factor, weight_factor * pos_bottleneck_scale) + neg_ws = (weight_factor * neg_scale, weight_factor * neg_scale * neg_bottleneck_scale) + + if z_ref.size(0) > 0 and weight_factor > 0: + noise = torch.randn_like(z_ref) + if isinstance(self.scheduler, EulerAncestralDiscreteScheduler): + z_ref_noised = ( + alpha_hat**0.5 * z_ref + (1 - alpha_hat) ** 0.5 * noise + ).type(dtype) + print("here") + else: + z_ref_noised = self.scheduler.add_noise(z_ref, noise, t) + + ref_prompt_embd = torch.cat([null_prompt_emb] * (len(positive_latents) + len(negative_latents)), dim=0) + cached_hidden_states = self.get_unet_hidden_states( + z_ref_noised, t, ref_prompt_embd + ) + + n_pos, n_neg = positive_latents.shape[0], negative_latents.shape[0] + cached_pos_hs, cached_neg_hs = [], [] + for hs in cached_hidden_states: + cached_pos, cached_neg = hs.split([n_pos, n_neg], dim=0) + cached_pos = cached_pos.view( + 1, -1, *cached_pos.shape[2:] + ).expand(n_images, -1, -1) + cached_neg = cached_neg.view( + 1, -1, *cached_neg.shape[2:] + ).expand(n_images, -1, -1) + cached_pos_hs.append(cached_pos) + cached_neg_hs.append(cached_neg) + + if n_pos == 0: + cached_pos_hs = None + if n_neg == 0: + cached_neg_hs = None + else: + cached_pos_hs, cached_neg_hs = None, None + + unet_out = 
self.unet_forward_with_cached_hidden_states( + z_all, + t, + prompt_embd=batched_prompt_embd, + cached_pos_hiddens=cached_pos_hs, + cached_neg_hiddens=cached_neg_hs, + pos_weights=pos_ws, + neg_weights=neg_ws, + ).sample + + noise_cond, noise_uncond = unet_out.chunk(2) + guidance = noise_cond - noise_uncond + noise_pred = noise_uncond + guidance_scale * guidance + latent_noise = self.scheduler.step(noise_pred, t, latent_noise).prev_sample + + if i == len(timesteps) - 1 or ( + (i + 1) > num_warmup_steps and (i + 1) % self.scheduler.order == 0 + ): + pbar.update() + + y = self.decode_latents(latent_noise) + imgs = self.image_processor.numpy_to_pil(y) + + return FabricPipelineOutput(imgs,False) + + @staticmethod + def image_to_tensor(image: Union[str, Image.Image], dtype): + """ + Convert latent PIL image to a torch tensor for further processing. + """ + if isinstance(image, str): + image = Image.open(image) + if not image.mode == "RGB": + image = image.convert("RGB") + image = image.resize((512, 512)) + image = np.array(image).astype(np.uint8) + image = (image / 127.5 - 1.0).astype(np.float32) + image = torch.from_numpy(image).permute(2, 0, 1) + return image.type(dtype) - return StableDiffusionPipelineOutput(images=image, nsfw_content_detected=has_nsfw_concept) From dd6e9b23020eff18845deeebfc43863619ec6278 Mon Sep 17 00:00:00 2001 From: shauray8 Date: Thu, 17 Aug 2023 12:45:01 +0530 Subject: [PATCH 66/98] revert back --- .../pipeline_stable_diffusion.py | 957 ++++++++++-------- 1 file changed, 530 insertions(+), 427 deletions(-) diff --git a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion.py b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion.py index b707a48789e8..ce69b83dad68 100644 --- a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion.py +++ b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion.py @@ -1,5 +1,3 @@ -### I'm fucking wrong you dont have to initialize and load stable diffusion ditch that -### do it with raw unet, vae and stuff ' # Copyright 2023 The HuggingFace Team. All rights reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); @@ -13,23 +11,17 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
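Editor's note: the FABRIC `__call__` added in the hunks above converts feedback images to VAE latents before caching their attention states: each image is resized to 512x512, rescaled to [-1, 1], stacked, and encoded. A minimal standalone sketch of that preprocessing, assuming a list of PIL images and an already-loaded `AutoencoderKL` instance named `vae`; the helper name `encode_feedback` is hypothetical and mirrors `image_to_tensor` plus `preprocess_feedback_images` from the patch above.

```py
# Sketch only: not part of the patch. Mirrors image_to_tensor + preprocess_feedback_images.
import numpy as np
import torch
from PIL import Image


def encode_feedback(images, vae, device, dtype):
    tensors = []
    for image in images:
        image = image.convert("RGB").resize((512, 512))
        arr = np.array(image).astype(np.float32) / 127.5 - 1.0  # scale pixels to [-1, 1]
        tensors.append(torch.from_numpy(arr).permute(2, 0, 1))  # HWC -> CHW
    batch = torch.stack(tensors).to(device=device, dtype=dtype)
    # Encode to latents and apply the VAE scaling factor, as preprocess_feedback_images does.
    return vae.config.scaling_factor * vae.encode(batch).latent_dist.sample()
```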
-from dataclasses import dataclass -from typing import Any, Dict, List, Optional, Tuple, Union -from packaging import version -import torch -from torch import nn -from torch.nn import functional as F -from PIL import Image -import numpy as np -import warnings -from tqdm import tqdm +import inspect +from typing import Any, Callable, Dict, List, Optional, Union -from transformers import CLIPImageProcessor, CLIPTextModel, CLIPTokenizer +import torch +from packaging import version +from transformers import CLIPFeatureExtractor, CLIPTextModel, CLIPTokenizer -from ...image_processor import VaeImageProcessor -from ...loaders import FromSingleFileMixin, LoraLoaderMixin, TextualInversionLoaderMixin +from ...configuration_utils import FrozenDict from ...models import AutoencoderKL, UNet2DConditionModel +from ...schedulers import KarrasDiffusionSchedulers from ...utils import ( deprecate, is_accelerate_available, @@ -37,96 +29,32 @@ logging, randn_tensor, replace_example_docstring, - BaseOutput, ) - -from ...configuration_utils import ConfigMixin, register_to_config -from ...models.attention_processor import LoRAAttnProcessor -from ...models.attention import BasicTransformerBlock -from ...schedulers import EulerAncestralDiscreteScheduler -from . import FabricPipelineOutput - from ..pipeline_utils import DiffusionPipeline +from . import StableDiffusionPipelineOutput +from .safety_checker import StableDiffusionSafetyChecker logger = logging.get_logger(__name__) # pylint: disable=invalid-name -class CrossAttnProcessor(): - def __init__(self): - self.attntion_probs = None - - def __call__( - self, - attn, - hidden_states, - encoder_hidden_states=None, - attention_mask=None, - weights=None, # shape: (batch_size, sequence_length) - lora_scale=1.0, - ): - batch_size, sequence_length, _ = ( - hidden_states.shape - if encoder_hidden_states is None - else encoder_hidden_states.shape - ) - attention_mask = attn.prepare_attention_mask( - attention_mask, sequence_length, batch_size - ) - - if isinstance(attn.processor, LoRAAttnProcessor): - query = attn.to_q(hidden_states) + lora_scale * attn.processor.to_q_lora( - hidden_states - ) - else: - query = attn.to_q(hidden_states) - - if encoder_hidden_states is None: - encoder_hidden_states = hidden_states - elif attn.norm_cross: - encoder_hidden_states = attn.norm_encoder_hidden_states(encoder_hidden_states) +EXAMPLE_DOC_STRING = """ + Examples: + ```py + >>> import torch + >>> from diffusers import StableDiffusionPipeline - if isinstance(attn.processor, LoRAAttnProcessor): - key = attn.to_k(encoder_hidden_states) + lora_scale * attn.processor.to_k_lora( - encoder_hidden_states - ) - value = attn.to_v( - encoder_hidden_states - ) + lora_scale * attn.processor.to_v_lora(encoder_hidden_states) - else: - key = attn.to_k(encoder_hidden_states) - value = attn.to_v(encoder_hidden_states) - - query = attn.head_to_batch_dim(query) - key = attn.head_to_batch_dim(key) - value = attn.head_to_batch_dim(value) - - attention_probs = attn.get_attention_scores(query, key, attention_mask) + >>> pipe = StableDiffusionPipeline.from_pretrained("runwayml/stable-diffusion-v1-5", torch_dtype=torch.float16) + >>> pipe = pipe.to("cuda") - if weights is not None: - if weights.shape[0] != 1: - weights = weights.repeat_interleave(attn.heads, dim=0) - attention_probs = attention_probs * weights[:, None] - attention_probs = attention_probs / attention_probs.sum(dim=-1, keepdim=True) + >>> prompt = "a photo of an astronaut riding a horse on mars" + >>> image = pipe(prompt).images[0] + ``` +""" - 
hidden_states = torch.bmm(attention_probs, value) - hidden_states = attn.batch_to_head_dim(hidden_states) - # linear proj - if isinstance(attn.processor, LoRAAttnProcessor): - hidden_states = attn.to_out[0]( - hidden_states - ) + lora_scale * attn.processor.to_out_lora(hidden_states) - else: - hidden_states = attn.to_out[0](hidden_states) - # dropout - hidden_states = attn.to_out[1](hidden_states) - - return hidden_states - -class FabricPipeline(DiffusionPipeline): +class StableDiffusionPipeline(DiffusionPipeline): r""" - Pipeline for text-to-image generation using Stable Diffusion and conditioning the results - using feedback images. + Pipeline for text-to-image generation using Stable Diffusion. This model inherits from [`DiffusionPipeline`]. Check the superclass documentation for the generic methods the library implements for all the pipelines (such as downloading or saving, running on a particular device, etc.) @@ -142,24 +70,73 @@ class FabricPipeline(DiffusionPipeline): Tokenizer of class [CLIPTokenizer](https://huggingface.co/docs/transformers/v4.21.0/en/model_doc/clip#transformers.CLIPTokenizer). unet ([`UNet2DConditionModel`]): Conditional U-Net architecture to denoise the encoded image latents. - scheduler ([`EulerAncestralDiscreteScheduler`]): + scheduler ([`SchedulerMixin`]): A scheduler to be used in combination with `unet` to denoise the encoded image latents. Can be one of [`DDIMScheduler`], [`LMSDiscreteScheduler`], or [`PNDMScheduler`]. safety_checker ([`StableDiffusionSafetyChecker`]): Classification module that estimates whether generated images could be considered offensive or harmful. Please, refer to the [model card](https://huggingface.co/runwayml/stable-diffusion-v1-5) for details. + feature_extractor ([`CLIPFeatureExtractor`]): + Model that extracts features from generated images to be used as inputs for the `safety_checker`. """ + _optional_components = ["safety_checker", "feature_extractor"] + def __init__( self, vae: AutoencoderKL, text_encoder: CLIPTextModel, tokenizer: CLIPTokenizer, unet: UNet2DConditionModel, - scheduler: EulerAncestralDiscreteScheduler, - requires_safety_checker:bool = True, + scheduler: KarrasDiffusionSchedulers, + safety_checker: StableDiffusionSafetyChecker, + feature_extractor: CLIPFeatureExtractor, + requires_safety_checker: bool = True, ): super().__init__() + if hasattr(scheduler.config, "steps_offset") and scheduler.config.steps_offset != 1: + deprecation_message = ( + f"The configuration file of this scheduler: {scheduler} is outdated. `steps_offset`" + f" should be set to 1 instead of {scheduler.config.steps_offset}. Please make sure " + "to update the config accordingly as leaving `steps_offset` might led to incorrect results" + " in future versions. If you have downloaded this checkpoint from the Hugging Face Hub," + " it would be very nice if you could open a Pull request for the `scheduler/scheduler_config.json`" + " file" + ) + deprecate("steps_offset!=1", "1.0.0", deprecation_message, standard_warn=False) + new_config = dict(scheduler.config) + new_config["steps_offset"] = 1 + scheduler._internal_dict = FrozenDict(new_config) + + if hasattr(scheduler.config, "clip_sample") and scheduler.config.clip_sample is True: + deprecation_message = ( + f"The configuration file of this scheduler: {scheduler} has not set the configuration `clip_sample`." + " `clip_sample` should be set to False in the configuration file. 
Please make sure to update the" + " config accordingly as not setting `clip_sample` in the config might lead to incorrect results in" + " future versions. If you have downloaded this checkpoint from the Hugging Face Hub, it would be very" + " nice if you could open a Pull request for the `scheduler/scheduler_config.json` file" + ) + deprecate("clip_sample not set", "1.0.0", deprecation_message, standard_warn=False) + new_config = dict(scheduler.config) + new_config["clip_sample"] = False + scheduler._internal_dict = FrozenDict(new_config) + + if safety_checker is None and requires_safety_checker: + logger.warning( + f"You have disabled the safety checker for {self.__class__} by passing `safety_checker=None`. Ensure" + " that you abide to the conditions of the Stable Diffusion license and do not expose unfiltered" + " results in services or applications open to the public. Both the diffusers team and Hugging Face" + " strongly recommend to keep the safety filter enabled in all public facing circumstances, disabling" + " it only for use-cases that involve analyzing network behavior or auditing its results. For more" + " information, please have a look at https://github.com/huggingface/diffusers/pull/254 ." + ) + + if safety_checker is not None and feature_extractor is None: + raise ValueError( + "Make sure to define a feature extractor when loading {self.__class__} if you want to use the safety" + " checker. If you do not want to use the safety checker, you can pass `'safety_checker=None'` instead." + ) + is_unet_version_less_0_9_0 = hasattr(unet.config, "_diffusers_version") and version.parse( version.parse(unet.config._diffusers_version).base_version ) < version.parse("0.9.0.dev0") @@ -176,240 +153,360 @@ def __init__( " checkpoint from the Hugging Face Hub, it would be very nice if you could open a Pull request for" " the `unet/config.json` file" ) - deprecate("sample_size<64", "1.0.0", deprecation_message, standard_warn=False) new_config = dict(unet.config) new_config["sample_size"] = 64 unet._internal_dict = FrozenDict(new_config) self.register_modules( - unet = unet, - vae = vae, - text_encoder = text_encoder, - tokenizer = tokenizer, - scheduler = scheduler, + vae=vae, + text_encoder=text_encoder, + tokenizer=tokenizer, + unet=unet, + scheduler=scheduler, + safety_checker=safety_checker, + feature_extractor=feature_extractor, ) self.vae_scale_factor = 2 ** (len(self.vae.config.block_out_channels) - 1) - self.image_processor = VaeImageProcessor(vae_scale_factor=self.vae_scale_factor) + self.register_to_config(requires_safety_checker=requires_safety_checker) + def enable_vae_slicing(self): + r""" + Enable sliced VAE decoding. - def initialize_prompts(self, prompts: List[str], device): - # Breaking into individual prompts feels memory efficient - prompt_embed_list = [] - for prompt in prompts: - prompt_tokens = self.tokenizer( - prompt, - return_tensors="pt", - max_length=self.tokenizer.model_max_length, - padding="max_length", - truncation=True, - ) - - attention_mask = prompt_tokens.attention_mask.to(device) if ( - hasattr(self.text_encoder.config, "use_attention_mask") - and self.text_encoder.config.use_attention_mask - ) else None + When this option is enabled, the VAE will split the input tensor in slices to compute decoding in several + steps. This is useful to save some memory and allow larger batch sizes. + """ + self.vae.enable_slicing() + def disable_vae_slicing(self): + r""" + Disable sliced VAE decoding. 
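Editor's note: a minimal usage sketch for the VAE slicing toggles documented above, assuming the `runwayml/stable-diffusion-v1-5` checkpoint referenced elsewhere in this file and a CUDA device; slicing lowers peak memory when decoding several images at once, at a small speed cost.

```py
import torch
from diffusers import StableDiffusionPipeline

pipe = StableDiffusionPipeline.from_pretrained(
    "runwayml/stable-diffusion-v1-5", torch_dtype=torch.float16
).to("cuda")

pipe.enable_vae_slicing()  # decode latents slice by slice to reduce peak memory
images = pipe(["a photo of an astronaut riding a horse on mars"] * 4).images
pipe.disable_vae_slicing()  # restore single-pass decoding
```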
If `enable_vae_slicing` was previously invoked, this method will go back to + computing decoding in one step. + """ + self.vae.disable_slicing() - prompt_embd = self.text_encoder( - input_ids=prompt_tokens.input_ids.to(device), - attention_mask=attention_mask, - ).last_hidden_state + def enable_sequential_cpu_offload(self, gpu_id=0): + r""" + Offloads all models to CPU using accelerate, significantly reducing memory usage. When called, unet, + text_encoder, vae and safety checker have their state dicts saved to CPU and then are moved to a + `torch.device('meta') and loaded to GPU only when their specific submodule has its `forward` method called. + Note that offloading happens on a submodule basis. Memory savings are higher than with + `enable_model_cpu_offload`, but performance is lower. + """ + if is_accelerate_available() and is_accelerate_version(">=", "0.14.0"): + from accelerate import cpu_offload + else: + raise ImportError("`enable_sequential_cpu_offload` requires `accelerate v0.14.0` or higher") - prompt_embed_list.append(prompt_embd) + device = torch.device(f"cuda:{gpu_id}") - return torch.cat(prompt_embed_list, dim=0) + for cpu_offloaded_model in [self.unet, self.text_encoder, self.vae]: + cpu_offload(cpu_offloaded_model, device) - def get_unet_hidden_states(self, z_all, t, prompt_embd): - cached_hidden_states = [] - for module in self.unet.modules(): - if isinstance(module, BasicTransformerBlock): + if self.safety_checker is not None: + cpu_offload(self.safety_checker, execution_device=device, offload_buffers=True) - def new_forward(self, hidden_states, *args, **kwargs): - cached_hidden_states.append(hidden_states.clone().detach().cpu()) - return self.old_forward(hidden_states, *args, **kwargs) + def enable_model_cpu_offload(self, gpu_id=0): + r""" + Offloads all models to CPU using accelerate, reducing memory usage with a low impact on performance. Compared + to `enable_sequential_cpu_offload`, this method moves one whole model at a time to the GPU when its `forward` + method is called, and the model remains in GPU until the next model runs. Memory savings are lower than with + `enable_sequential_cpu_offload`, but performance is much better due to the iterative execution of the `unet`. + """ + if is_accelerate_available() and is_accelerate_version(">=", "0.17.0.dev0"): + from accelerate import cpu_offload_with_hook + else: + raise ImportError("`enable_model_offload` requires `accelerate v0.17.0` or higher.") - module.attn1.old_forward = module.attn1.forward - module.attn1.forward = new_forward.__get__(module.attn1) + device = torch.device(f"cuda:{gpu_id}") - # run forward pass to cache hidden states, output can be discarded - _ = self.unet(z_all, t, encoder_hidden_states=prompt_embd) + hook = None + for cpu_offloaded_model in [self.text_encoder, self.unet, self.vae]: + _, hook = cpu_offload_with_hook(cpu_offloaded_model, device, prev_module_hook=hook) - # restore original forward pass - for module in self.unet.modules(): - if isinstance(module, BasicTransformerBlock): - module.attn1.forward = module.attn1.old_forward - del module.attn1.old_forward + if self.safety_checker is not None: + _, hook = cpu_offload_with_hook(self.safety_checker, device, prev_module_hook=hook) - return cached_hidden_states + # We'll offload the last model manually. + self.final_offload_hook = hook - def unet_forward_with_cached_hidden_states( + @property + def _execution_device(self): + r""" + Returns the device on which the pipeline's models will be executed. 
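Editor's note: the two offloading helpers above trade memory for speed in different ways: sequential offload moves individual submodules to the GPU only for their forward pass, while model offload keeps one whole model resident at a time. A minimal usage sketch, assuming a CUDA device and an `accelerate` version that satisfies the checks above.

```py
import torch
from diffusers import StableDiffusionPipeline

pipe = StableDiffusionPipeline.from_pretrained(
    "runwayml/stable-diffusion-v1-5", torch_dtype=torch.float16
)

# Lowest memory, slowest: submodules are moved to the GPU only while they run.
# pipe.enable_sequential_cpu_offload()

# Better speed/memory balance: one whole model at a time stays on the GPU.
pipe.enable_model_cpu_offload()

image = pipe("a photo of an astronaut riding a horse on mars").images[0]
```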
After calling + `pipeline.enable_sequential_cpu_offload()` the execution device can only be inferred from Accelerate's module + hooks. + """ + if not hasattr(self.unet, "_hf_hook"): + return self.device + for module in self.unet.modules(): + if ( + hasattr(module, "_hf_hook") + and hasattr(module._hf_hook, "execution_device") + and module._hf_hook.execution_device is not None + ): + return torch.device(module._hf_hook.execution_device) + return self.device + + def _encode_prompt( self, - z_all, - t, - prompt_embd, - cached_pos_hiddens: Optional[List[torch.Tensor]] = None, - cached_neg_hiddens: Optional[List[torch.Tensor]] = None, - pos_weights=(0.8, 0.8), - neg_weights=(0.5, 0.5), + prompt, + device, + num_images_per_prompt, + do_classifier_free_guidance, + negative_prompt=None, + prompt_embeds: Optional[torch.FloatTensor] = None, + negative_prompt_embeds: Optional[torch.FloatTensor] = None, ): - if cached_pos_hiddens is None and cached_neg_hiddens is None: - return self.unet(z_all, t, encoder_hidden_states=prompt_embd) + r""" + Encodes the prompt into text encoder hidden states. - local_pos_weights = torch.linspace( - *pos_weights, steps=len(self.unet.down_blocks) + 1)[:-1].tolist() - local_neg_weights = torch.linspace( - *neg_weights, steps=len(self.unet.down_blocks) + 1)[:-1].tolist() + Args: + prompt (`str` or `List[str]`, *optional*): + prompt to be encoded + device: (`torch.device`): + torch device + num_images_per_prompt (`int`): + number of images that should be generated per prompt + do_classifier_free_guidance (`bool`): + whether to use classifier free guidance or not + negative_prompt (`str` or `List[str]`, *optional*): + The prompt or prompts not to guide the image generation. If not defined, one has to pass + `negative_prompt_embeds`. instead. If not defined, one has to pass `negative_prompt_embeds`. instead. + Ignored when not using guidance (i.e., ignored if `guidance_scale` is less than `1`). + prompt_embeds (`torch.FloatTensor`, *optional*): + Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not + provided, text embeddings will be generated from `prompt` input argument. + negative_prompt_embeds (`torch.FloatTensor`, *optional*): + Pre-generated negative text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt + weighting. If not provided, negative_prompt_embeds will be generated from `negative_prompt` input + argument. 
+ """ + if prompt is not None and isinstance(prompt, str): + batch_size = 1 + elif prompt is not None and isinstance(prompt, list): + batch_size = len(prompt) + else: + batch_size = prompt_embeds.shape[0] + + if prompt_embeds is None: + text_inputs = self.tokenizer( + prompt, + padding="max_length", + max_length=self.tokenizer.model_max_length, + truncation=True, + return_tensors="pt", + ) + text_input_ids = text_inputs.input_ids + untruncated_ids = self.tokenizer(prompt, padding="longest", return_tensors="pt").input_ids + + if untruncated_ids.shape[-1] >= text_input_ids.shape[-1] and not torch.equal( + text_input_ids, untruncated_ids + ): + removed_text = self.tokenizer.batch_decode( + untruncated_ids[:, self.tokenizer.model_max_length - 1 : -1] + ) + logger.warning( + "The following part of your input was truncated because CLIP can only handle sequences up to" + f" {self.tokenizer.model_max_length} tokens: {removed_text}" + ) + + if hasattr(self.text_encoder.config, "use_attention_mask") and self.text_encoder.config.use_attention_mask: + attention_mask = text_inputs.attention_mask.to(device) + else: + attention_mask = None + + prompt_embeds = self.text_encoder( + text_input_ids.to(device), + attention_mask=attention_mask, + ) + prompt_embeds = prompt_embeds[0] + + prompt_embeds = prompt_embeds.to(dtype=self.text_encoder.dtype, device=device) + + bs_embed, seq_len, _ = prompt_embeds.shape + # duplicate text embeddings for each generation per prompt, using mps friendly method + prompt_embeds = prompt_embeds.repeat(1, num_images_per_prompt, 1) + prompt_embeds = prompt_embeds.view(bs_embed * num_images_per_prompt, seq_len, -1) + + # get unconditional embeddings for classifier free guidance + if do_classifier_free_guidance and negative_prompt_embeds is None: + uncond_tokens: List[str] + if negative_prompt is None: + uncond_tokens = [""] * batch_size + elif type(prompt) is not type(negative_prompt): + raise TypeError( + f"`negative_prompt` should be the same type to `prompt`, but got {type(negative_prompt)} !=" + f" {type(prompt)}." + ) + elif isinstance(negative_prompt, str): + uncond_tokens = [negative_prompt] + elif batch_size != len(negative_prompt): + raise ValueError( + f"`negative_prompt`: {negative_prompt} has batch size {len(negative_prompt)}, but `prompt`:" + f" {prompt} has batch size {batch_size}. Please make sure that passed `negative_prompt` matches" + " the batch size of `prompt`." 
+ ) + else: + uncond_tokens = negative_prompt + + max_length = prompt_embeds.shape[1] + uncond_input = self.tokenizer( + uncond_tokens, + padding="max_length", + max_length=max_length, + truncation=True, + return_tensors="pt", + ) + if hasattr(self.text_encoder.config, "use_attention_mask") and self.text_encoder.config.use_attention_mask: + attention_mask = uncond_input.attention_mask.to(device) + else: + attention_mask = None + negative_prompt_embeds = self.text_encoder( + uncond_input.input_ids.to(device), + attention_mask=attention_mask, + ) + negative_prompt_embeds = negative_prompt_embeds[0] - for block, pos_weight, neg_weight in zip( - self.unet.down_blocks + [self.unet.mid_block] + self.unet.up_blocks, - local_pos_weights + [pos_weights[1]] + local_pos_weights[::-1], - local_neg_weights + [neg_weights[1]] + local_neg_weights[::-1], - ): - for module in block.modules(): - if isinstance(module, BasicTransformerBlock): - - def new_forward( - self, - hidden_states, - pos_weight=pos_weight, - neg_weight=neg_weight, - **kwargs, - ): - cond_hiddens, uncond_hiddens = hidden_states.chunk(2, dim=0) - batch_size, d_model = cond_hiddens.shape[:2] - device, dtype = hidden_states.device, hidden_states.dtype - - weights = torch.ones( - batch_size, d_model, device=device, dtype=dtype - ) - - out_pos = self.old_forward(hidden_states) - out_neg = self.old_forward(hidden_states) - - def new_forward_caching(self, hidden_states, cond_hiddens, cached_hiddens, weight, weights): - - cached_hs = cached_hiddens.pop(0).to( - hidden_states.device - ) - cond_hs = torch.cat( - [cond_hiddens, cached_hs], dim=1 - ) - weights = weights.clone().repeat( - 1, 1 + cached_hs.shape[1] // hidden_states.size(1) - ) - weights[:, hidden_states.size(1):] = weight - print(self) - attn_with_weights = CrossAttnStoreProcessor() - out = self.attn_with_weights( - self, - cond_hiddens, - encoder_hidden_states=cond_hs, - weights=weights - ) - return out - - if cached_pos_hiddens is not None: - cached_pos_hs = cached_pos_hiddens.pop(0).to( - hidden_states.device - ) - cond_pos_hs = torch.cat( - [cond_hiddens, cached_pos_hs], dim=1 - ) - pos_weights = weights.clone().repeat( - 1, 1 + cached_pos_hs.shape[1] // d_model - ) - pos_weights[:, d_model:] = pos_weight - attn_with_weights = CrossAttnProcessor() - out_pos = attn_with_weights( - self, - cond_hiddens, - encoder_hidden_states=cond_pos_hs, - weights=pos_weights, - ) - else: - out_pos = self.old_forward(cond_hiddens) - - if cached_neg_hiddens is not None: - cached_neg_hs = cached_neg_hiddens.pop(0).to( - hidden_states.device - ) - uncond_neg_hs = torch.cat( - [uncond_hiddens, cached_neg_hs], dim=1 - ) - neg_weights = weights.clone().repeat( - 1, 1 + cached_neg_hs.shape[1] // d_model - ) - neg_weights[:, d_model:] = neg_weight - attn_with_weights = CrossAttnProcessor() - out_neg = attn_with_weights( - self, - uncond_hiddens, - encoder_hidden_states=uncond_neg_hs, - weights=neg_weights, - ) - else: - out_neg = self.old_forward(uncond_hiddens) - - out = torch.cat([out_pos, out_neg], dim=0) - return out - - module.attn1.old_forward = module.attn1.forward - module.attn1.forward = new_forward.__get__(module.attn1) - - out = self.unet(z_all, t, encoder_hidden_states=prompt_embd) - - # restore original forward pass - for module in self.unet.modules(): - if isinstance(module, BasicTransformerBlock): - module.attn1.forward = module.attn1.old_forward - del module.attn1.old_forward - - return out - - def preprocess_feedback_images(self, images, vae, device, dtype) -> torch.tensor: - images_t = 
[self.image_to_tensor(img,dtype) for img in images] - images_t = torch.stack(images_t).to(device) - latents = ( - vae.config.scaling_factor - * vae.encode(images_t).latent_dist.sample() - ) - return latents + if do_classifier_free_guidance: + # duplicate unconditional embeddings for each generation per prompt, using mps friendly method + seq_len = negative_prompt_embeds.shape[1] + + negative_prompt_embeds = negative_prompt_embeds.to(dtype=self.text_encoder.dtype, device=device) + + negative_prompt_embeds = negative_prompt_embeds.repeat(1, num_images_per_prompt, 1) + negative_prompt_embeds = negative_prompt_embeds.view(batch_size * num_images_per_prompt, seq_len, -1) + + # For classifier free guidance, we need to do two forward passes. + # Here we concatenate the unconditional and text embeddings into a single batch + # to avoid doing two forward passes + prompt_embeds = torch.cat([negative_prompt_embeds, prompt_embeds]) + + return prompt_embeds + + def run_safety_checker(self, image, device, dtype): + if self.safety_checker is not None: + safety_checker_input = self.feature_extractor(self.numpy_to_pil(image), return_tensors="pt").to(device) + image, has_nsfw_concept = self.safety_checker( + images=image, clip_input=safety_checker_input.pixel_values.to(dtype) + ) + else: + has_nsfw_concept = None + return image, has_nsfw_concept def decode_latents(self, latents): - warnings.warn( - "The decode_latents method is deprecated and will be removed in a future version. Please" - " use VaeImageProcessor instead", - FutureWarning, - ) latents = 1 / self.vae.config.scaling_factor * latents - image = self.vae.decode(latents, return_dict=False)[0] + image = self.vae.decode(latents).sample image = (image / 2 + 0.5).clamp(0, 1) - # we always cast to float32 as this does not cause significant overhead and is compatible with bfloat16 + # we always cast to float32 as this does not cause significant overhead and is compatible with bfloat16 image = image.cpu().permute(0, 2, 3, 1).float().numpy() return image - @torch.no_grad() + def prepare_extra_step_kwargs(self, generator, eta): + # prepare extra kwargs for the scheduler step, since not all schedulers have the same signature + # eta (η) is only used with the DDIMScheduler, it will be ignored for other schedulers. + # eta corresponds to η in DDIM paper: https://arxiv.org/abs/2010.02502 + # and should be between [0, 1] + + accepts_eta = "eta" in set(inspect.signature(self.scheduler.step).parameters.keys()) + extra_step_kwargs = {} + if accepts_eta: + extra_step_kwargs["eta"] = eta + + # check if the scheduler accepts generator + accepts_generator = "generator" in set(inspect.signature(self.scheduler.step).parameters.keys()) + if accepts_generator: + extra_step_kwargs["generator"] = generator + return extra_step_kwargs + + def check_inputs( + self, + prompt, + height, + width, + callback_steps, + negative_prompt=None, + prompt_embeds=None, + negative_prompt_embeds=None, + ): + if height % 8 != 0 or width % 8 != 0: + raise ValueError(f"`height` and `width` have to be divisible by 8 but are {height} and {width}.") + + if (callback_steps is None) or ( + callback_steps is not None and (not isinstance(callback_steps, int) or callback_steps <= 0) + ): + raise ValueError( + f"`callback_steps` has to be a positive integer but is {callback_steps} of type" + f" {type(callback_steps)}." + ) + + if prompt is not None and prompt_embeds is not None: + raise ValueError( + f"Cannot forward both `prompt`: {prompt} and `prompt_embeds`: {prompt_embeds}. 
Please make sure to" + " only forward one of the two." + ) + elif prompt is None and prompt_embeds is None: + raise ValueError( + "Provide either `prompt` or `prompt_embeds`. Cannot leave both `prompt` and `prompt_embeds` undefined." + ) + elif prompt is not None and (not isinstance(prompt, str) and not isinstance(prompt, list)): + raise ValueError(f"`prompt` has to be of type `str` or `list` but is {type(prompt)}") + + if negative_prompt is not None and negative_prompt_embeds is not None: + raise ValueError( + f"Cannot forward both `negative_prompt`: {negative_prompt} and `negative_prompt_embeds`:" + f" {negative_prompt_embeds}. Please make sure to only forward one of the two." + ) + + if prompt_embeds is not None and negative_prompt_embeds is not None: + if prompt_embeds.shape != negative_prompt_embeds.shape: + raise ValueError( + "`prompt_embeds` and `negative_prompt_embeds` must have the same shape when passed directly, but" + f" got: `prompt_embeds` {prompt_embeds.shape} != `negative_prompt_embeds`" + f" {negative_prompt_embeds.shape}." + ) + + def prepare_latents(self, batch_size, num_channels_latents, height, width, dtype, device, generator, latents=None): + shape = (batch_size, num_channels_latents, height // self.vae_scale_factor, width // self.vae_scale_factor) + if isinstance(generator, list) and len(generator) != batch_size: + raise ValueError( + f"You have passed a list of generators of length {len(generator)}, but requested an effective batch" + f" size of {batch_size}. Make sure the batch size matches the length of the generators." + ) + if latents is None: + latents = randn_tensor(shape, generator=generator, device=device, dtype=dtype) + else: + latents = latents.to(device) + + # scale the initial noise by the standard deviation required by the scheduler + latents = latents * self.scheduler.init_noise_sigma + return latents + + @torch.no_grad() + @replace_example_docstring(EXAMPLE_DOC_STRING) def __call__( self, - prompt: Optional[Union[str, List[str]]] = "", - negative_prompt: Optional[Union[str, List[str]]] = "lowres, bad anatomy, bad hands, cropped, worst quality", - liked: Optional[Union[List[str], List[Image.Image]]] = [], - disliked: Optional[Union[List[str], List[Image.Image]]] = [], - random_seed: int = 37, - n_images: int = 4, - guidance_scale: float = 7.0, - denoising_steps: int = 20, - feedback_start_ratio: float = 0.33, - feedback_end_ratio: float = 0.66, - min_weight: float = 0.05, - max_weight: float = .8, - neg_scale: float = 0.5, - pos_bottleneck_scale: float = 1.0, - neg_bottleneck_scale: float = 1.0, + prompt: Union[str, List[str]] = None, + height: Optional[int] = None, + width: Optional[int] = None, + num_inference_steps: int = 50, + guidance_scale: float = 7.5, + negative_prompt: Optional[Union[str, List[str]]] = None, + num_images_per_prompt: Optional[int] = 1, + eta: float = 0.0, + generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None, + latents: Optional[torch.FloatTensor] = None, + prompt_embeds: Optional[torch.FloatTensor] = None, + negative_prompt_embeds: Optional[torch.FloatTensor] = None, + output_type: Optional[str] = "pil", + return_dict: bool = True, + callback: Optional[Callable[[int, int, torch.FloatTensor], None]] = None, + callback_steps: int = 1, + cross_attention_kwargs: Optional[Dict[str, Any]] = None, ): r""" Function invoked when calling the pipeline for generation. @@ -418,28 +515,58 @@ def __call__( prompt (`str` or `List[str]`, *optional*): The prompt or prompts to guide the image generation. 
If not defined, one has to pass `prompt_embeds`. instead. - negative_prompt (`str` or `List[str]`, *optional*): - The prompt or prompts not to guide the image generation. If not defined, one has to pass - `negative_prompt_embeds` instead. Ignored when not using guidance (i.e., ignored if `guidance_scale` is - less than `1`). - liked (`List[Image.Image]` or `List[str]`, *optional*): - Liked enables feedback through images, encourages images with liked features. - disliked (`List[Image.Image]` or `List[str]`, *optional*): - Disliked enables feedback through images, discourages images with disliked features. - random_seed (`torch.Generator` or `List[torch.Generator]` or `int`, *optional*): - One or a list of [torch generator(s)](https://pytorch.org/docs/stable/generated/torch.Generator.html), can be int. - to make generation deterministic. - n_images (`int`, *optional*, defaults to 1): - The number of images to generate per prompt. + height (`int`, *optional*, defaults to self.unet.config.sample_size * self.vae_scale_factor): + The height in pixels of the generated image. + width (`int`, *optional*, defaults to self.unet.config.sample_size * self.vae_scale_factor): + The width in pixels of the generated image. + num_inference_steps (`int`, *optional*, defaults to 50): + The number of denoising steps. More denoising steps usually lead to a higher quality image at the + expense of slower inference. guidance_scale (`float`, *optional*, defaults to 7.5): Guidance scale as defined in [Classifier-Free Diffusion Guidance](https://arxiv.org/abs/2207.12598). `guidance_scale` is defined as `w` of equation 2. of [Imagen Paper](https://arxiv.org/pdf/2205.11487.pdf). Guidance scale is enabled by setting `guidance_scale > 1`. Higher guidance scale encourages to generate images that are closely linked to the text `prompt`, usually at the expense of lower image quality. - denoising_steps (`int`, *optional*, defaults to 50): - The number of denoising steps. More denoising steps usually lead to a higher quality image at the - expense of slower inference. + negative_prompt (`str` or `List[str]`, *optional*): + The prompt or prompts not to guide the image generation. If not defined, one has to pass + `negative_prompt_embeds`. instead. If not defined, one has to pass `negative_prompt_embeds`. instead. + Ignored when not using guidance (i.e., ignored if `guidance_scale` is less than `1`). + num_images_per_prompt (`int`, *optional*, defaults to 1): + The number of images to generate per prompt. + eta (`float`, *optional*, defaults to 0.0): + Corresponds to parameter eta (η) in the DDIM paper: https://arxiv.org/abs/2010.02502. Only applies to + [`schedulers.DDIMScheduler`], will be ignored for others. + generator (`torch.Generator` or `List[torch.Generator]`, *optional*): + One or a list of [torch generator(s)](https://pytorch.org/docs/stable/generated/torch.Generator.html) + to make generation deterministic. + latents (`torch.FloatTensor`, *optional*): + Pre-generated noisy latents, sampled from a Gaussian distribution, to be used as inputs for image + generation. Can be used to tweak the same generation with different prompts. If not provided, a latents + tensor will ge generated by sampling using the supplied random `generator`. + prompt_embeds (`torch.FloatTensor`, *optional*): + Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not + provided, text embeddings will be generated from `prompt` input argument. 
+ negative_prompt_embeds (`torch.FloatTensor`, *optional*): + Pre-generated negative text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt + weighting. If not provided, negative_prompt_embeds will be generated from `negative_prompt` input + argument. + output_type (`str`, *optional*, defaults to `"pil"`): + The output format of the generate image. Choose between + [PIL](https://pillow.readthedocs.io/en/stable/): `PIL.Image.Image` or `np.array`. + return_dict (`bool`, *optional*, defaults to `True`): + Whether or not to return a [`~pipelines.stable_diffusion.StableDiffusionPipelineOutput`] instead of a + plain tuple. + callback (`Callable`, *optional*): + A function that will be called every `callback_steps` steps during inference. The function will be + called with the following arguments: `callback(step: int, timestep: int, latents: torch.FloatTensor)`. + callback_steps (`int`, *optional*, defaults to 1): + The frequency at which the `callback` function will be called. If not specified, the callback will be + called at every step. + cross_attention_kwargs (`dict`, *optional*): + A kwargs dictionary that if specified is passed along to the `AttnProcessor` as defined under + `self.processor` in + [diffusers.cross_attention](https://github.com/huggingface/diffusers/blob/main/src/diffusers/models/cross_attention.py). Examples: @@ -450,138 +577,114 @@ def __call__( list of `bool`s denoting whether the corresponding generated image likely represents "not-safe-for-work" (nsfw) content, according to the `safety_checker`. """ - """ - Generate a trajectory of images with binary feedback. - The feedback can be given as a list of liked and disliked images. - """ - if random_seed is not None and random_seed is not torch.Generator: - torch.manual_seed(random_seed) - - device = self._execution_device - dtype = self.text_encoder.dtype - - latent_noise = torch.randn(n_images, 4, 64, 64, device=device, dtype=dtype) + # 0. Default height and width to unet + height = height or self.unet.config.sample_size * self.vae_scale_factor + width = width or self.unet.config.sample_size * self.vae_scale_factor - positive_latents = self.preprocess_feedback_images(liked,self.vae,device, dtype) if liked and len(liked)>0 else torch.tensor([], device=device, dtype=dtype) - - negative_latents = self.preprocess_feedback_images(disliked,self.vae,device, dtype) if disliked and len(disliked)>0 else torch.tensor([], device=device, dtype=dtype) + # 1. Check inputs. Raise error if not correct + self.check_inputs( + prompt, height, width, callback_steps, negative_prompt, prompt_embeds, negative_prompt_embeds + ) - if isinstance(prompt, str): - prompt = [prompt] * n_images + # 2. Define call parameters + if prompt is not None and isinstance(prompt, str): + batch_size = 1 + elif prompt is not None and isinstance(prompt, list): + batch_size = len(prompt) else: - assert len(prompt) == n_images - if isinstance(negative_prompt, str): - negative_prompt = [negative_prompt] * n_images - else: - assert len(negative_prompt) == n_images - + batch_size = prompt_embeds.shape[0] - (cond_prompt_embs, uncond_prompt_embs, null_prompt_emb) = self.initialize_prompts(prompt + negative_prompt + [""], device).split([n_images, n_images, 1]) - - batched_prompt_embd = torch.cat([cond_prompt_embs, uncond_prompt_embs], dim=0) + device = self._execution_device + # here `guidance_scale` is defined analog to the guidance weight `w` of equation (2) + # of the Imagen paper: https://arxiv.org/pdf/2205.11487.pdf . 
`guidance_scale = 1` + # corresponds to doing no classifier free guidance. + do_classifier_free_guidance = guidance_scale > 1.0 + + # 3. Encode input prompt + prompt_embeds = self._encode_prompt( + prompt, + device, + num_images_per_prompt, + do_classifier_free_guidance, + negative_prompt, + prompt_embeds=prompt_embeds, + negative_prompt_embeds=negative_prompt_embeds, + ) - self.scheduler.set_timesteps(denoising_steps, device=device) + # 4. Prepare timesteps + self.scheduler.set_timesteps(num_inference_steps, device=device) timesteps = self.scheduler.timesteps - latent_noise = latent_noise * self.scheduler.init_noise_sigma - - num_warmup_steps = len(timesteps) - denoising_steps * self.scheduler.order + # 5. Prepare latent variables + num_channels_latents = self.unet.in_channels + latents = self.prepare_latents( + batch_size * num_images_per_prompt, + num_channels_latents, + height, + width, + prompt_embeds.dtype, + device, + generator, + latents, + ) - ref_start_idx = round(len(timesteps) * feedback_start_ratio) - ref_end_idx = round(len(timesteps) * feedback_end_ratio) + # 6. Prepare extra step kwargs. TODO: Logic should ideally just be moved out of the pipeline + extra_step_kwargs = self.prepare_extra_step_kwargs(generator, eta) - with tqdm(total=denoising_steps) as pbar: + # 7. Denoising loop + num_warmup_steps = len(timesteps) - num_inference_steps * self.scheduler.order + with self.progress_bar(total=num_inference_steps) as progress_bar: for i, t in enumerate(timesteps): - sigma = self.scheduler.sigma_t[t] if hasattr(self.scheduler, 'sigma_t') else 0 - if hasattr(self.scheduler, "sigmas"): - sigma = self.scheduler.sigmas[i] - - alpha_hat = 1 / (sigma**2 + 1) - - z_single = self.scheduler.scale_model_input(latent_noise, t) - z_all = torch.cat([z_single] * 2, dim=0) - z_ref = torch.cat([positive_latents, negative_latents], dim=0) - - if i >= ref_start_idx and i <= ref_end_idx: - weight_factor = max_weight - else: - weight_factor = min_weight - - pos_ws = (weight_factor, weight_factor * pos_bottleneck_scale) - neg_ws = (weight_factor * neg_scale, weight_factor * neg_scale * neg_bottleneck_scale) - - if z_ref.size(0) > 0 and weight_factor > 0: - noise = torch.randn_like(z_ref) - if isinstance(self.scheduler, EulerAncestralDiscreteScheduler): - z_ref_noised = ( - alpha_hat**0.5 * z_ref + (1 - alpha_hat) ** 0.5 * noise - ).type(dtype) - print("here") - else: - z_ref_noised = self.scheduler.add_noise(z_ref, noise, t) - - ref_prompt_embd = torch.cat([null_prompt_emb] * (len(positive_latents) + len(negative_latents)), dim=0) - cached_hidden_states = self.get_unet_hidden_states( - z_ref_noised, t, ref_prompt_embd - ) - - n_pos, n_neg = positive_latents.shape[0], negative_latents.shape[0] - cached_pos_hs, cached_neg_hs = [], [] - for hs in cached_hidden_states: - cached_pos, cached_neg = hs.split([n_pos, n_neg], dim=0) - cached_pos = cached_pos.view( - 1, -1, *cached_pos.shape[2:] - ).expand(n_images, -1, -1) - cached_neg = cached_neg.view( - 1, -1, *cached_neg.shape[2:] - ).expand(n_images, -1, -1) - cached_pos_hs.append(cached_pos) - cached_neg_hs.append(cached_neg) - - if n_pos == 0: - cached_pos_hs = None - if n_neg == 0: - cached_neg_hs = None - else: - cached_pos_hs, cached_neg_hs = None, None - - unet_out = self.unet_forward_with_cached_hidden_states( - z_all, + # expand the latents if we are doing classifier free guidance + latent_model_input = torch.cat([latents] * 2) if do_classifier_free_guidance else latents + latent_model_input = self.scheduler.scale_model_input(latent_model_input, 
t) + + # predict the noise residual + noise_pred = self.unet( + latent_model_input, t, - prompt_embd=batched_prompt_embd, - cached_pos_hiddens=cached_pos_hs, - cached_neg_hiddens=cached_neg_hs, - pos_weights=pos_ws, - neg_weights=neg_ws, + encoder_hidden_states=prompt_embeds, + cross_attention_kwargs=cross_attention_kwargs, ).sample - noise_cond, noise_uncond = unet_out.chunk(2) - guidance = noise_cond - noise_uncond - noise_pred = noise_uncond + guidance_scale * guidance - latent_noise = self.scheduler.step(noise_pred, t, latent_noise).prev_sample + # perform guidance + if do_classifier_free_guidance: + noise_pred_uncond, noise_pred_text = noise_pred.chunk(2) + noise_pred = noise_pred_uncond + guidance_scale * (noise_pred_text - noise_pred_uncond) - if i == len(timesteps) - 1 or ( - (i + 1) > num_warmup_steps and (i + 1) % self.scheduler.order == 0 - ): - pbar.update() + # compute the previous noisy sample x_t -> x_t-1 + latents = self.scheduler.step(noise_pred, t, latents, **extra_step_kwargs).prev_sample - y = self.decode_latents(latent_noise) - imgs = self.image_processor.numpy_to_pil(y) + # call the callback, if provided + if i == len(timesteps) - 1 or ((i + 1) > num_warmup_steps and (i + 1) % self.scheduler.order == 0): + progress_bar.update() + if callback is not None and i % callback_steps == 0: + callback(i, t, latents) - return FabricPipelineOutput(imgs,False) + if output_type == "latent": + image = latents + has_nsfw_concept = None + elif output_type == "pil": + # 8. Post-processing + image = self.decode_latents(latents) - @staticmethod - def image_to_tensor(image: Union[str, Image.Image], dtype): - """ - Convert latent PIL image to a torch tensor for further processing. - """ - if isinstance(image, str): - image = Image.open(image) - if not image.mode == "RGB": - image = image.convert("RGB") - image = image.resize((512, 512)) - image = np.array(image).astype(np.uint8) - image = (image / 127.5 - 1.0).astype(np.float32) - image = torch.from_numpy(image).permute(2, 0, 1) - return image.type(dtype) + # 9. Run safety checker + image, has_nsfw_concept = self.run_safety_checker(image, device, prompt_embeds.dtype) + + # 10. Convert to PIL + image = self.numpy_to_pil(image) + else: + # 8. Post-processing + image = self.decode_latents(latents) + + # 9. 
Run safety checker + image, has_nsfw_concept = self.run_safety_checker(image, device, prompt_embeds.dtype) + + # Offload last model to CPU + if hasattr(self, "final_offload_hook") and self.final_offload_hook is not None: + self.final_offload_hook.offload() + + if not return_dict: + return (image, has_nsfw_concept) + return StableDiffusionPipelineOutput(images=image, nsfw_content_detected=has_nsfw_concept) From 60e37a00933aad57ca150aa79e843a1d0c2d070e Mon Sep 17 00:00:00 2001 From: shauray8 Date: Thu, 17 Aug 2023 12:52:37 +0530 Subject: [PATCH 67/98] tests --- docs/source/en/api/pipelines/fabric.mdx | 29 +++++ .../api/pipelines/stable_diffusion/fabric.mdx | 50 -------- .../pipelines/fabric/pipeline_fabric.py | 113 +++++------------- src/test.py | 13 ++ tests/pipelines/fabric/test_fabric.py | 12 +- 5 files changed, 80 insertions(+), 137 deletions(-) create mode 100644 docs/source/en/api/pipelines/fabric.mdx delete mode 100644 docs/source/en/api/pipelines/stable_diffusion/fabric.mdx create mode 100644 src/test.py diff --git a/docs/source/en/api/pipelines/fabric.mdx b/docs/source/en/api/pipelines/fabric.mdx new file mode 100644 index 000000000000..5cbe737da994 --- /dev/null +++ b/docs/source/en/api/pipelines/fabric.mdx @@ -0,0 +1,29 @@ +## changes required + + +# Text-to-Image Generation + +## FabricPipeline + +FABRIC is training-free approach that conditions the diffusion process on a set of feedback images, applicable to a wide range of popular diffusion models, created by the researchers and engineers from [ETH Zürich, Switzerland](https://github.com/sd-fabric). The [`FabricPipeline`] is capable of generating photo-realistic images given any text input using Stable Diffusion and finetune them on the basis of feedback. + +The original codebase can be found here: +- *Stable Diffusion V1*: [sd-fabric/fabric](https://github.com/sd-fabric/fabric) + +Available Checkpoints are: +- *dreamlike-photoreal-2.0 (512x512 resolution)* [dreamlike-art/dreamlike-photoreal-2.0](https://huggingface.co/dreamlike-art/dreamlike-photoreal-2.0) + +[[autodoc]] FabricPipeline + - all + - __call__ + diff --git a/docs/source/en/api/pipelines/stable_diffusion/fabric.mdx b/docs/source/en/api/pipelines/stable_diffusion/fabric.mdx deleted file mode 100644 index dc4996614ae0..000000000000 --- a/docs/source/en/api/pipelines/stable_diffusion/fabric.mdx +++ /dev/null @@ -1,50 +0,0 @@ -## changes required - - -# Text-to-Image Generation - -## StableDiffusionPipeline - -The Stable Diffusion model was created by the researchers and engineers from [CompVis](https://github.com/CompVis), [Stability AI](https://stability.ai/), [runway](https://github.com/runwayml), and [LAION](https://laion.ai/). The [`StableDiffusionPipeline`] is capable of generating photo-realistic images given any text input using Stable Diffusion. 
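Editor's note: the new `fabric.mdx` above describes FABRIC at a high level; below is a minimal usage sketch in line with the `src/test.py` script added later in this patch, assuming the `dreamlike-art/dreamlike-photoreal-2.0` checkpoint is available. `liked` and `disliked` accept PIL images or file paths from earlier generations and steer subsequent samples toward or away from those results.

```py
import torch
from diffusers import FabricPipeline

model_id = "dreamlike-art/dreamlike-photoreal-2.0"
pipe = FabricPipeline.from_pretrained(model_id, torch_dtype=torch.float16).to("cuda")

prompt = "a giant standing in a fantasy landscape, best quality"
liked, disliked = [], []  # fill with feedback images from previous rounds
images = pipe(prompt, n_images=4, liked=liked, disliked=disliked).images
images[0].save("fabric_result.jpg")
```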
- -The original codebase can be found here: -- *Stable Diffusion V1*: [CompVis/stable-diffusion](https://github.com/CompVis/stable-diffusion) -- *Stable Diffusion v2*: [Stability-AI/stablediffusion](https://github.com/Stability-AI/stablediffusion) - -Available Checkpoints are: -- *stable-diffusion-v1-4 (512x512 resolution)* [CompVis/stable-diffusion-v1-4](https://huggingface.co/CompVis/stable-diffusion-v1-4) -- *stable-diffusion-v1-5 (512x512 resolution)* [runwayml/stable-diffusion-v1-5](https://huggingface.co/runwayml/stable-diffusion-v1-5) -- *stable-diffusion-2-base (512x512 resolution)*: [stabilityai/stable-diffusion-2-base](https://huggingface.co/stabilityai/stable-diffusion-2-base) -- *stable-diffusion-2 (768x768 resolution)*: [stabilityai/stable-diffusion-2](https://huggingface.co/stabilityai/stable-diffusion-2) -- *stable-diffusion-2-1-base (512x512 resolution)* [stabilityai/stable-diffusion-2-1-base](https://huggingface.co/stabilityai/stable-diffusion-2-1-base) -- *stable-diffusion-2-1 (768x768 resolution)*: [stabilityai/stable-diffusion-2-1](https://huggingface.co/stabilityai/stable-diffusion-2-1) - -[[autodoc]] StableDiffusionPipeline - - all - - __call__ - - enable_attention_slicing - - disable_attention_slicing - - enable_vae_slicing - - disable_vae_slicing - - enable_xformers_memory_efficient_attention - - disable_xformers_memory_efficient_attention - - enable_vae_tiling - - disable_vae_tiling - - load_textual_inversion - - from_single_file - - load_lora_weights - - save_lora_weights - -[[autodoc]] FlaxStableDiffusionPipeline - - all - - __call__ diff --git a/src/diffusers/pipelines/fabric/pipeline_fabric.py b/src/diffusers/pipelines/fabric/pipeline_fabric.py index afc11c70d194..3616ef51e964 100644 --- a/src/diffusers/pipelines/fabric/pipeline_fabric.py +++ b/src/diffusers/pipelines/fabric/pipeline_fabric.py @@ -1,4 +1,4 @@ -### I'm fucking wrong you dont have to initialize and load stable diffusion ditch that +### I'm fucking wrong you dont have to initialize and load stable diffusion ditch that ### do it with raw unet, vae and stuff ' # Copyright 2023 The HuggingFace Team. All rights reserved. # @@ -15,6 +15,7 @@ # limitations under the License. from dataclasses import dataclass from typing import Any, Dict, List, Optional, Tuple, Union +from packaging import version import torch from torch import nn @@ -40,7 +41,7 @@ ) from ...configuration_utils import ConfigMixin, register_to_config -from ...models.cross_attention import LoRACrossAttnProcessor +from ...models.attention_processor import LoRAAttnProcessor from ...models.attention import BasicTransformerBlock from ...schedulers import EulerAncestralDiscreteScheduler from . 
import FabricPipelineOutput @@ -63,6 +64,7 @@ def __call__( weights=None, # shape: (batch_size, sequence_length) lora_scale=1.0, ): + print("in") batch_size, sequence_length, _ = ( hidden_states.shape if encoder_hidden_states is None @@ -72,7 +74,7 @@ def __call__( attention_mask, sequence_length, batch_size ) - if isinstance(attn.processor, LoRACrossAttnProcessor): + if isinstance(attn.processor, LoRAAttnProcessor): query = attn.to_q(hidden_states) + lora_scale * attn.processor.to_q_lora( hidden_states ) @@ -84,7 +86,7 @@ def __call__( elif attn.norm_cross: encoder_hidden_states = attn.norm_encoder_hidden_states(encoder_hidden_states) - if isinstance(attn.processor, LoRACrossAttnProcessor): + if isinstance(attn.processor, LoRAAttnProcessor): key = attn.to_k(encoder_hidden_states) + lora_scale * attn.processor.to_k_lora( encoder_hidden_states ) @@ -111,7 +113,7 @@ def __call__( hidden_states = attn.batch_to_head_dim(hidden_states) # linear proj - if isinstance(attn.processor, LoRACrossAttnProcessor): + if isinstance(attn.processor, LoRAAttnProcessor): hidden_states = attn.to_out[0]( hidden_states ) + lora_scale * attn.processor.to_out_lora(hidden_states) @@ -124,7 +126,7 @@ def __call__( class FabricPipeline(DiffusionPipeline): r""" - Pipeline for text-to-image generation using Stable Diffusion and conditioning the results + Pipeline for text-to-image generation using Stable Diffusion and conditioning the results using feedback images. This model inherits from [`DiffusionPipeline`]. Check the superclass documentation for the generic methods the @@ -155,27 +157,9 @@ def __init__( tokenizer: CLIPTokenizer, unet: UNet2DConditionModel, scheduler: EulerAncestralDiscreteScheduler, - safety_checker: StableDiffusionSafetyChecker, requires_safety_checker:bool = True, ): super().__init__() - if safety_checker is None and requires_safety_checker: - logger.warning( - f"You have disabled the safety checker for {self.__class__} by passing `safety_checker=None`. Ensure" - " that you abide to the conditions of the Stable Diffusion license and do not expose unfiltered" - " results in services or applications open to the public. Both the diffusers team and Hugging Face" - " strongly recommend to keep the safety filter enabled in all public facing circumstances, disabling" - " it only for use-cases that involve analyzing network behavior or auditing its results. For more" - " information, please have a look at https://github.com/huggingface/diffusers/pull/254 ." - ) - - if safety_checker is not None and feature_extractor is None: - raise ValueError( - "Make sure to define a feature extractor when loading {self.__class__} if you want - to use the safety" - " checker. If you do not want to use the safety checker, you can pass `'safety_che - cker=None'` instead." - ) is_unet_version_less_0_9_0 = hasattr(unet.config, "_diffusers_version") and version.parse( version.parse(unet.config._diffusers_version).base_version @@ -183,22 +167,14 @@ def __init__( is_unet_sample_size_less_64 = hasattr(unet.config, "sample_size") and unet.config.sample_size < 64 if is_unet_version_less_0_9_0 and is_unet_sample_size_less_64: deprecation_message = ( - "The configuration file of the unet has set the default `sample_size` to smaller t - han" - " 64 which seems highly unlikely. 
If your checkpoint is a fine-tuned version of an - y of the" - " following: \n- CompVis/stable-diffusion-v1-4 \n- CompVis/stable-diffusion-v1-3 \ - n-" - " CompVis/stable-diffusion-v1-2 \n- CompVis/stable-diffusion-v1-1 \n- runwayml/sta - ble-diffusion-v1-5" - " \n- runwayml/stable-diffusion-inpainting \n you should change 'sample_size' to 6 - 4 in the" - " configuration file. Please make sure to update the config accordingly as leaving - `sample_size=32`" - " in the config might lead to incorrect results in future versions. If you have do - wnloaded this" - " checkpoint from the Hugging Face Hub, it would be very nice if you could open a - Pull request for" + "The configuration file of the unet has set the default `sample_size` to smaller than" + " 64 which seems highly unlikely. If your checkpoint is a fine-tuned version of any of the" + " following: \n- CompVis/stable-diffusion-v1-4 \n- CompVis/stable-diffusion-v1-3 \n-" + " CompVis/stable-diffusion-v1-2 \n- CompVis/stable-diffusion-v1-1 \n- runwayml/stable-diffusion-v1-5" + " \n- runwayml/stable-diffusion-inpainting \n you should change 'sample_size' to 64 in the" + " configuration file. Please make sure to update the config accordingly as leaving `sample_size=32`" + " in the config might lead to incorrect results in future versions. If you have downloaded this" + " checkpoint from the Hugging Face Hub, it would be very nice if you could open a Pull request for" " the `unet/config.json` file" ) @@ -219,7 +195,7 @@ def __init__( def initialize_prompts(self, prompts: List[str], device): - # Breaking into individual prompts feels memory efficient + # Breaking into individual prompts feels memory efficient prompt_embed_list = [] for prompt in prompts: prompt_tokens = self.tokenizer( @@ -240,7 +216,7 @@ def initialize_prompts(self, prompts: List[str], device): input_ids=prompt_tokens.input_ids.to(device), attention_mask=attention_mask, ).last_hidden_state - + prompt_embed_list.append(prompt_embd) return torch.cat(prompt_embed_list, dim=0) @@ -285,9 +261,6 @@ def unet_forward_with_cached_hidden_states( *pos_weights, steps=len(self.unet.down_blocks) + 1)[:-1].tolist() local_neg_weights = torch.linspace( *neg_weights, steps=len(self.unet.down_blocks) + 1)[:-1].tolist() - - - for block, pos_weight, neg_weight in zip( self.unet.down_blocks + [self.unet.mid_block] + self.unet.up_blocks, local_pos_weights + [pos_weights[1]] + local_pos_weights[::-1], @@ -314,28 +287,6 @@ def new_forward( out_pos = self.old_forward(hidden_states) out_neg = self.old_forward(hidden_states) - def new_forward_caching(self, hidden_states, cond_hiddens, cached_hiddens, weight, weights): - - cached_hs = cached_hiddens.pop(0).to( - hidden_states.device - ) - cond_hs = torch.cat( - [cond_hiddens, cached_hs], dim=1 - ) - weights = weights.clone().repeat( - 1, 1 + cached_hs.shape[1] // hidden_states.size(1) - ) - weights[:, hidden_states.size(1):] = weight - print(self) - attn_with_weights = CrossAttnStoreProcessor() - out = self.attn_with_weights( - self, - cond_hiddens, - encoder_hidden_states=cond_hs, - weights=weights - ) - return out - if cached_pos_hiddens is not None: cached_pos_hs = cached_pos_hiddens.pop(0).to( hidden_states.device @@ -405,7 +356,7 @@ def preprocess_feedback_images(self, images, vae, device, dtype) -> torch.tensor def decode_latents(self, latents): warnings.warn( - "The decode_latents method is deprecated and will be removed in a future version. Plea se" + "The decode_latents method is deprecated and will be removed in a future version. 
Please" " use VaeImageProcessor instead", FutureWarning, ) @@ -437,7 +388,7 @@ def __call__( neg_bottleneck_scale: float = 1.0, ): r""" - Function invoked when calling the pipeline for generation. + Function invoked when calling the pipeline for generation. Generate a trajectory of images with binary feedback. The feedback can be given as a list of liked and disliked images. Args: prompt (`str` or `List[str]`, *optional*): @@ -467,6 +418,15 @@ def __call__( expense of slower inference. Examples: + >>> from diffusers import FabricPipeline + >>> import torch + >>> model_id = "dreamlike-art/dreamlike-photoreal-2.0" + >>> pipe = FabricPipeline(model_id, torch_dtype = torch.float16) + >>> pipe = pipe.to("cuda") + >>> prompt = "a giant standing in a fantasy landscape best quality" + >>> liked = [] + >>> disliked = [] + >>> image = pipe(prompt, n_images=4, liked=liked,disliked=disliked).images Returns: [`~pipelines.stable_diffusion.StableDiffusionPipelineOutput`] or `tuple`: @@ -474,21 +434,17 @@ def __call__( When returning a tuple, the first element is a list with the generated images, and the second element is a list of `bool`s denoting whether the corresponding generated image likely represents "not-safe-for-work" (nsfw) content, according to the `safety_checker`. - """ - """ - Generate a trajectory of images with binary feedback. - The feedback can be given as a list of liked and disliked images. + """ if random_seed is not None and random_seed is not torch.Generator: torch.manual_seed(random_seed) - + device = self._execution_device dtype = self.text_encoder.dtype latent_noise = torch.randn(n_images, 4, 64, 64, device=device, dtype=dtype) - - positive_latents = self.preprocess_feedback_images(liked,self.vae,device, dtype) if liked and len(liked)>0 else torch.tensor([], device=device, dtype=dtype) + positive_latents = self.preprocess_feedback_images(liked,self.vae,device, dtype) if liked and len(liked)>0 else torch.tensor([], device=device, dtype=dtype) negative_latents = self.preprocess_feedback_images(disliked,self.vae,device, dtype) if disliked and len(disliked)>0 else torch.tensor([], device=device, dtype=dtype) if isinstance(prompt, str): @@ -500,7 +456,7 @@ def __call__( else: assert len(negative_prompt) == n_images - + (cond_prompt_embs, uncond_prompt_embs, null_prompt_emb) = self.initialize_prompts(prompt + negative_prompt + [""], device).split([n_images, n_images, 1]) batched_prompt_embd = torch.cat([cond_prompt_embs, uncond_prompt_embs], dim=0) @@ -610,6 +566,3 @@ def image_to_tensor(image: Union[str, Image.Image], dtype): image = torch.from_numpy(image).permute(2, 0, 1) return image.type(dtype) - - - diff --git a/src/test.py b/src/test.py new file mode 100644 index 000000000000..fd7ba1a1e97d --- /dev/null +++ b/src/test.py @@ -0,0 +1,13 @@ +from diffusers import FabricPipeline +import torch + +model_id = "dreamlike-art/dreamlike-photoreal-2.0" +pipe = FabricPipeline.from_pretrained(model_id, torch_dtype=torch.float16) +pipe = pipe.to("cuda") + +prompt = "photo, a church in the middle of a field of crops, bright cinematic lighting, gopro, fisheye lens" +image = pipe(prompt, n_images=1).images[0] + +image.save("result.jpg") + + diff --git a/tests/pipelines/fabric/test_fabric.py b/tests/pipelines/fabric/test_fabric.py index 0783fa5b890d..23eddb475068 100644 --- a/tests/pipelines/fabric/test_fabric.py +++ b/tests/pipelines/fabric/test_fabric.py @@ -19,7 +19,7 @@ import numpy as np import torch -from diffusers import AutoencoderKL, DDIMScheduler, DiTPipeline, 
DPMSolverMultistepScheduler, Transformer2DModel +from diffusers import AutoencoderKL, DDIMScheduler, FabricPipeline, DPMSolverMultistepScheduler, Transformer2DModel from diffusers.utils import is_xformers_available, load_numpy, slow, torch_device from diffusers.utils.testing_utils import enable_full_determinism, require_torch_gpu @@ -85,8 +85,6 @@ def get_dummy_components(self): "vae": vae, "text_encoder": text_encoder, "tokenizer": tokenizer, - "safety_checker": None, - "feature_extractor": None, } return components @@ -174,7 +172,7 @@ def tearDown(self): def test_fabric(self): generator = torch.manual_seed(0) - pipe = DiTPipeline.from_pretrained("dreamlike-art/dreamlike-photoreal-2.0") + pipe = FabricPipeline.from_pretrained("dreamlike-art/dreamlike-photoreal-2.0") pipe.to("cuda") prompt = "a photograph of an astronaut riding a horse" @@ -185,12 +183,12 @@ def test_fabric(self): expected_image = load_numpy( f"https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main/dit/fabric_wo_feedback.npy" ) - assert np.abs((expected_image - image).max()) < 1e-2 + assert np.abs((expected_image - np.array(image)).max()) < 1e-2 def test_fabric_feedback(self): generator = torch.manual_seed(0) - pipe = DiTPipeline.from_pretrained("dreamlike-art/dreamlike-photoreal-2.0") + pipe = FabricPipeline.from_pretrained("dreamlike-art/dreamlike-photoreal-2.0") pipe.to("cuda") prompt = "a photograph of an astronaut riding a horse" @@ -203,4 +201,4 @@ def test_fabric_feedback(self): expected_image = load_numpy( f"https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main/dit/fabric_w_feedback.npy" ) - assert np.abs((expected_image - image).max()) < 1e-2 + assert np.abs((expected_image - np.array(image)).max()) < 1e-2 From ea2ada89313068ed8c529375c7980fec38f5aae4 Mon Sep 17 00:00:00 2001 From: shauray8 Date: Thu, 17 Aug 2023 13:02:48 +0530 Subject: [PATCH 68/98] reverting back to orig --- .../pipeline_stable_diffusion.py | 284 ++++++++++-------- 1 file changed, 159 insertions(+), 125 deletions(-) diff --git a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion.py b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion.py index ce69b83dad68..9bc2ad57fdcc 100644 --- a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion.py +++ b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion.py @@ -13,13 +13,16 @@ # limitations under the License. import inspect +import warnings from typing import Any, Callable, Dict, List, Optional, Union import torch from packaging import version -from transformers import CLIPFeatureExtractor, CLIPTextModel, CLIPTokenizer +from transformers import CLIPImageProcessor, CLIPTextModel, CLIPTokenizer from ...configuration_utils import FrozenDict +from ...image_processor import VaeImageProcessor +from ...loaders import FromSingleFileMixin, LoraLoaderMixin, TextualInversionLoaderMixin from ...models import AutoencoderKL, UNet2DConditionModel from ...schedulers import KarrasDiffusionSchedulers from ...utils import ( @@ -52,32 +55,51 @@ """ -class StableDiffusionPipeline(DiffusionPipeline): +def rescale_noise_cfg(noise_cfg, noise_pred_text, guidance_rescale=0.0): + """ + Rescale `noise_cfg` according to `guidance_rescale`. Based on findings of [Common Diffusion Noise Schedules and + Sample Steps are Flawed](https://arxiv.org/pdf/2305.08891.pdf). 
See Section 3.4 + """ + std_text = noise_pred_text.std(dim=list(range(1, noise_pred_text.ndim)), keepdim=True) + std_cfg = noise_cfg.std(dim=list(range(1, noise_cfg.ndim)), keepdim=True) + # rescale the results from guidance (fixes overexposure) + noise_pred_rescaled = noise_cfg * (std_text / std_cfg) + # mix with the original results from guidance by factor guidance_rescale to avoid "plain looking" images + noise_cfg = guidance_rescale * noise_pred_rescaled + (1 - guidance_rescale) * noise_cfg + return noise_cfg + + +class StableDiffusionPipeline(DiffusionPipeline, TextualInversionLoaderMixin, LoraLoaderMixin, FromSingleFileMixin): r""" Pipeline for text-to-image generation using Stable Diffusion. - This model inherits from [`DiffusionPipeline`]. Check the superclass documentation for the generic methods the - library implements for all the pipelines (such as downloading or saving, running on a particular device, etc.) + This model inherits from [`DiffusionPipeline`]. Check the superclass documentation for the generic methods + implemented for all pipelines (downloading, saving, running on a particular device, etc.). + + The pipeline also inherits the following loading methods: + - [`~loaders.TextualInversionLoaderMixin.load_textual_inversion`] for loading textual inversion embeddings + - [`~loaders.LoraLoaderMixin.load_lora_weights`] for loading LoRA weights + - [`~loaders.LoraLoaderMixin.save_lora_weights`] for saving LoRA weights + - [`~loaders.FromSingleFileMixin.from_single_file`] for loading `.ckpt` files Args: vae ([`AutoencoderKL`]): - Variational Auto-Encoder (VAE) Model to encode and decode images to and from latent representations. - text_encoder ([`CLIPTextModel`]): - Frozen text-encoder. Stable Diffusion uses the text portion of - [CLIP](https://huggingface.co/docs/transformers/model_doc/clip#transformers.CLIPTextModel), specifically - the [clip-vit-large-patch14](https://huggingface.co/openai/clip-vit-large-patch14) variant. - tokenizer (`CLIPTokenizer`): - Tokenizer of class - [CLIPTokenizer](https://huggingface.co/docs/transformers/v4.21.0/en/model_doc/clip#transformers.CLIPTokenizer). - unet ([`UNet2DConditionModel`]): Conditional U-Net architecture to denoise the encoded image latents. + Variational Auto-Encoder (VAE) model to encode and decode images to and from latent representations. + text_encoder ([`~transformers.CLIPTextModel`]): + Frozen text-encoder ([clip-vit-large-patch14](https://huggingface.co/openai/clip-vit-large-patch14)). + tokenizer ([`~transformers.CLIPTokenizer`]): + A `CLIPTokenizer` to tokenize text. + unet ([`UNet2DConditionModel`]): + A `UNet2DConditionModel` to denoise the encoded image latents. scheduler ([`SchedulerMixin`]): A scheduler to be used in combination with `unet` to denoise the encoded image latents. Can be one of [`DDIMScheduler`], [`LMSDiscreteScheduler`], or [`PNDMScheduler`]. safety_checker ([`StableDiffusionSafetyChecker`]): Classification module that estimates whether generated images could be considered offensive or harmful. - Please, refer to the [model card](https://huggingface.co/runwayml/stable-diffusion-v1-5) for details. - feature_extractor ([`CLIPFeatureExtractor`]): - Model that extracts features from generated images to be used as inputs for the `safety_checker`. + Please refer to the [model card](https://huggingface.co/runwayml/stable-diffusion-v1-5) for more details + about a model's potential harms. 
+ feature_extractor ([`~transformers.CLIPImageProcessor`]): + A `CLIPImageProcessor` to extract features from generated images; used as inputs to the `safety_checker`. """ _optional_components = ["safety_checker", "feature_extractor"] @@ -89,7 +111,7 @@ def __init__( unet: UNet2DConditionModel, scheduler: KarrasDiffusionSchedulers, safety_checker: StableDiffusionSafetyChecker, - feature_extractor: CLIPFeatureExtractor, + feature_extractor: CLIPImageProcessor, requires_safety_checker: bool = True, ): super().__init__() @@ -168,59 +190,56 @@ def __init__( feature_extractor=feature_extractor, ) self.vae_scale_factor = 2 ** (len(self.vae.config.block_out_channels) - 1) + self.image_processor = VaeImageProcessor(vae_scale_factor=self.vae_scale_factor) self.register_to_config(requires_safety_checker=requires_safety_checker) def enable_vae_slicing(self): r""" - Enable sliced VAE decoding. - - When this option is enabled, the VAE will split the input tensor in slices to compute decoding in several - steps. This is useful to save some memory and allow larger batch sizes. + Enable sliced VAE decoding. When this option is enabled, the VAE will split the input tensor in slices to + compute decoding in several steps. This is useful to save some memory and allow larger batch sizes. """ self.vae.enable_slicing() def disable_vae_slicing(self): r""" - Disable sliced VAE decoding. If `enable_vae_slicing` was previously invoked, this method will go back to + Disable sliced VAE decoding. If `enable_vae_slicing` was previously enabled, this method will go back to computing decoding in one step. """ self.vae.disable_slicing() - def enable_sequential_cpu_offload(self, gpu_id=0): + def enable_vae_tiling(self): r""" - Offloads all models to CPU using accelerate, significantly reducing memory usage. When called, unet, - text_encoder, vae and safety checker have their state dicts saved to CPU and then are moved to a - `torch.device('meta') and loaded to GPU only when their specific submodule has its `forward` method called. - Note that offloading happens on a submodule basis. Memory savings are higher than with - `enable_model_cpu_offload`, but performance is lower. + Enable tiled VAE decoding. When this option is enabled, the VAE will split the input tensor into tiles to + compute decoding and encoding in several steps. This is useful for saving a large amount of memory and to allow + processing larger images. """ - if is_accelerate_available() and is_accelerate_version(">=", "0.14.0"): - from accelerate import cpu_offload - else: - raise ImportError("`enable_sequential_cpu_offload` requires `accelerate v0.14.0` or higher") - - device = torch.device(f"cuda:{gpu_id}") + self.vae.enable_tiling() - for cpu_offloaded_model in [self.unet, self.text_encoder, self.vae]: - cpu_offload(cpu_offloaded_model, device) - - if self.safety_checker is not None: - cpu_offload(self.safety_checker, execution_device=device, offload_buffers=True) + def disable_vae_tiling(self): + r""" + Disable tiled VAE decoding. If `enable_vae_tiling` was previously enabled, this method will go back to + computing decoding in one step. + """ + self.vae.disable_tiling() def enable_model_cpu_offload(self, gpu_id=0): r""" - Offloads all models to CPU using accelerate, reducing memory usage with a low impact on performance. Compared - to `enable_sequential_cpu_offload`, this method moves one whole model at a time to the GPU when its `forward` - method is called, and the model remains in GPU until the next model runs. 
Memory savings are lower than with - `enable_sequential_cpu_offload`, but performance is much better due to the iterative execution of the `unet`. + Offload all models to CPU to reduce memory usage with a low impact on performance. Moves one whole model at a + time to the GPU when its `forward` method is called, and the model remains in GPU until the next model runs. + Memory savings are lower than using `enable_sequential_cpu_offload`, but performance is much better due to the + iterative execution of the `unet`. """ if is_accelerate_available() and is_accelerate_version(">=", "0.17.0.dev0"): from accelerate import cpu_offload_with_hook else: - raise ImportError("`enable_model_offload` requires `accelerate v0.17.0` or higher.") + raise ImportError("`enable_model_cpu_offload` requires `accelerate v0.17.0` or higher.") device = torch.device(f"cuda:{gpu_id}") + if self.device.type != "cpu": + self.to("cpu", silence_dtype_warnings=True) + torch.cuda.empty_cache() # otherwise we don't see the memory savings (but they probably exist) + hook = None for cpu_offloaded_model in [self.text_encoder, self.unet, self.vae]: _, hook = cpu_offload_with_hook(cpu_offloaded_model, device, prev_module_hook=hook) @@ -231,24 +250,6 @@ def enable_model_cpu_offload(self, gpu_id=0): # We'll offload the last model manually. self.final_offload_hook = hook - @property - def _execution_device(self): - r""" - Returns the device on which the pipeline's models will be executed. After calling - `pipeline.enable_sequential_cpu_offload()` the execution device can only be inferred from Accelerate's module - hooks. - """ - if not hasattr(self.unet, "_hf_hook"): - return self.device - for module in self.unet.modules(): - if ( - hasattr(module, "_hf_hook") - and hasattr(module._hf_hook, "execution_device") - and module._hf_hook.execution_device is not None - ): - return torch.device(module._hf_hook.execution_device) - return self.device - def _encode_prompt( self, prompt, @@ -258,6 +259,7 @@ def _encode_prompt( negative_prompt=None, prompt_embeds: Optional[torch.FloatTensor] = None, negative_prompt_embeds: Optional[torch.FloatTensor] = None, + lora_scale: Optional[float] = None, ): r""" Encodes the prompt into text encoder hidden states. @@ -273,8 +275,8 @@ def _encode_prompt( whether to use classifier free guidance or not negative_prompt (`str` or `List[str]`, *optional*): The prompt or prompts not to guide the image generation. If not defined, one has to pass - `negative_prompt_embeds`. instead. If not defined, one has to pass `negative_prompt_embeds`. instead. - Ignored when not using guidance (i.e., ignored if `guidance_scale` is less than `1`). + `negative_prompt_embeds` instead. Ignored when not using guidance (i.e., ignored if `guidance_scale` is + less than `1`). prompt_embeds (`torch.FloatTensor`, *optional*): Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not provided, text embeddings will be generated from `prompt` input argument. @@ -282,7 +284,14 @@ def _encode_prompt( Pre-generated negative text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not provided, negative_prompt_embeds will be generated from `negative_prompt` input argument. + lora_scale (`float`, *optional*): + A lora scale that will be applied to all LoRA layers of the text encoder if LoRA layers are loaded. 
""" + # set lora scale so that monkey patched LoRA + # function of text encoder can correctly access it + if lora_scale is not None and isinstance(self, LoraLoaderMixin): + self._lora_scale = lora_scale + if prompt is not None and isinstance(prompt, str): batch_size = 1 elif prompt is not None and isinstance(prompt, list): @@ -291,6 +300,10 @@ def _encode_prompt( batch_size = prompt_embeds.shape[0] if prompt_embeds is None: + # textual inversion: procecss multi-vector tokens if necessary + if isinstance(self, TextualInversionLoaderMixin): + prompt = self.maybe_convert_prompt(prompt, self.tokenizer) + text_inputs = self.tokenizer( prompt, padding="max_length", @@ -323,7 +336,14 @@ def _encode_prompt( ) prompt_embeds = prompt_embeds[0] - prompt_embeds = prompt_embeds.to(dtype=self.text_encoder.dtype, device=device) + if self.text_encoder is not None: + prompt_embeds_dtype = self.text_encoder.dtype + elif self.unet is not None: + prompt_embeds_dtype = self.unet.dtype + else: + prompt_embeds_dtype = prompt_embeds.dtype + + prompt_embeds = prompt_embeds.to(dtype=prompt_embeds_dtype, device=device) bs_embed, seq_len, _ = prompt_embeds.shape # duplicate text embeddings for each generation per prompt, using mps friendly method @@ -335,7 +355,7 @@ def _encode_prompt( uncond_tokens: List[str] if negative_prompt is None: uncond_tokens = [""] * batch_size - elif type(prompt) is not type(negative_prompt): + elif prompt is not None and type(prompt) is not type(negative_prompt): raise TypeError( f"`negative_prompt` should be the same type to `prompt`, but got {type(negative_prompt)} !=" f" {type(prompt)}." @@ -351,6 +371,10 @@ def _encode_prompt( else: uncond_tokens = negative_prompt + # textual inversion: procecss multi-vector tokens if necessary + if isinstance(self, TextualInversionLoaderMixin): + uncond_tokens = self.maybe_convert_prompt(uncond_tokens, self.tokenizer) + max_length = prompt_embeds.shape[1] uncond_input = self.tokenizer( uncond_tokens, @@ -375,7 +399,7 @@ def _encode_prompt( # duplicate unconditional embeddings for each generation per prompt, using mps friendly method seq_len = negative_prompt_embeds.shape[1] - negative_prompt_embeds = negative_prompt_embeds.to(dtype=self.text_encoder.dtype, device=device) + negative_prompt_embeds = negative_prompt_embeds.to(dtype=prompt_embeds_dtype, device=device) negative_prompt_embeds = negative_prompt_embeds.repeat(1, num_images_per_prompt, 1) negative_prompt_embeds = negative_prompt_embeds.view(batch_size * num_images_per_prompt, seq_len, -1) @@ -388,18 +412,27 @@ def _encode_prompt( return prompt_embeds def run_safety_checker(self, image, device, dtype): - if self.safety_checker is not None: - safety_checker_input = self.feature_extractor(self.numpy_to_pil(image), return_tensors="pt").to(device) + if self.safety_checker is None: + has_nsfw_concept = None + else: + if torch.is_tensor(image): + feature_extractor_input = self.image_processor.postprocess(image, output_type="pil") + else: + feature_extractor_input = self.image_processor.numpy_to_pil(image) + safety_checker_input = self.feature_extractor(feature_extractor_input, return_tensors="pt").to(device) image, has_nsfw_concept = self.safety_checker( images=image, clip_input=safety_checker_input.pixel_values.to(dtype) ) - else: - has_nsfw_concept = None return image, has_nsfw_concept def decode_latents(self, latents): + warnings.warn( + "The decode_latents method is deprecated and will be removed in a future version. 
Please" + " use VaeImageProcessor instead", + FutureWarning, + ) latents = 1 / self.vae.config.scaling_factor * latents - image = self.vae.decode(latents).sample + image = self.vae.decode(latents, return_dict=False)[0] image = (image / 2 + 0.5).clamp(0, 1) # we always cast to float32 as this does not cause significant overhead and is compatible with bfloat16 image = image.cpu().permute(0, 2, 3, 1).float().numpy() @@ -507,75 +540,72 @@ def __call__( callback: Optional[Callable[[int, int, torch.FloatTensor], None]] = None, callback_steps: int = 1, cross_attention_kwargs: Optional[Dict[str, Any]] = None, + guidance_rescale: float = 0.0, ): r""" - Function invoked when calling the pipeline for generation. + The call function to the pipeline for generation. Args: prompt (`str` or `List[str]`, *optional*): - The prompt or prompts to guide the image generation. If not defined, one has to pass `prompt_embeds`. - instead. - height (`int`, *optional*, defaults to self.unet.config.sample_size * self.vae_scale_factor): + The prompt or prompts to guide image generation. If not defined, you need to pass `prompt_embeds`. + height (`int`, *optional*, defaults to `self.unet.config.sample_size * self.vae_scale_factor`): The height in pixels of the generated image. - width (`int`, *optional*, defaults to self.unet.config.sample_size * self.vae_scale_factor): + width (`int`, *optional*, defaults to `self.unet.config.sample_size * self.vae_scale_factor`): The width in pixels of the generated image. num_inference_steps (`int`, *optional*, defaults to 50): The number of denoising steps. More denoising steps usually lead to a higher quality image at the expense of slower inference. guidance_scale (`float`, *optional*, defaults to 7.5): - Guidance scale as defined in [Classifier-Free Diffusion Guidance](https://arxiv.org/abs/2207.12598). - `guidance_scale` is defined as `w` of equation 2. of [Imagen - Paper](https://arxiv.org/pdf/2205.11487.pdf). Guidance scale is enabled by setting `guidance_scale > - 1`. Higher guidance scale encourages to generate images that are closely linked to the text `prompt`, - usually at the expense of lower image quality. + A higher guidance scale value encourages the model to generate images closely linked to the text + `prompt` at the expense of lower image quality. Guidance scale is enabled when `guidance_scale > 1`. negative_prompt (`str` or `List[str]`, *optional*): - The prompt or prompts not to guide the image generation. If not defined, one has to pass - `negative_prompt_embeds`. instead. If not defined, one has to pass `negative_prompt_embeds`. instead. - Ignored when not using guidance (i.e., ignored if `guidance_scale` is less than `1`). + The prompt or prompts to guide what to not include in image generation. If not defined, you need to + pass `negative_prompt_embeds` instead. Ignored when not using guidance (`guidance_scale < 1`). num_images_per_prompt (`int`, *optional*, defaults to 1): The number of images to generate per prompt. eta (`float`, *optional*, defaults to 0.0): - Corresponds to parameter eta (η) in the DDIM paper: https://arxiv.org/abs/2010.02502. Only applies to - [`schedulers.DDIMScheduler`], will be ignored for others. + Corresponds to parameter eta (η) from the [DDIM](https://arxiv.org/abs/2010.02502) paper. Only applies + to the [`~schedulers.DDIMScheduler`], and is ignored in other schedulers. 
generator (`torch.Generator` or `List[torch.Generator]`, *optional*): - One or a list of [torch generator(s)](https://pytorch.org/docs/stable/generated/torch.Generator.html) - to make generation deterministic. + A [`torch.Generator`](https://pytorch.org/docs/stable/generated/torch.Generator.html) to make + generation deterministic. latents (`torch.FloatTensor`, *optional*): - Pre-generated noisy latents, sampled from a Gaussian distribution, to be used as inputs for image + Pre-generated noisy latents sampled from a Gaussian distribution, to be used as inputs for image generation. Can be used to tweak the same generation with different prompts. If not provided, a latents - tensor will ge generated by sampling using the supplied random `generator`. + tensor is generated by sampling using the supplied random `generator`. prompt_embeds (`torch.FloatTensor`, *optional*): - Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not - provided, text embeddings will be generated from `prompt` input argument. + Pre-generated text embeddings. Can be used to easily tweak text inputs (prompt weighting). If not + provided, text embeddings are generated from the `prompt` input argument. negative_prompt_embeds (`torch.FloatTensor`, *optional*): - Pre-generated negative text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt - weighting. If not provided, negative_prompt_embeds will be generated from `negative_prompt` input - argument. + Pre-generated negative text embeddings. Can be used to easily tweak text inputs (prompt weighting). If + not provided, `negative_prompt_embeds` are generated from the `negative_prompt` input argument. output_type (`str`, *optional*, defaults to `"pil"`): - The output format of the generate image. Choose between - [PIL](https://pillow.readthedocs.io/en/stable/): `PIL.Image.Image` or `np.array`. + The output format of the generated image. Choose between `PIL.Image` or `np.array`. return_dict (`bool`, *optional*, defaults to `True`): Whether or not to return a [`~pipelines.stable_diffusion.StableDiffusionPipelineOutput`] instead of a plain tuple. callback (`Callable`, *optional*): - A function that will be called every `callback_steps` steps during inference. The function will be - called with the following arguments: `callback(step: int, timestep: int, latents: torch.FloatTensor)`. + A function that calls every `callback_steps` steps during inference. The function is called with the + following arguments: `callback(step: int, timestep: int, latents: torch.FloatTensor)`. callback_steps (`int`, *optional*, defaults to 1): - The frequency at which the `callback` function will be called. If not specified, the callback will be - called at every step. + The frequency at which the `callback` function is called. If not specified, the callback is called at + every step. cross_attention_kwargs (`dict`, *optional*): - A kwargs dictionary that if specified is passed along to the `AttnProcessor` as defined under - `self.processor` in - [diffusers.cross_attention](https://github.com/huggingface/diffusers/blob/main/src/diffusers/models/cross_attention.py). + A kwargs dictionary that if specified is passed along to the [`AttentionProcessor`] as defined in + [`self.processor`](https://github.com/huggingface/diffusers/blob/main/src/diffusers/models/attention_processor.py). 
+ guidance_rescale (`float`, *optional*, defaults to 0.7): + Guidance rescale factor from [Common Diffusion Noise Schedules and Sample Steps are + Flawed](https://arxiv.org/pdf/2305.08891.pdf). Guidance rescale factor should fix overexposure when + using zero terminal SNR. Examples: Returns: [`~pipelines.stable_diffusion.StableDiffusionPipelineOutput`] or `tuple`: - [`~pipelines.stable_diffusion.StableDiffusionPipelineOutput`] if `return_dict` is True, otherwise a `tuple. - When returning a tuple, the first element is a list with the generated images, and the second element is a - list of `bool`s denoting whether the corresponding generated image likely represents "not-safe-for-work" - (nsfw) content, according to the `safety_checker`. + If `return_dict` is `True`, [`~pipelines.stable_diffusion.StableDiffusionPipelineOutput`] is returned, + otherwise a `tuple` is returned where the first element is a list with the generated images and the + second element is a list of `bool`s indicating whether the corresponding generated image contains + "not-safe-for-work" (nsfw) content. """ # 0. Default height and width to unet height = height or self.unet.config.sample_size * self.vae_scale_factor @@ -601,6 +631,9 @@ def __call__( do_classifier_free_guidance = guidance_scale > 1.0 # 3. Encode input prompt + text_encoder_lora_scale = ( + cross_attention_kwargs.get("scale", None) if cross_attention_kwargs is not None else None + ) prompt_embeds = self._encode_prompt( prompt, device, @@ -609,6 +642,7 @@ def __call__( negative_prompt, prompt_embeds=prompt_embeds, negative_prompt_embeds=negative_prompt_embeds, + lora_scale=text_encoder_lora_scale, ) # 4. Prepare timesteps @@ -616,7 +650,7 @@ def __call__( timesteps = self.scheduler.timesteps # 5. Prepare latent variables - num_channels_latents = self.unet.in_channels + num_channels_latents = self.unet.config.in_channels latents = self.prepare_latents( batch_size * num_images_per_prompt, num_channels_latents, @@ -645,15 +679,20 @@ def __call__( t, encoder_hidden_states=prompt_embeds, cross_attention_kwargs=cross_attention_kwargs, - ).sample + return_dict=False, + )[0] # perform guidance if do_classifier_free_guidance: noise_pred_uncond, noise_pred_text = noise_pred.chunk(2) noise_pred = noise_pred_uncond + guidance_scale * (noise_pred_text - noise_pred_uncond) + if do_classifier_free_guidance and guidance_rescale > 0.0: + # Based on 3.4. in https://arxiv.org/pdf/2305.08891.pdf + noise_pred = rescale_noise_cfg(noise_pred, noise_pred_text, guidance_rescale=guidance_rescale) + # compute the previous noisy sample x_t -> x_t-1 - latents = self.scheduler.step(noise_pred, t, latents, **extra_step_kwargs).prev_sample + latents = self.scheduler.step(noise_pred, t, latents, **extra_step_kwargs, return_dict=False)[0] # call the callback, if provided if i == len(timesteps) - 1 or ((i + 1) > num_warmup_steps and (i + 1) % self.scheduler.order == 0): @@ -661,24 +700,19 @@ def __call__( if callback is not None and i % callback_steps == 0: callback(i, t, latents) - if output_type == "latent": + if not output_type == "latent": + image = self.vae.decode(latents / self.vae.config.scaling_factor, return_dict=False)[0] + image, has_nsfw_concept = self.run_safety_checker(image, device, prompt_embeds.dtype) + else: image = latents has_nsfw_concept = None - elif output_type == "pil": - # 8. Post-processing - image = self.decode_latents(latents) - # 9. Run safety checker - image, has_nsfw_concept = self.run_safety_checker(image, device, prompt_embeds.dtype) - - # 10. 
Convert to PIL - image = self.numpy_to_pil(image) + if has_nsfw_concept is None: + do_denormalize = [True] * image.shape[0] else: - # 8. Post-processing - image = self.decode_latents(latents) + do_denormalize = [not has_nsfw for has_nsfw in has_nsfw_concept] - # 9. Run safety checker - image, has_nsfw_concept = self.run_safety_checker(image, device, prompt_embeds.dtype) + image = self.image_processor.postprocess(image, output_type=output_type, do_denormalize=do_denormalize) # Offload last model to CPU if hasattr(self, "final_offload_hook") and self.final_offload_hook is not None: From 994ba48b515cb037b36fe5c8da54930288ac8cea Mon Sep 17 00:00:00 2001 From: shauray8 Date: Fri, 18 Aug 2023 22:59:18 +0530 Subject: [PATCH 69/98] changes --- .../pipelines/fabric/pipeline_fabric.py | 36 ++--- tests/pipelines/fabric/test_fabric.py | 124 +++++++++--------- 2 files changed, 83 insertions(+), 77 deletions(-) diff --git a/src/diffusers/pipelines/fabric/pipeline_fabric.py b/src/diffusers/pipelines/fabric/pipeline_fabric.py index 3616ef51e964..aca768bafeff 100644 --- a/src/diffusers/pipelines/fabric/pipeline_fabric.py +++ b/src/diffusers/pipelines/fabric/pipeline_fabric.py @@ -1,5 +1,3 @@ -### I'm fucking wrong you dont have to initialize and load stable diffusion ditch that -### do it with raw unet, vae and stuff ' # Copyright 2023 The HuggingFace Team. All rights reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); @@ -346,7 +344,7 @@ def new_forward( return out def preprocess_feedback_images(self, images, vae, device, dtype) -> torch.tensor: - images_t = [self.image_to_tensor(img,dtype) for img in images] + images_t = [self.image_to_tensor(img, dtype) for img in images] images_t = torch.stack(images_t).to(device) latents = ( vae.config.scaling_factor @@ -362,9 +360,7 @@ def decode_latents(self, latents): ) latents = 1 / self.vae.config.scaling_factor * latents image = self.vae.decode(latents, return_dict=False)[0] - image = (image / 2 + 0.5).clamp(0, 1) # we always cast to float32 as this does not cause significant overhead and is compatible with bfloat16 - image = image.cpu().permute(0, 2, 3, 1).float().numpy() return image @torch.no_grad() @@ -375,10 +371,11 @@ def __call__( negative_prompt: Optional[Union[str, List[str]]] = "lowres, bad anatomy, bad hands, cropped, worst quality", liked: Optional[Union[List[str], List[Image.Image]]] = [], disliked: Optional[Union[List[str], List[Image.Image]]] = [], - random_seed: int = 37, + random_seed: Optional[Union[torch.Generator, List[torch.Generator]]] = None, + return_dict: bool = True, n_images: int = 4, guidance_scale: float = 7.0, - denoising_steps: int = 20, + num_inference_steps: int = 20, feedback_start_ratio: float = 0.33, feedback_end_ratio: float = 0.66, min_weight: float = 0.05, @@ -386,6 +383,7 @@ def __call__( neg_scale: float = 0.5, pos_bottleneck_scale: float = 1.0, neg_bottleneck_scale: float = 1.0, + output_type: Optional[str] = "pil", ): r""" Function invoked when calling the pipeline for generation. Generate a trajectory of images with binary feedback. The feedback can be given as a list of liked and disliked images. @@ -413,7 +411,7 @@ def __call__( Paper](https://arxiv.org/pdf/2205.11487.pdf). Guidance scale is enabled by setting `guidance_scale > 1`. Higher guidance scale encourages to generate images that are closely linked to the text `prompt`, usually at the expense of lower image quality. 
- denoising_steps (`int`, *optional*, defaults to 50): + num_inference_steps (`int`, *optional*, defaults to 50): The number of denoising steps. More denoising steps usually lead to a higher quality image at the expense of slower inference. @@ -429,15 +427,12 @@ def __call__( >>> image = pipe(prompt, n_images=4, liked=liked,disliked=disliked).images Returns: - [`~pipelines.stable_diffusion.StableDiffusionPipelineOutput`] or `tuple`: - [`~pipelines.stable_diffusion.StableDiffusionPipelineOutput`] if `return_dict` is True, otherwise a `tuple. + [`~pipelines.fabric.FabricPipelineOutput`] or `tuple`: When returning a tuple, the first element is a list with the generated images, and the second element is a list of `bool`s denoting whether the corresponding generated image likely represents "not-safe-for-work" (nsfw) content, according to the `safety_checker`. """ - if random_seed is not None and random_seed is not torch.Generator: - torch.manual_seed(random_seed) device = self._execution_device dtype = self.text_encoder.dtype @@ -449,10 +444,14 @@ def __call__( if isinstance(prompt, str): prompt = [prompt] * n_images + elif isinstance(prompt, list): + prompt = prompt else: assert len(prompt) == n_images if isinstance(negative_prompt, str): negative_prompt = [negative_prompt] * n_images + elif isinstance(negative_prompt, list): + negative_prompt = negative_prompt else: assert len(negative_prompt) == n_images @@ -461,17 +460,17 @@ def __call__( batched_prompt_embd = torch.cat([cond_prompt_embs, uncond_prompt_embs], dim=0) - self.scheduler.set_timesteps(denoising_steps, device=device) + self.scheduler.set_timesteps(num_inference_steps, device=device) timesteps = self.scheduler.timesteps latent_noise = latent_noise * self.scheduler.init_noise_sigma - num_warmup_steps = len(timesteps) - denoising_steps * self.scheduler.order + num_warmup_steps = len(timesteps) - num_inference_steps * self.scheduler.order ref_start_idx = round(len(timesteps) * feedback_start_ratio) ref_end_idx = round(len(timesteps) * feedback_end_ratio) - with tqdm(total=denoising_steps) as pbar: + with self.progress_bar(total=num_inference_steps) as pbar: for i, t in enumerate(timesteps): sigma = self.scheduler.sigma_t[t] if hasattr(self.scheduler, 'sigma_t') else 0 if hasattr(self.scheduler, "sigmas"): @@ -547,9 +546,12 @@ def __call__( pbar.update() y = self.decode_latents(latent_noise) - imgs = self.image_processor.numpy_to_pil(y) + imgs = self.image_processor.postprocess(y, output_type=output_type) - return FabricPipelineOutput(imgs,False) + if not return_dict: + return (imgs) + + return FabricPipelineOutput(imgs, False) @staticmethod def image_to_tensor(image: Union[str, Image.Image], dtype): diff --git a/tests/pipelines/fabric/test_fabric.py b/tests/pipelines/fabric/test_fabric.py index 23eddb475068..757bcb0f4cf9 100644 --- a/tests/pipelines/fabric/test_fabric.py +++ b/tests/pipelines/fabric/test_fabric.py @@ -14,32 +14,51 @@ # limitations under the License. 
import gc +import tempfile +import time +import traceback import unittest import numpy as np import torch - -from diffusers import AutoencoderKL, DDIMScheduler, FabricPipeline, DPMSolverMultistepScheduler, Transformer2DModel -from diffusers.utils import is_xformers_available, load_numpy, slow, torch_device -from diffusers.utils.testing_utils import enable_full_determinism, require_torch_gpu - -from ..pipeline_params import ( - CLASS_CONDITIONED_IMAGE_GENERATION_BATCH_PARAMS, - CLASS_CONDITIONED_IMAGE_GENERATION_PARAMS, +from huggingface_hub import hf_hub_download +from transformers import CLIPTextConfig, CLIPTextModel, CLIPTokenizer + +from diffusers import ( + AutoencoderKL, + DDIMScheduler, + DPMSolverMultistepScheduler, + EulerAncestralDiscreteScheduler, + EulerDiscreteScheduler, + LMSDiscreteScheduler, + PNDMScheduler, + FabricPipeline, + UNet2DConditionModel, + logging, +) +from diffusers.models.attention_processor import AttnProcessor, LoRAXFormersAttnProcessor +from diffusers.utils import load_numpy, nightly, slow, torch_device +from diffusers.utils.testing_utils import ( + CaptureLogger, + enable_full_determinism, + require_torch_2, + require_torch_gpu, + run_test_in_subprocess, ) -from ..test_pipelines_common import PipelineTesterMixin + +from ..pipeline_params import TEXT_TO_IMAGE_BATCH_PARAMS, TEXT_TO_IMAGE_IMAGE_PARAMS, TEXT_TO_IMAGE_PARAMS +from ..test_pipelines_common import PipelineKarrasSchedulerTesterMixin, PipelineLatentTesterMixin, PipelineTesterMixin enable_full_determinism() class FabricPipelineFastTests( - PipelineLatentTesterMixin, PipelineKarrasSchedulerTesterMixin, PipelineTesterMixin, unittest.TestCase + PipelineTesterMixin, unittest.TestCase ): - pipeline_class = FabricDiffusionPipeline - params = TEXT_TO_IMAGE_PARAMS + pipeline_class = FabricPipeline batch_params = TEXT_TO_IMAGE_BATCH_PARAMS - image_params = TEXT_TO_IMAGE_IMAGE_PARAMS + image_params = TEXT_TO_IMAGE_IMAGE_PARAMS - {'negative_prompt_embeds', 'width', 'prompt_embeds', 'cross_attention_kwargs', 'height'} image_latents_params = TEXT_TO_IMAGE_IMAGE_PARAMS def get_dummy_components(self): @@ -54,6 +73,7 @@ def get_dummy_components(self): up_block_types=("CrossAttnUpBlock2D", "UpBlock2D"), cross_attention_dim=32, ) + torch.manual_seed(0) scheduler = EulerAncestralDiscreteScheduler() torch.manual_seed(0) vae = AutoencoderKL( @@ -95,70 +115,54 @@ def get_dummy_inputs(self, device, seed=0): generator = torch.Generator(device=device).manual_seed(seed) inputs = { "prompt": "A painting of a squirrel eating a burger", - "random_ssed": generator, - "num_images": 1, + "negative_prompt": "lowres, dark, cropped", + "random_seed": generator, + "n_images": 1, + "num_inference_steps":2, + "output_type": "numpy" } return inputs - def test_stable_diffusion_ddim(self): + def test_fabric(self): device = "cpu" # ensure determinism for the device-dependent torch.Generator components = self.get_dummy_components() - sd_pipe = FabricPipeline(**components) - sd_pipe = sd_pipe.to(torch_device) - sd_pipe.set_progress_bar_config(disable=None) + pipe = FabricPipeline(**components) + pipe = pipe.to(torch_device) - inputs = self.get_dummy_inputs(device) - output = sd_pipe(**inputs) - image = output.images + pipe.set_progress_bar_config(disable=True) - image_slice = image[0, -3:, -3:, -1] + inputs = self.get_dummy_inputs(device) + output = pipe(**inputs) + image = np.array(output.images[0]) + image_slice = image[-3:, -3:, -1] + print(image_slice.flatten()/128) + assert image.shape == (128, 128, 3) + expected_slice = 
np.array([0.44185049, 0.06685049, 0.14494485, 0.62536765, 0.16056985, 0.22693015, 0.03474265, 0.10505515, 0.1010723]) - assert image.shape == (1, 64, 64, 3) - expected_slice = np.array([0.5756, 0.6118, 0.5005, 0.5041, 0.5471, 0.4726, 0.4976, 0.4865, 0.4864]) + assert np.abs(image_slice.flatten()/128 - expected_slice).max() < 1e-2 - assert np.abs(image_slice.flatten() - expected_slice).max() < 1e-2 + def test_fabric_w_fb(self): + device = "cpu" # ensure determinism for the device-dependent torch.Generator - def test_stable_diffusion_negative_prompt_embeds(self): components = self.get_dummy_components() - sd_pipe = FabricPipeline(**components) - sd_pipe = sd_pipe.to(torch_device) - sd_pipe = sd_pipe.to(torch_device) - sd_pipe.set_progress_bar_config(disable=None) - - inputs = self.get_dummy_inputs(torch_device) - negative_prompt = 3 * ["this is a negative prompt"] - inputs["negative_prompt"] = negative_prompt - inputs["prompt"] = 3 * [inputs["prompt"]] - - # forward - output = sd_pipe(**inputs) - image_slice_1 = output.images[0, -3:, -3:, -1] - - inputs = self.get_dummy_inputs(torch_device) - prompt = 3 * [inputs.pop("prompt")] - - embeds = [] - for p in [prompt, negative_prompt]: - text_inputs = sd_pipe.tokenizer( - p, - padding="max_length", - max_length=sd_pipe.tokenizer.model_max_length, - truncation=True, - return_tensors="pt", - ) - text_inputs = text_inputs["input_ids"].to(torch_device) + pipe = FabricPipeline(**components) + pipe = pipe.to(torch_device) - embeds.append(sd_pipe.text_encoder(text_inputs)[0]) + pipe.set_progress_bar_config(disable=True) - inputs["prompt_embeds"], inputs["negative_prompt_embeds"] = embeds + inputs = self.get_dummy_inputs(device) + output = pipe(**inputs) + image = output.images[0] + inputs["liked"] = [image] + output = pipe(**inputs) + image_slice = np.array(output.images[0])[-3:, -3:, -1] - # forward - output = sd_pipe(**inputs) - image_slice_2 = output.images[0, -3:, -3:, -1] + assert image.shape == (128, 128, 3) + expected_slice = np.array([0.77254902, 0.77647059, 0.78431373, 0.8, 0.78823529, 0.79607843, 0.78823529, 0.78823529, 0.78039216]) - assert np.abs(image_slice_1.flatten() - image_slice_2.flatten()).max() < 1e-4 + assert np.abs(image_slice.flatten()/128 - expected_slice).max() < 1e-2 @require_torch_gpu From 0025a206f66fac36f0c1ea09fcaa834f25527d8d Mon Sep 17 00:00:00 2001 From: shauray8 Date: Sat, 19 Aug 2023 04:02:31 +0530 Subject: [PATCH 70/98] test passing --- tests/pipelines/fabric/test_fabric.py | 50 +++++++++++++++------------ 1 file changed, 27 insertions(+), 23 deletions(-) diff --git a/tests/pipelines/fabric/test_fabric.py b/tests/pipelines/fabric/test_fabric.py index 757bcb0f4cf9..0ef4067ce329 100644 --- a/tests/pipelines/fabric/test_fabric.py +++ b/tests/pipelines/fabric/test_fabric.py @@ -20,6 +20,7 @@ import unittest import numpy as np +from PIL import Image import torch from huggingface_hub import hf_hub_download from transformers import CLIPTextConfig, CLIPTextModel, CLIPTokenizer @@ -57,9 +58,15 @@ class FabricPipelineFastTests( PipelineTesterMixin, unittest.TestCase ): pipeline_class = FabricPipeline - batch_params = TEXT_TO_IMAGE_BATCH_PARAMS - image_params = TEXT_TO_IMAGE_IMAGE_PARAMS - {'negative_prompt_embeds', 'width', 'prompt_embeds', 'cross_attention_kwargs', 'height'} - image_latents_params = TEXT_TO_IMAGE_IMAGE_PARAMS + params = TEXT_TO_IMAGE_PARAMS - {'negative_prompt_embeds', 'width', 'prompt_embeds', 'cross_attention_kwargs', 'height','callback', 'callback_steps'} + batch_params = TEXT_TO_IMAGE_BATCH_PARAMS + 
image_params = TEXT_TO_IMAGE_IMAGE_PARAMS + required_optional_params = PipelineTesterMixin.required_optional_params - { + "latents", + "num_images_per_prompt", + "callback", + "callback_steps", + } def get_dummy_components(self): torch.manual_seed(0) @@ -109,17 +116,14 @@ def get_dummy_components(self): return components def get_dummy_inputs(self, device, seed=0): - if str(device).startswith("mps"): - generator = torch.manual_seed(seed) - else: - generator = torch.Generator(device=device).manual_seed(seed) + generator = torch.manual_seed(seed) inputs = { "prompt": "A painting of a squirrel eating a burger", "negative_prompt": "lowres, dark, cropped", - "random_seed": generator, - "n_images": 1, + "generator": generator, + "num_images": 1, "num_inference_steps":2, - "output_type": "numpy" + "output_type": "np", } return inputs @@ -128,19 +132,19 @@ def test_fabric(self): components = self.get_dummy_components() pipe = FabricPipeline(**components) - pipe = pipe.to(torch_device) + pipe = pipe.to(device) pipe.set_progress_bar_config(disable=True) inputs = self.get_dummy_inputs(device) output = pipe(**inputs) - image = np.array(output.images[0]) - image_slice = image[-3:, -3:, -1] - print(image_slice.flatten()/128) - assert image.shape == (128, 128, 3) - expected_slice = np.array([0.44185049, 0.06685049, 0.14494485, 0.62536765, 0.16056985, 0.22693015, 0.03474265, 0.10505515, 0.1010723]) + image = output.images + image_slice = image[0, -3:, -3:, -1] + print(image_slice.flatten()) + assert image.shape == (1, 128, 128, 3) + expected_slice = np.array([0.46241423, 0.45808375, 0.4768011, 0.48806447, 0.46090087, 0.5161956, 0.52250206, 0.50051796, 0.4663524]) - assert np.abs(image_slice.flatten()/128 - expected_slice).max() < 1e-2 + assert np.abs(image_slice.flatten() - expected_slice).max() < 1e-2 def test_fabric_w_fb(self): @@ -148,18 +152,18 @@ def test_fabric_w_fb(self): components = self.get_dummy_components() pipe = FabricPipeline(**components) - pipe = pipe.to(torch_device) + pipe = pipe.to(device) pipe.set_progress_bar_config(disable=True) inputs = self.get_dummy_inputs(device) + inputs["liked"] = [Image.fromarray(np.ones((512,512)))] output = pipe(**inputs) - image = output.images[0] - inputs["liked"] = [image] - output = pipe(**inputs) - image_slice = np.array(output.images[0])[-3:, -3:, -1] + image = output.images + image_slice = output.images[0, -3:, -3:, -1] - assert image.shape == (128, 128, 3) + assert image.shape == (1, 128, 128, 3) + print(image_slice) expected_slice = np.array([0.77254902, 0.77647059, 0.78431373, 0.8, 0.78823529, 0.79607843, 0.78823529, 0.78823529, 0.78039216]) assert np.abs(image_slice.flatten()/128 - expected_slice).max() < 1e-2 From 04697a01644114ab93346ef7eb07a12de4743cac Mon Sep 17 00:00:00 2001 From: shauray8 Date: Sat, 19 Aug 2023 04:05:58 +0530 Subject: [PATCH 71/98] pipeline changes --- .../pipelines/fabric/pipeline_fabric.py | 32 ++++++++++--------- 1 file changed, 17 insertions(+), 15 deletions(-) diff --git a/src/diffusers/pipelines/fabric/pipeline_fabric.py b/src/diffusers/pipelines/fabric/pipeline_fabric.py index aca768bafeff..ea1aa981242e 100644 --- a/src/diffusers/pipelines/fabric/pipeline_fabric.py +++ b/src/diffusers/pipelines/fabric/pipeline_fabric.py @@ -217,7 +217,8 @@ def initialize_prompts(self, prompts: List[str], device): prompt_embed_list.append(prompt_embd) - return torch.cat(prompt_embed_list, dim=0) + all_prompt_embed = torch.cat(prompt_embed_list, dim=0) + return all_prompt_embed def get_unet_hidden_states(self, z_all, t, prompt_embd): 
cached_hidden_states = [] @@ -371,9 +372,9 @@ def __call__( negative_prompt: Optional[Union[str, List[str]]] = "lowres, bad anatomy, bad hands, cropped, worst quality", liked: Optional[Union[List[str], List[Image.Image]]] = [], disliked: Optional[Union[List[str], List[Image.Image]]] = [], - random_seed: Optional[Union[torch.Generator, List[torch.Generator]]] = None, + generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None, return_dict: bool = True, - n_images: int = 4, + num_images: int = 4, guidance_scale: float = 7.0, num_inference_steps: int = 20, feedback_start_ratio: float = 0.33, @@ -384,6 +385,7 @@ def __call__( pos_bottleneck_scale: float = 1.0, neg_bottleneck_scale: float = 1.0, output_type: Optional[str] = "pil", + latents: Optional[torch.FloatTensor] = None, ): r""" Function invoked when calling the pipeline for generation. Generate a trajectory of images with binary feedback. The feedback can be given as a list of liked and disliked images. @@ -400,10 +402,10 @@ def __call__( Liked enables feedback through images, encourages images with liked features. disliked (`List[Image.Image]` or `List[str]`, *optional*): Disliked enables feedback through images, discourages images with disliked features. - random_seed (`torch.Generator` or `List[torch.Generator]` or `int`, *optional*): + generator (`torch.Generator` or `List[torch.Generator]` or `int`, *optional*): One or a list of [torch generator(s)](https://pytorch.org/docs/stable/generated/torch.Generator.html), can be int. to make generation deterministic. - n_images (`int`, *optional*, defaults to 1): + num_images (`int`, *optional*, defaults to 1): The number of images to generate per prompt. guidance_scale (`float`, *optional*, defaults to 7.5): Guidance scale as defined in [Classifier-Free Diffusion Guidance](https://arxiv.org/abs/2207.12598). 
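For context, a minimal usage sketch matching the call signature introduced above (`num_images`, `num_inference_steps`, `generator`); it assumes the same "dreamlike-art/dreamlike-photoreal-2.0" checkpoint exercised in src/test.py and empty feedback lists, and is illustrative only, not code carried by this patch:

import torch
from diffusers import FabricPipeline

# Sketch only: assumes the checkpoint used in src/test.py is reachable on the Hub.
pipe = FabricPipeline.from_pretrained(
    "dreamlike-art/dreamlike-photoreal-2.0", torch_dtype=torch.float16
)
pipe = pipe.to("cuda")

# Empty lists disable image feedback; in practice these hold PIL images from earlier rounds.
liked, disliked = [], []

images = pipe(
    prompt="a giant standing in a fantasy landscape, best quality",
    liked=liked,
    disliked=disliked,
    num_images=4,
    num_inference_steps=20,
    generator=torch.Generator(device="cuda").manual_seed(0),
).images
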
@@ -424,7 +426,7 @@ def __call__( >>> prompt = "a giant standing in a fantasy landscape best quality" >>> liked = [] >>> disliked = [] - >>> image = pipe(prompt, n_images=4, liked=liked,disliked=disliked).images + >>> image = pipe(prompt, num_images=4, liked=liked,disliked=disliked).images Returns: [`~pipelines.fabric.FabricPipelineOutput`] or `tuple`: @@ -435,28 +437,28 @@ def __call__( """ device = self._execution_device - dtype = self.text_encoder.dtype + dtype = self.unet.dtype - latent_noise = torch.randn(n_images, 4, 64, 64, device=device, dtype=dtype) + latent_noise = torch.randn(num_images, 4, 64, 64, device=device, dtype=dtype) positive_latents = self.preprocess_feedback_images(liked,self.vae,device, dtype) if liked and len(liked)>0 else torch.tensor([], device=device, dtype=dtype) negative_latents = self.preprocess_feedback_images(disliked,self.vae,device, dtype) if disliked and len(disliked)>0 else torch.tensor([], device=device, dtype=dtype) if isinstance(prompt, str): - prompt = [prompt] * n_images + prompt = [prompt] * num_images elif isinstance(prompt, list): prompt = prompt else: - assert len(prompt) == n_images + assert len(prompt) == num_images if isinstance(negative_prompt, str): - negative_prompt = [negative_prompt] * n_images + negative_prompt = [negative_prompt] * num_images elif isinstance(negative_prompt, list): negative_prompt = negative_prompt else: - assert len(negative_prompt) == n_images + assert len(negative_prompt) == num_images - (cond_prompt_embs, uncond_prompt_embs, null_prompt_emb) = self.initialize_prompts(prompt + negative_prompt + [""], device).split([n_images, n_images, 1]) + (cond_prompt_embs, uncond_prompt_embs, null_prompt_emb) = self.initialize_prompts(prompt + negative_prompt + [""], device).split([num_images, num_images, 1]) batched_prompt_embd = torch.cat([cond_prompt_embs, uncond_prompt_embs], dim=0) @@ -511,10 +513,10 @@ def __call__( cached_pos, cached_neg = hs.split([n_pos, n_neg], dim=0) cached_pos = cached_pos.view( 1, -1, *cached_pos.shape[2:] - ).expand(n_images, -1, -1) + ).expand(num_images, -1, -1) cached_neg = cached_neg.view( 1, -1, *cached_neg.shape[2:] - ).expand(n_images, -1, -1) + ).expand(num_images, -1, -1) cached_pos_hs.append(cached_pos) cached_neg_hs.append(cached_neg) From 79be7ffb142bcdfe22e165dc615236e8883eba9c Mon Sep 17 00:00:00 2001 From: shauray8 Date: Sat, 19 Aug 2023 04:35:24 +0530 Subject: [PATCH 72/98] before quality --- .../pipelines/fabric/pipeline_fabric.py | 46 ++++++++++++++++--- tests/pipelines/fabric/test_fabric.py | 9 +--- 2 files changed, 42 insertions(+), 13 deletions(-) diff --git a/src/diffusers/pipelines/fabric/pipeline_fabric.py b/src/diffusers/pipelines/fabric/pipeline_fabric.py index ea1aa981242e..0b50e5fe70e2 100644 --- a/src/diffusers/pipelines/fabric/pipeline_fabric.py +++ b/src/diffusers/pipelines/fabric/pipeline_fabric.py @@ -364,6 +364,30 @@ def decode_latents(self, latents): # we always cast to float32 as this does not cause significant overhead and is compatible with bfloat16 return image + def check_inputs( + self, + prompt, + negative_prompt=None, + liked = None, + disliked = None, + ): + if prompt is None: + raise ValueError( + "Provide `prompt`. Cannot leave both `prompt` undefined." 
+ ) + elif prompt is not None and (not isinstance(prompt, str) and not isinstance(prompt, list)): + raise ValueError(f"`prompt` has to be of type `str` or `list` but is {type(prompt)}") + + if negative_prompt is not None and (not isinstance(negative_prompt, str) and not isinstance(negative_prompt, list)): + raise ValueError(f"`negative_prompt` has to be of type `str` or `list` but is {type(negative_prompt)}") + + if liked is not None and not isinstance(liked, list): + raise ValueError(f"`liked` has to be of type `list` but is {type(liked)}") + + if disliked is not None and not isinstance(disliked, list): + raise ValueError(f"`disliked` has to be of type `list` but is {type(disliked)}") + + @torch.no_grad() def __call__( @@ -436,6 +460,13 @@ def __call__( """ + self.check_inputs( + prompt, + negative_prompt, + liked, + disliked + ) + device = self._execution_device dtype = self.unet.dtype @@ -444,12 +475,15 @@ def __call__( positive_latents = self.preprocess_feedback_images(liked,self.vae,device, dtype) if liked and len(liked)>0 else torch.tensor([], device=device, dtype=dtype) negative_latents = self.preprocess_feedback_images(disliked,self.vae,device, dtype) if disliked and len(disliked)>0 else torch.tensor([], device=device, dtype=dtype) - if isinstance(prompt, str): - prompt = [prompt] * num_images - elif isinstance(prompt, list): - prompt = prompt + if isinstance(prompt, str) and prompt is not None: + batch_size = 1 + elif isinstance(prompt, list) and prompt is not None: + batch_size = len(prompt) else: - assert len(prompt) == num_images + batch_size = None + + prompt = [prompt] * num_images + if isinstance(negative_prompt, str): negative_prompt = [negative_prompt] * num_images elif isinstance(negative_prompt, list): @@ -458,7 +492,7 @@ def __call__( assert len(negative_prompt) == num_images - (cond_prompt_embs, uncond_prompt_embs, null_prompt_emb) = self.initialize_prompts(prompt + negative_prompt + [""], device).split([num_images, num_images, 1]) + (cond_prompt_embs, uncond_prompt_embs, null_prompt_emb) = self.initialize_prompts(prompt + negative_prompt + [""], device).split([num_images, num_images, batch_size*num_images]) batched_prompt_embd = torch.cat([cond_prompt_embs, uncond_prompt_embs], dim=0) diff --git a/tests/pipelines/fabric/test_fabric.py b/tests/pipelines/fabric/test_fabric.py index 0ef4067ce329..6f7960184e9a 100644 --- a/tests/pipelines/fabric/test_fabric.py +++ b/tests/pipelines/fabric/test_fabric.py @@ -27,17 +27,12 @@ from diffusers import ( AutoencoderKL, - DDIMScheduler, - DPMSolverMultistepScheduler, EulerAncestralDiscreteScheduler, - EulerDiscreteScheduler, - LMSDiscreteScheduler, - PNDMScheduler, FabricPipeline, UNet2DConditionModel, logging, ) -from diffusers.models.attention_processor import AttnProcessor, LoRAXFormersAttnProcessor +from diffusers.models.attention_processor import AttnProcessor from diffusers.utils import load_numpy, nightly, slow, torch_device from diffusers.utils.testing_utils import ( CaptureLogger, @@ -48,7 +43,7 @@ ) from ..pipeline_params import TEXT_TO_IMAGE_BATCH_PARAMS, TEXT_TO_IMAGE_IMAGE_PARAMS, TEXT_TO_IMAGE_PARAMS -from ..test_pipelines_common import PipelineKarrasSchedulerTesterMixin, PipelineLatentTesterMixin, PipelineTesterMixin +from ..test_pipelines_common import PipelineTesterMixin enable_full_determinism() From ba8aa35e96ff9912cb3fb3e84b3ce219c09643a2 Mon Sep 17 00:00:00 2001 From: shauray8 Date: Sat, 19 Aug 2023 04:41:30 +0530 Subject: [PATCH 73/98] quality checks pass --- src/diffusers/pipelines/__init__.py | 3 +- 
src/diffusers/pipelines/fabric/__init__.py | 11 +- .../pipelines/fabric/pipeline_fabric.py | 250 +++++++----------- src/test.py | 13 - tests/pipelines/fabric/test_fabric.py | 50 ++-- 5 files changed, 132 insertions(+), 195 deletions(-) delete mode 100644 src/test.py diff --git a/src/diffusers/pipelines/__init__.py b/src/diffusers/pipelines/__init__.py index 44f7e15e2a1b..a61b601c7e7c 100644 --- a/src/diffusers/pipelines/__init__.py +++ b/src/diffusers/pipelines/__init__.py @@ -52,8 +52,6 @@ StableDiffusionControlNetPipeline, StableDiffusionXLControlNetPipeline, ) - from .fabric import FabricPipeline - from .deepfloyd_if import ( IFImg2ImgPipeline, IFImg2ImgSuperResolutionPipeline, @@ -62,6 +60,7 @@ IFPipeline, IFSuperResolutionPipeline, ) + from .fabric import FabricPipeline from .kandinsky import ( KandinskyCombinedPipeline, KandinskyImg2ImgCombinedPipeline, diff --git a/src/diffusers/pipelines/fabric/__init__.py b/src/diffusers/pipelines/fabric/__init__.py index ca8e828d2ae6..a202d7fbf512 100644 --- a/src/diffusers/pipelines/fabric/__init__.py +++ b/src/diffusers/pipelines/fabric/__init__.py @@ -1,12 +1,15 @@ from dataclasses import dataclass +from typing import List, Optional, Union + +import numpy as np +import PIL + from ...utils import ( BaseOutput, OptionalDependencyNotAvailable, is_torch_available, ) -from typing import Union, Optional, List -import numpy as np -import PIL + @dataclass class FabricPipelineOutput(BaseOutput): @@ -25,6 +28,7 @@ class FabricPipelineOutput(BaseOutput): images: Union[List[PIL.Image.Image], np.ndarray] nsfw_content_detected: Optional[List[bool]] + try: if not is_torch_available(): raise OptionalDependencyNotAvailable() @@ -32,4 +36,3 @@ class FabricPipelineOutput(BaseOutput): from ...utils.dummy_torch_and_transformers_objects import * # noqa F403 else: from .pipeline_fabric import FabricPipeline - diff --git a/src/diffusers/pipelines/fabric/pipeline_fabric.py b/src/diffusers/pipelines/fabric/pipeline_fabric.py index 0b50e5fe70e2..5f4dd555c633 100644 --- a/src/diffusers/pipelines/fabric/pipeline_fabric.py +++ b/src/diffusers/pipelines/fabric/pipeline_fabric.py @@ -11,45 +11,33 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
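The `pipeline_fabric.py` hunks that follow are largely style reformatting around the cross-attention processor that carries the FABRIC mechanism: attention keys/values coming from cached feedback images receive extra (or reduced) weight, and each attention row is renormalized afterwards. A minimal, framework-free sketch of that idea, using illustrative tensor names rather than the diffusers `Attention` API:

```py
import torch

def weighted_attention(q, k, v, weights):
    # q: (batch, n_query, dim); k, v: (batch, n_key, dim); weights: (batch, n_key)
    scores = q @ k.transpose(-1, -2) / q.shape[-1] ** 0.5
    probs = scores.softmax(dim=-1)
    probs = probs * weights[:, None, :]              # up-/down-weight the feedback keys
    probs = probs / probs.sum(dim=-1, keepdim=True)  # renormalize each attention row
    return probs @ v
```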
-from dataclasses import dataclass -from typing import Any, Dict, List, Optional, Tuple, Union -from packaging import version +import warnings +from typing import List, Optional, Union +import numpy as np import torch -from torch import nn -from torch.nn import functional as F +from packaging import version from PIL import Image -import numpy as np -import warnings -from tqdm import tqdm - -from transformers import CLIPImageProcessor, CLIPTextModel, CLIPTokenizer +from transformers import CLIPTextModel, CLIPTokenizer +from ...configuration_utils import FrozenDict from ...image_processor import VaeImageProcessor -from ...loaders import FromSingleFileMixin, LoraLoaderMixin, TextualInversionLoaderMixin from ...models import AutoencoderKL, UNet2DConditionModel +from ...models.attention import BasicTransformerBlock +from ...models.attention_processor import LoRAAttnProcessor +from ...schedulers import EulerAncestralDiscreteScheduler from ...utils import ( deprecate, - is_accelerate_available, - is_accelerate_version, logging, - randn_tensor, - replace_example_docstring, - BaseOutput, ) - -from ...configuration_utils import ConfigMixin, register_to_config -from ...models.attention_processor import LoRAAttnProcessor -from ...models.attention import BasicTransformerBlock -from ...schedulers import EulerAncestralDiscreteScheduler -from . import FabricPipelineOutput - from ..pipeline_utils import DiffusionPipeline +from . import FabricPipelineOutput logger = logging.get_logger(__name__) # pylint: disable=invalid-name -class CrossAttnProcessor(): + +class CrossAttnProcessor: def __init__(self): self.attntion_probs = None @@ -64,18 +52,12 @@ def __call__( ): print("in") batch_size, sequence_length, _ = ( - hidden_states.shape - if encoder_hidden_states is None - else encoder_hidden_states.shape - ) - attention_mask = attn.prepare_attention_mask( - attention_mask, sequence_length, batch_size + hidden_states.shape if encoder_hidden_states is None else encoder_hidden_states.shape ) + attention_mask = attn.prepare_attention_mask(attention_mask, sequence_length, batch_size) if isinstance(attn.processor, LoRAAttnProcessor): - query = attn.to_q(hidden_states) + lora_scale * attn.processor.to_q_lora( - hidden_states - ) + query = attn.to_q(hidden_states) + lora_scale * attn.processor.to_q_lora(hidden_states) else: query = attn.to_q(hidden_states) @@ -85,12 +67,8 @@ def __call__( encoder_hidden_states = attn.norm_encoder_hidden_states(encoder_hidden_states) if isinstance(attn.processor, LoRAAttnProcessor): - key = attn.to_k(encoder_hidden_states) + lora_scale * attn.processor.to_k_lora( - encoder_hidden_states - ) - value = attn.to_v( - encoder_hidden_states - ) + lora_scale * attn.processor.to_v_lora(encoder_hidden_states) + key = attn.to_k(encoder_hidden_states) + lora_scale * attn.processor.to_k_lora(encoder_hidden_states) + value = attn.to_v(encoder_hidden_states) + lora_scale * attn.processor.to_v_lora(encoder_hidden_states) else: key = attn.to_k(encoder_hidden_states) value = attn.to_v(encoder_hidden_states) @@ -112,9 +90,7 @@ def __call__( # linear proj if isinstance(attn.processor, LoRAAttnProcessor): - hidden_states = attn.to_out[0]( - hidden_states - ) + lora_scale * attn.processor.to_out_lora(hidden_states) + hidden_states = attn.to_out[0](hidden_states) + lora_scale * attn.processor.to_out_lora(hidden_states) else: hidden_states = attn.to_out[0](hidden_states) # dropout @@ -122,10 +98,10 @@ def __call__( return hidden_states + class FabricPipeline(DiffusionPipeline): r""" - Pipeline for 
text-to-image generation using Stable Diffusion and conditioning the results - using feedback images. + Pipeline for text-to-image generation using Stable Diffusion and conditioning the results using feedback images. This model inherits from [`DiffusionPipeline`]. Check the superclass documentation for the generic methods the library implements for all the pipelines (such as downloading or saving, running on a particular device, etc.) @@ -148,6 +124,7 @@ class FabricPipeline(DiffusionPipeline): Classification module that estimates whether generated images could be considered offensive or harmful. Please, refer to the [model card](https://huggingface.co/runwayml/stable-diffusion-v1-5) for details. """ + def __init__( self, vae: AutoencoderKL, @@ -155,7 +132,7 @@ def __init__( tokenizer: CLIPTokenizer, unet: UNet2DConditionModel, scheduler: EulerAncestralDiscreteScheduler, - requires_safety_checker:bool = True, + requires_safety_checker: bool = True, ): super().__init__() @@ -182,42 +159,44 @@ def __init__( unet._internal_dict = FrozenDict(new_config) self.register_modules( - unet = unet, - vae = vae, - text_encoder = text_encoder, - tokenizer = tokenizer, - scheduler = scheduler, + unet=unet, + vae=vae, + text_encoder=text_encoder, + tokenizer=tokenizer, + scheduler=scheduler, ) self.vae_scale_factor = 2 ** (len(self.vae.config.block_out_channels) - 1) self.image_processor = VaeImageProcessor(vae_scale_factor=self.vae_scale_factor) - def initialize_prompts(self, prompts: List[str], device): # Breaking into individual prompts feels memory efficient prompt_embed_list = [] for prompt in prompts: - prompt_tokens = self.tokenizer( - prompt, - return_tensors="pt", - max_length=self.tokenizer.model_max_length, - padding="max_length", - truncation=True, - ) - - attention_mask = prompt_tokens.attention_mask.to(device) if ( - hasattr(self.text_encoder.config, "use_attention_mask") - and self.text_encoder.config.use_attention_mask - ) else None + prompt_tokens = self.tokenizer( + prompt, + return_tensors="pt", + max_length=self.tokenizer.model_max_length, + padding="max_length", + truncation=True, + ) + attention_mask = ( + prompt_tokens.attention_mask.to(device) + if ( + hasattr(self.text_encoder.config, "use_attention_mask") + and self.text_encoder.config.use_attention_mask + ) + else None + ) - prompt_embd = self.text_encoder( - input_ids=prompt_tokens.input_ids.to(device), - attention_mask=attention_mask, - ).last_hidden_state + prompt_embd = self.text_encoder( + input_ids=prompt_tokens.input_ids.to(device), + attention_mask=attention_mask, + ).last_hidden_state - prompt_embed_list.append(prompt_embd) + prompt_embed_list.append(prompt_embd) - all_prompt_embed = torch.cat(prompt_embed_list, dim=0) + all_prompt_embed = torch.cat(prompt_embed_list, dim=0) return all_prompt_embed def get_unet_hidden_states(self, z_all, t, prompt_embd): @@ -256,10 +235,8 @@ def unet_forward_with_cached_hidden_states( if cached_pos_hiddens is None and cached_neg_hiddens is None: return self.unet(z_all, t, encoder_hidden_states=prompt_embd) - local_pos_weights = torch.linspace( - *pos_weights, steps=len(self.unet.down_blocks) + 1)[:-1].tolist() - local_neg_weights = torch.linspace( - *neg_weights, steps=len(self.unet.down_blocks) + 1)[:-1].tolist() + local_pos_weights = torch.linspace(*pos_weights, steps=len(self.unet.down_blocks) + 1)[:-1].tolist() + local_neg_weights = torch.linspace(*neg_weights, steps=len(self.unet.down_blocks) + 1)[:-1].tolist() for block, pos_weight, neg_weight in zip( self.unet.down_blocks + 
[self.unet.mid_block] + self.unet.up_blocks, local_pos_weights + [pos_weights[1]] + local_pos_weights[::-1], @@ -279,23 +256,15 @@ def new_forward( batch_size, d_model = cond_hiddens.shape[:2] device, dtype = hidden_states.device, hidden_states.dtype - weights = torch.ones( - batch_size, d_model, device=device, dtype=dtype - ) + weights = torch.ones(batch_size, d_model, device=device, dtype=dtype) out_pos = self.old_forward(hidden_states) out_neg = self.old_forward(hidden_states) if cached_pos_hiddens is not None: - cached_pos_hs = cached_pos_hiddens.pop(0).to( - hidden_states.device - ) - cond_pos_hs = torch.cat( - [cond_hiddens, cached_pos_hs], dim=1 - ) - pos_weights = weights.clone().repeat( - 1, 1 + cached_pos_hs.shape[1] // d_model - ) + cached_pos_hs = cached_pos_hiddens.pop(0).to(hidden_states.device) + cond_pos_hs = torch.cat([cond_hiddens, cached_pos_hs], dim=1) + pos_weights = weights.clone().repeat(1, 1 + cached_pos_hs.shape[1] // d_model) pos_weights[:, d_model:] = pos_weight attn_with_weights = CrossAttnProcessor() out_pos = attn_with_weights( @@ -308,15 +277,9 @@ def new_forward( out_pos = self.old_forward(cond_hiddens) if cached_neg_hiddens is not None: - cached_neg_hs = cached_neg_hiddens.pop(0).to( - hidden_states.device - ) - uncond_neg_hs = torch.cat( - [uncond_hiddens, cached_neg_hs], dim=1 - ) - neg_weights = weights.clone().repeat( - 1, 1 + cached_neg_hs.shape[1] // d_model - ) + cached_neg_hs = cached_neg_hiddens.pop(0).to(hidden_states.device) + uncond_neg_hs = torch.cat([uncond_hiddens, cached_neg_hs], dim=1) + neg_weights = weights.clone().repeat(1, 1 + cached_neg_hs.shape[1] // d_model) neg_weights[:, d_model:] = neg_weight attn_with_weights = CrossAttnProcessor() out_neg = attn_with_weights( @@ -347,10 +310,7 @@ def new_forward( def preprocess_feedback_images(self, images, vae, device, dtype) -> torch.tensor: images_t = [self.image_to_tensor(img, dtype) for img in images] images_t = torch.stack(images_t).to(device) - latents = ( - vae.config.scaling_factor - * vae.encode(images_t).latent_dist.sample() - ) + latents = vae.config.scaling_factor * vae.encode(images_t).latent_dist.sample() return latents def decode_latents(self, latents): @@ -368,17 +328,17 @@ def check_inputs( self, prompt, negative_prompt=None, - liked = None, - disliked = None, + liked=None, + disliked=None, ): if prompt is None: - raise ValueError( - "Provide `prompt`. Cannot leave both `prompt` undefined." - ) + raise ValueError("Provide `prompt`. 
Cannot leave both `prompt` undefined.") elif prompt is not None and (not isinstance(prompt, str) and not isinstance(prompt, list)): raise ValueError(f"`prompt` has to be of type `str` or `list` but is {type(prompt)}") - if negative_prompt is not None and (not isinstance(negative_prompt, str) and not isinstance(negative_prompt, list)): + if negative_prompt is not None and ( + not isinstance(negative_prompt, str) and not isinstance(negative_prompt, list) + ): raise ValueError(f"`negative_prompt` has to be of type `str` or `list` but is {type(negative_prompt)}") if liked is not None and not isinstance(liked, list): @@ -387,9 +347,7 @@ def check_inputs( if disliked is not None and not isinstance(disliked, list): raise ValueError(f"`disliked` has to be of type `list` but is {type(disliked)}") - @torch.no_grad() - def __call__( self, prompt: Optional[Union[str, List[str]]] = "", @@ -404,7 +362,7 @@ def __call__( feedback_start_ratio: float = 0.33, feedback_end_ratio: float = 0.66, min_weight: float = 0.05, - max_weight: float = .8, + max_weight: float = 0.8, neg_scale: float = 0.5, pos_bottleneck_scale: float = 1.0, neg_bottleneck_scale: float = 1.0, @@ -412,7 +370,8 @@ def __call__( latents: Optional[torch.FloatTensor] = None, ): r""" - Function invoked when calling the pipeline for generation. Generate a trajectory of images with binary feedback. The feedback can be given as a list of liked and disliked images. + Function invoked when calling the pipeline for generation. Generate a trajectory of images with binary + feedback. The feedback can be given as a list of liked and disliked images. Args: prompt (`str` or `List[str]`, *optional*): @@ -427,8 +386,8 @@ def __call__( disliked (`List[Image.Image]` or `List[str]`, *optional*): Disliked enables feedback through images, discourages images with disliked features. generator (`torch.Generator` or `List[torch.Generator]` or `int`, *optional*): - One or a list of [torch generator(s)](https://pytorch.org/docs/stable/generated/torch.Generator.html), can be int. - to make generation deterministic. + One or a list of [torch generator(s)](https://pytorch.org/docs/stable/generated/torch.Generator.html), + can be int. to make generation deterministic. num_images (`int`, *optional*, defaults to 1): The number of images to generate per prompt. guidance_scale (`float`, *optional*, defaults to 7.5): @@ -442,38 +401,35 @@ def __call__( expense of slower inference. 
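The `feedback_start_ratio`, `feedback_end_ratio`, `min_weight`, and `max_weight` arguments in the signature above gate when, and how strongly, the cached feedback influences denoising. The weight computation itself sits outside the hunks shown here; one plausible reading consistent with the defaults, written as a hypothetical helper:

```py
def feedback_weight(step, num_steps, start=0.33, end=0.66, min_w=0.05, max_w=0.8):
    # Hypothetical helper: full feedback strength only inside the
    # [start, end] fraction of the denoising trajectory, a small floor elsewhere.
    frac = step / num_steps
    return max_w if start <= frac <= end else min_w
```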
Examples: - >>> from diffusers import FabricPipeline - >>> import torch - >>> model_id = "dreamlike-art/dreamlike-photoreal-2.0" - >>> pipe = FabricPipeline(model_id, torch_dtype = torch.float16) - >>> pipe = pipe.to("cuda") - >>> prompt = "a giant standing in a fantasy landscape best quality" - >>> liked = [] - >>> disliked = [] - >>> image = pipe(prompt, num_images=4, liked=liked,disliked=disliked).images + >>> from diffusers import FabricPipeline >>> import torch >>> model_id = + "dreamlike-art/dreamlike-photoreal-2.0" >>> pipe = FabricPipeline(model_id, torch_dtype = torch.float16) + >>> pipe = pipe.to("cuda") >>> prompt = "a giant standing in a fantasy landscape best quality" >>> liked = + [] >>> disliked = [] >>> image = pipe(prompt, num_images=4, liked=liked,disliked=disliked).images Returns: - [`~pipelines.fabric.FabricPipelineOutput`] or `tuple`: - When returning a tuple, the first element is a list with the generated images, and the second element is a - list of `bool`s denoting whether the corresponding generated image likely represents "not-safe-for-work" - (nsfw) content, according to the `safety_checker`. + [`~pipelines.fabric.FabricPipelineOutput`] or `tuple`: When returning a tuple, the first element is a list + with the generated images, and the second element is a list of `bool`s denoting whether the corresponding + generated image likely represents "not-safe-for-work" (nsfw) content, according to the `safety_checker`. """ - self.check_inputs( - prompt, - negative_prompt, - liked, - disliked - ) + self.check_inputs(prompt, negative_prompt, liked, disliked) device = self._execution_device dtype = self.unet.dtype latent_noise = torch.randn(num_images, 4, 64, 64, device=device, dtype=dtype) - positive_latents = self.preprocess_feedback_images(liked,self.vae,device, dtype) if liked and len(liked)>0 else torch.tensor([], device=device, dtype=dtype) - negative_latents = self.preprocess_feedback_images(disliked,self.vae,device, dtype) if disliked and len(disliked)>0 else torch.tensor([], device=device, dtype=dtype) + positive_latents = ( + self.preprocess_feedback_images(liked, self.vae, device, dtype) + if liked and len(liked) > 0 + else torch.tensor([], device=device, dtype=dtype) + ) + negative_latents = ( + self.preprocess_feedback_images(disliked, self.vae, device, dtype) + if disliked and len(disliked) > 0 + else torch.tensor([], device=device, dtype=dtype) + ) if isinstance(prompt, str) and prompt is not None: batch_size = 1 @@ -491,8 +447,9 @@ def __call__( else: assert len(negative_prompt) == num_images - - (cond_prompt_embs, uncond_prompt_embs, null_prompt_emb) = self.initialize_prompts(prompt + negative_prompt + [""], device).split([num_images, num_images, batch_size*num_images]) + (cond_prompt_embs, uncond_prompt_embs, null_prompt_emb) = self.initialize_prompts( + prompt + negative_prompt + [""], device + ).split([num_images, num_images, batch_size * num_images]) batched_prompt_embd = torch.cat([cond_prompt_embs, uncond_prompt_embs], dim=0) @@ -508,7 +465,7 @@ def __call__( with self.progress_bar(total=num_inference_steps) as pbar: for i, t in enumerate(timesteps): - sigma = self.scheduler.sigma_t[t] if hasattr(self.scheduler, 'sigma_t') else 0 + sigma = self.scheduler.sigma_t[t] if hasattr(self.scheduler, "sigma_t") else 0 if hasattr(self.scheduler, "sigmas"): sigma = self.scheduler.sigmas[i] @@ -529,28 +486,22 @@ def __call__( if z_ref.size(0) > 0 and weight_factor > 0: noise = torch.randn_like(z_ref) if isinstance(self.scheduler, EulerAncestralDiscreteScheduler): 
- z_ref_noised = ( - alpha_hat**0.5 * z_ref + (1 - alpha_hat) ** 0.5 * noise - ).type(dtype) + z_ref_noised = (alpha_hat**0.5 * z_ref + (1 - alpha_hat) ** 0.5 * noise).type(dtype) print("here") else: z_ref_noised = self.scheduler.add_noise(z_ref, noise, t) - ref_prompt_embd = torch.cat([null_prompt_emb] * (len(positive_latents) + len(negative_latents)), dim=0) - cached_hidden_states = self.get_unet_hidden_states( - z_ref_noised, t, ref_prompt_embd + ref_prompt_embd = torch.cat( + [null_prompt_emb] * (len(positive_latents) + len(negative_latents)), dim=0 ) + cached_hidden_states = self.get_unet_hidden_states(z_ref_noised, t, ref_prompt_embd) n_pos, n_neg = positive_latents.shape[0], negative_latents.shape[0] cached_pos_hs, cached_neg_hs = [], [] for hs in cached_hidden_states: cached_pos, cached_neg = hs.split([n_pos, n_neg], dim=0) - cached_pos = cached_pos.view( - 1, -1, *cached_pos.shape[2:] - ).expand(num_images, -1, -1) - cached_neg = cached_neg.view( - 1, -1, *cached_neg.shape[2:] - ).expand(num_images, -1, -1) + cached_pos = cached_pos.view(1, -1, *cached_pos.shape[2:]).expand(num_images, -1, -1) + cached_neg = cached_neg.view(1, -1, *cached_neg.shape[2:]).expand(num_images, -1, -1) cached_pos_hs.append(cached_pos) cached_neg_hs.append(cached_neg) @@ -576,16 +527,14 @@ def __call__( noise_pred = noise_uncond + guidance_scale * guidance latent_noise = self.scheduler.step(noise_pred, t, latent_noise).prev_sample - if i == len(timesteps) - 1 or ( - (i + 1) > num_warmup_steps and (i + 1) % self.scheduler.order == 0 - ): + if i == len(timesteps) - 1 or ((i + 1) > num_warmup_steps and (i + 1) % self.scheduler.order == 0): pbar.update() y = self.decode_latents(latent_noise) imgs = self.image_processor.postprocess(y, output_type=output_type) if not return_dict: - return (imgs) + return imgs return FabricPipelineOutput(imgs, False) @@ -603,4 +552,3 @@ def image_to_tensor(image: Union[str, Image.Image], dtype): image = (image / 127.5 - 1.0).astype(np.float32) image = torch.from_numpy(image).permute(2, 0, 1) return image.type(dtype) - diff --git a/src/test.py b/src/test.py deleted file mode 100644 index fd7ba1a1e97d..000000000000 --- a/src/test.py +++ /dev/null @@ -1,13 +0,0 @@ -from diffusers import FabricPipeline -import torch - -model_id = "dreamlike-art/dreamlike-photoreal-2.0" -pipe = FabricPipeline.from_pretrained(model_id, torch_dtype=torch.float16) -pipe = pipe.to("cuda") - -prompt = "photo, a church in the middle of a field of crops, bright cinematic lighting, gopro, fisheye lens" -image = pipe(prompt, n_images=1).images[0] - -image.save("result.jpg") - - diff --git a/tests/pipelines/fabric/test_fabric.py b/tests/pipelines/fabric/test_fabric.py index 6f7960184e9a..4e88fad71c07 100644 --- a/tests/pipelines/fabric/test_fabric.py +++ b/tests/pipelines/fabric/test_fabric.py @@ -14,15 +14,11 @@ # limitations under the License. 
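Before the test-suite changes below, it helps to restate what the reformatted denoising loop above does with feedback at each step: the clean VAE latents of the liked/disliked images are re-noised to the current timestep and pushed through the UNet once with the null prompt, so that their self-attention hidden states can be cached and reused as extra keys/values. A reduced restatement with illustrative names (`z_ref` are the feedback latents, `alpha_hat` the scheduler's signal level at `t`):

```py
noise = torch.randn_like(z_ref)
if isinstance(scheduler, EulerAncestralDiscreteScheduler):
    z_ref_noised = (alpha_hat**0.5 * z_ref + (1 - alpha_hat) ** 0.5 * noise).type(dtype)
else:
    z_ref_noised = scheduler.add_noise(z_ref, noise, t)

ref_prompt_embd = torch.cat([null_prompt_emb] * z_ref.shape[0], dim=0)
cached_hidden_states = pipe.get_unet_hidden_states(z_ref_noised, t, ref_prompt_embd)
```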
import gc -import tempfile -import time -import traceback import unittest import numpy as np -from PIL import Image import torch -from huggingface_hub import hf_hub_download +from PIL import Image from transformers import CLIPTextConfig, CLIPTextModel, CLIPTokenizer from diffusers import ( @@ -30,16 +26,11 @@ EulerAncestralDiscreteScheduler, FabricPipeline, UNet2DConditionModel, - logging, ) -from diffusers.models.attention_processor import AttnProcessor -from diffusers.utils import load_numpy, nightly, slow, torch_device +from diffusers.utils import load_numpy, slow from diffusers.utils.testing_utils import ( - CaptureLogger, enable_full_determinism, - require_torch_2, require_torch_gpu, - run_test_in_subprocess, ) from ..pipeline_params import TEXT_TO_IMAGE_BATCH_PARAMS, TEXT_TO_IMAGE_IMAGE_PARAMS, TEXT_TO_IMAGE_PARAMS @@ -49,13 +40,19 @@ enable_full_determinism() -class FabricPipelineFastTests( - PipelineTesterMixin, unittest.TestCase -): +class FabricPipelineFastTests(PipelineTesterMixin, unittest.TestCase): pipeline_class = FabricPipeline - params = TEXT_TO_IMAGE_PARAMS - {'negative_prompt_embeds', 'width', 'prompt_embeds', 'cross_attention_kwargs', 'height','callback', 'callback_steps'} - batch_params = TEXT_TO_IMAGE_BATCH_PARAMS - image_params = TEXT_TO_IMAGE_IMAGE_PARAMS + params = TEXT_TO_IMAGE_PARAMS - { + "negative_prompt_embeds", + "width", + "prompt_embeds", + "cross_attention_kwargs", + "height", + "callback", + "callback_steps", + } + batch_params = TEXT_TO_IMAGE_BATCH_PARAMS + image_params = TEXT_TO_IMAGE_IMAGE_PARAMS required_optional_params = PipelineTesterMixin.required_optional_params - { "latents", "num_images_per_prompt", @@ -117,7 +114,7 @@ def get_dummy_inputs(self, device, seed=0): "negative_prompt": "lowres, dark, cropped", "generator": generator, "num_images": 1, - "num_inference_steps":2, + "num_inference_steps": 2, "output_type": "np", } return inputs @@ -137,11 +134,12 @@ def test_fabric(self): image_slice = image[0, -3:, -3:, -1] print(image_slice.flatten()) assert image.shape == (1, 128, 128, 3) - expected_slice = np.array([0.46241423, 0.45808375, 0.4768011, 0.48806447, 0.46090087, 0.5161956, 0.52250206, 0.50051796, 0.4663524]) + expected_slice = np.array( + [0.46241423, 0.45808375, 0.4768011, 0.48806447, 0.46090087, 0.5161956, 0.52250206, 0.50051796, 0.4663524] + ) assert np.abs(image_slice.flatten() - expected_slice).max() < 1e-2 - def test_fabric_w_fb(self): device = "cpu" # ensure determinism for the device-dependent torch.Generator @@ -152,16 +150,18 @@ def test_fabric_w_fb(self): pipe.set_progress_bar_config(disable=True) inputs = self.get_dummy_inputs(device) - inputs["liked"] = [Image.fromarray(np.ones((512,512)))] + inputs["liked"] = [Image.fromarray(np.ones((512, 512)))] output = pipe(**inputs) image = output.images image_slice = output.images[0, -3:, -3:, -1] assert image.shape == (1, 128, 128, 3) print(image_slice) - expected_slice = np.array([0.77254902, 0.77647059, 0.78431373, 0.8, 0.78823529, 0.79607843, 0.78823529, 0.78823529, 0.78039216]) + expected_slice = np.array( + [0.77254902, 0.77647059, 0.78431373, 0.8, 0.78823529, 0.79607843, 0.78823529, 0.78823529, 0.78039216] + ) - assert np.abs(image_slice.flatten()/128 - expected_slice).max() < 1e-2 + assert np.abs(image_slice.flatten() / 128 - expected_slice).max() < 1e-2 @require_torch_gpu @@ -184,7 +184,7 @@ def test_fabric(self): for word, image in zip(prompt, images): expected_image = load_numpy( - 
f"https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main/dit/fabric_wo_feedback.npy" + "https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main/dit/fabric_wo_feedback.npy" ) assert np.abs((expected_image - np.array(image)).max()) < 1e-2 @@ -202,6 +202,6 @@ def test_fabric_feedback(self): for word, image in zip(prompt, images): expected_image = load_numpy( - f"https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main/dit/fabric_w_feedback.npy" + "https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main/dit/fabric_w_feedback.npy" ) assert np.abs((expected_image - np.array(image)).max()) < 1e-2 From 9d1297d920dd377e168fb2f0f2cdeae3e9d0524a Mon Sep 17 00:00:00 2001 From: shauray8 Date: Sat, 19 Aug 2023 13:30:17 +0530 Subject: [PATCH 74/98] remove print statements --- src/diffusers/pipelines/fabric/pipeline_fabric.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/src/diffusers/pipelines/fabric/pipeline_fabric.py b/src/diffusers/pipelines/fabric/pipeline_fabric.py index 5f4dd555c633..163e284c5df2 100644 --- a/src/diffusers/pipelines/fabric/pipeline_fabric.py +++ b/src/diffusers/pipelines/fabric/pipeline_fabric.py @@ -50,7 +50,6 @@ def __call__( weights=None, # shape: (batch_size, sequence_length) lora_scale=1.0, ): - print("in") batch_size, sequence_length, _ = ( hidden_states.shape if encoder_hidden_states is None else encoder_hidden_states.shape ) @@ -487,7 +486,6 @@ def __call__( noise = torch.randn_like(z_ref) if isinstance(self.scheduler, EulerAncestralDiscreteScheduler): z_ref_noised = (alpha_hat**0.5 * z_ref + (1 - alpha_hat) ** 0.5 * noise).type(dtype) - print("here") else: z_ref_noised = self.scheduler.add_noise(z_ref, noise, t) From c7b352281828d3d75c300a4258b1c99f00166fa5 Mon Sep 17 00:00:00 2001 From: shauray8 Date: Sat, 19 Aug 2023 13:39:49 +0530 Subject: [PATCH 75/98] doc fixes --- docs/source/en/api/pipelines/{fabric.mdx => fabric.md} | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) rename docs/source/en/api/pipelines/{fabric.mdx => fabric.md} (83%) diff --git a/docs/source/en/api/pipelines/fabric.mdx b/docs/source/en/api/pipelines/fabric.md similarity index 83% rename from docs/source/en/api/pipelines/fabric.mdx rename to docs/source/en/api/pipelines/fabric.md index 5cbe737da994..e37e91f5401b 100644 --- a/docs/source/en/api/pipelines/fabric.mdx +++ b/docs/source/en/api/pipelines/fabric.md @@ -15,10 +15,12 @@ specific language governing permissions and limitations under the License. ## FabricPipeline +[FABRIC: Personalizing Diffusion Models with Iterative Feedback](https://huggingface.co/papers/2307.10159) (FABRIC) is by Dimitri von Rütte, Elisabetta Fedele, Jonathan Thomm and Lukas Wolf + FABRIC is training-free approach that conditions the diffusion process on a set of feedback images, applicable to a wide range of popular diffusion models, created by the researchers and engineers from [ETH Zürich, Switzerland](https://github.com/sd-fabric). The [`FabricPipeline`] is capable of generating photo-realistic images given any text input using Stable Diffusion and finetune them on the basis of feedback. 
The original codebase can be found here: -- *Stable Diffusion V1*: [sd-fabric/fabric](https://github.com/sd-fabric/fabric) +- *FABRIC*: [sd-fabric/fabric](https://github.com/sd-fabric/fabric) Available Checkpoints are: - *dreamlike-photoreal-2.0 (512x512 resolution)* [dreamlike-art/dreamlike-photoreal-2.0](https://huggingface.co/dreamlike-art/dreamlike-photoreal-2.0) From a166b55ba27e44161382413d4a6754d415f416a8 Mon Sep 17 00:00:00 2001 From: shauray8 Date: Sat, 19 Aug 2023 13:42:04 +0530 Subject: [PATCH 76/98] __init__ error something --- .../utils/dummy_torch_and_transformers_objects.py | 15 +++++++++++++++ 1 file changed, 15 insertions(+) diff --git a/src/diffusers/utils/dummy_torch_and_transformers_objects.py b/src/diffusers/utils/dummy_torch_and_transformers_objects.py index dbddf6e2c593..83ac22ba4566 100644 --- a/src/diffusers/utils/dummy_torch_and_transformers_objects.py +++ b/src/diffusers/utils/dummy_torch_and_transformers_objects.py @@ -62,6 +62,21 @@ def from_pretrained(cls, *args, **kwargs): requires_backends(cls, ["torch", "transformers"]) +class FabricPipeline(metaclass=DummyObject): + _backends = ["torch", "transformers"] + + def __init__(self, *args, **kwargs): + requires_backends(self, ["torch", "transformers"]) + + @classmethod + def from_config(cls, *args, **kwargs): + requires_backends(cls, ["torch", "transformers"]) + + @classmethod + def from_pretrained(cls, *args, **kwargs): + requires_backends(cls, ["torch", "transformers"]) + + class IFImg2ImgPipeline(metaclass=DummyObject): _backends = ["torch", "transformers"] From 6261e9d68407dd0f45632e482d28e906c933028a Mon Sep 17 00:00:00 2001 From: shauray8 Date: Mon, 21 Aug 2023 13:03:02 +0530 Subject: [PATCH 77/98] update docs, working on dim --- docs/source/en/api/pipelines/fabric.md | 60 ++++- .../using-diffusers/controlling_generation.md | 14 +- .../pipelines/fabric/pipeline_fabric.py | 233 +++++++++++++++--- 3 files changed, 268 insertions(+), 39 deletions(-) diff --git a/docs/source/en/api/pipelines/fabric.md b/docs/source/en/api/pipelines/fabric.md index e37e91f5401b..183ec49e5069 100644 --- a/docs/source/en/api/pipelines/fabric.md +++ b/docs/source/en/api/pipelines/fabric.md @@ -1,4 +1,3 @@ -## changes required -# Text-to-Image Generation - -## FabricPipeline +# FabricPipeline [FABRIC: Personalizing Diffusion Models with Iterative Feedback](https://huggingface.co/papers/2307.10159) (FABRIC) is by Dimitri von Rütte, Elisabetta Fedele, Jonathan Thomm and Lukas Wolf FABRIC is training-free approach that conditions the diffusion process on a set of feedback images, applicable to a wide range of popular diffusion models, created by the researchers and engineers from [ETH Zürich, Switzerland](https://github.com/sd-fabric). The [`FabricPipeline`] is capable of generating photo-realistic images given any text input using Stable Diffusion and finetune them on the basis of feedback. +The abstract of the paper is the following: + +*In an era where visual content generation is increasingly driven by machine learning, the integration of human feedback into generative models presents significant opportunities for enhancing user experience and output quality. This study explores strategies for incorporating iterative human feedback into the generative process of diffusion-based text-to-image models. 
We propose FABRIC, a training-free approach applicable to a wide range of popular diffusion models, which exploits the self-attention layer present in the most widely used architectures to condition the diffusion process on a set of feedback images. To ensure a rigorous assessment of our approach, we introduce a comprehensive evaluation methodology, offering a robust mechanism to quantify the performance of generative visual models that integrate human feedback. We show that generation results improve over multiple rounds of iterative feedback through exhaustive analysis, implicitly optimizing arbitrary user preferences. The potential applications of these findings extend to fields such as personalized content creation and customization* + The original codebase can be found here: - *FABRIC*: [sd-fabric/fabric](https://github.com/sd-fabric/fabric) @@ -29,3 +30,54 @@ Available Checkpoints are: - all - __call__ +## Usage Example + +Before using Fabric make sure to have `transformers`, `accelerate`, `huggingface_hub` installed. +You can install the libraries as follows: + +``` +pip install transformers +pip install accelerate +pip install huggingface_hub +``` + +### Text-to-Image + +You can use Fabric as follows for *text-to-image*: + +```py +from diffusers import FabricPipeline +import torch + +model_id = "dreamlike-art/dreamlike-photoreal-2.0" +pipe = StableDiffusionPipeline.from_pretrained(model_id, torch_dtype=torch.float16) +pipe = pipe.to("cuda") + +prompt = "Astronaut in a jungle, cold color palette, muted colors, detailed, 8k" +neg_prompt = "bad anatomy, cropped, lowres" +image = pipe(prompt=prompt, negative_prompt=neg_prompt).images[0] +``` + +You can use Fabric as follows for *text-to-image-with-feedback*: + +```py +from diffusers import FabricPipeline +import torch + +model_id = "dreamlike-art/dreamlike-photoreal-2.0" +pipe = StableDiffusionPipeline.from_pretrained(model_id, torch_dtype=torch.float16) +pipe = pipe.to("cuda") + +prompt = "Astronaut in a jungle, cold color palette, muted colors, detailed, 8k" +neg_prompt = "bad anatomy, cropped, lowres" +liked = ["path/to/image"] +disliked = ["path/to/image"] +image = pipe(prompt=prompt, negative_prompt=neg_prompt,liked=liked,disliked=disliked).images[0] +``` + +Let's have a look at the images + +| Without Feedback | With Feedback (1st image) | +|---------------------|---------------------| +| ![Image 1](https://drive.google.com/uc?export=view&id=12wxbikt7834eRTK40legR5PtJmFLNH34) | ![Feedback Image 1](https://drive.google.com/uc?export=view&id=1YcFPDHSRr2OE3hy-5lvr8An21Jum85D5) | + diff --git a/docs/source/en/using-diffusers/controlling_generation.md b/docs/source/en/using-diffusers/controlling_generation.md index 563c6af001f1..2c3d79530ef0 100644 --- a/docs/source/en/using-diffusers/controlling_generation.md +++ b/docs/source/en/using-diffusers/controlling_generation.md @@ -41,6 +41,7 @@ Unless otherwise mentioned, these are techniques that work with existing models 13. [Model Editing](#model-editing) 14. [DiffEdit](#diffedit) 15. [T2I-Adapter](#t2i-adapter) +16. [FABRIC](#fabric) For convenience, we provide a table to denote which methods are inference-only and which require fine-tuning/training. 
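One detail worth flagging in the `fabric.md` snippets above: they build the pipeline with `StableDiffusionPipeline.from_pretrained` even though the object is then used as a FABRIC pipeline; with the class added in this PR, the load step is presumably `FabricPipeline.from_pretrained`. A hedged sketch of the iterative feedback loop the docs describe (checkpoint name taken from the docs; the feedback selection is illustrative):

```py
import torch
from diffusers import FabricPipeline

pipe = FabricPipeline.from_pretrained(
    "dreamlike-art/dreamlike-photoreal-2.0", torch_dtype=torch.float16
).to("cuda")

prompt = "Astronaut in a jungle, cold color palette, muted colors, detailed, 8k"
negative_prompt = "bad anatomy, cropped, lowres"

liked, disliked = [], []
for _ in range(3):  # a few feedback rounds
    images = pipe(
        prompt=prompt,
        negative_prompt=negative_prompt,
        liked=liked,
        disliked=disliked,
        num_images=4,
    ).images
    liked.append(images[0])      # illustrative choice: keep the first image
    disliked.append(images[-1])  # illustrative choice: reject the last image
```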
@@ -61,7 +62,7 @@ For convenience, we provide a table to denote which methods are inference-only a | [Model Editing](#model-editing) | ✅ | ❌ | | | [DiffEdit](#diffedit) | ✅ | ❌ | | | [T2I-Adapter](#t2i-adapter) | ✅ | ❌ | | - +| [Fabric](#fabric) | ✅ | ❌ | | ## Instruct Pix2Pix [Paper](https://arxiv.org/abs/2211.09800) @@ -230,3 +231,14 @@ There are 8 canonical pre-trained adapters trained on different conditionings su depth maps, and semantic segmentations. See [here](../api/pipelines/stable_diffusion/adapter) for more information on how to use it. + +## Fabric + +[Paper](https://arxiv.org/abs/2307.10159) + +[DiffEdit](../api/pipelines/fabric) is a training-free +approach applicable to a wide range of popular diffusion models, which exploits +the self-attention layer present in the most widely used architectures to condition +the diffusion process on a set of feedback images. + +To know more details, check out the [official doc](../api/pipelines/fabric). diff --git a/src/diffusers/pipelines/fabric/pipeline_fabric.py b/src/diffusers/pipelines/fabric/pipeline_fabric.py index 163e284c5df2..3efa2a829c56 100644 --- a/src/diffusers/pipelines/fabric/pipeline_fabric.py +++ b/src/diffusers/pipelines/fabric/pipeline_fabric.py @@ -20,15 +20,17 @@ from PIL import Image from transformers import CLIPTextModel, CLIPTokenizer +from ...loaders import FromSingleFileMixin, LoraLoaderMixin, TextualInversionLoaderMixin from ...configuration_utils import FrozenDict from ...image_processor import VaeImageProcessor from ...models import AutoencoderKL, UNet2DConditionModel from ...models.attention import BasicTransformerBlock from ...models.attention_processor import LoRAAttnProcessor -from ...schedulers import EulerAncestralDiscreteScheduler +from ...schedulers import KarrasDiffusionSchedulers,EulerAncestralDiscreteScheduler from ...utils import ( deprecate, logging, + replace_example_docstring, ) from ..pipeline_utils import DiffusionPipeline from . import FabricPipelineOutput @@ -36,8 +38,21 @@ logger = logging.get_logger(__name__) # pylint: disable=invalid-name - -class CrossAttnProcessor: +EXAMPLE_DOC_STRING = """ + Examples: + ```py + >>> from diffusers import FabricPipeline + >>> import torch + >>> model_id = "dreamlike-art/dreamlike-photoreal-2.0" + >>> pipe = FabricPipeline(model_id, torch_dtype = torch.float16) + >>> pipe = pipe.to("cuda") + >>> prompt = "a giant standing in a fantasy landscape best quality" + >>> liked = [] + >>> disliked = [] + >>> image = pipe(prompt, num_images=4, liked=liked,disliked=disliked).images[0] + ``` +""" +class FabricCrossAttnProcessor: def __init__(self): self.attntion_probs = None @@ -130,7 +145,7 @@ def __init__( text_encoder: CLIPTextModel, tokenizer: CLIPTokenizer, unet: UNet2DConditionModel, - scheduler: EulerAncestralDiscreteScheduler, + scheduler: KarrasDiffusionSchedulers, requires_safety_checker: bool = True, ): super().__init__() @@ -220,6 +235,168 @@ def new_forward(self, hidden_states, *args, **kwargs): del module.attn1.old_forward return cached_hidden_states + + def _encode_prompt( + self, + prompt, + device, + num_images_per_prompt, + do_classifier_free_guidance, + negative_prompt=None, + prompt_embeds: Optional[torch.FloatTensor] = None, + negative_prompt_embeds: Optional[torch.FloatTensor] = None, + lora_scale: Optional[float] = None, + ): + r""" + Encodes the prompt into text encoder hidden states. 
+ + Args: + prompt (`str` or `List[str]`, *optional*): + prompt to be encoded + device: (`torch.device`): + torch device + num_images_per_prompt (`int`): + number of images that should be generated per prompt + do_classifier_free_guidance (`bool`): + whether to use classifier free guidance or not + negative_prompt (`str` or `List[str]`, *optional*): + The prompt or prompts not to guide the image generation. If not defined, one has to pass + `negative_prompt_embeds` instead. Ignored when not using guidance (i.e., ignored if `guidance_scale` is + less than `1`). + prompt_embeds (`torch.FloatTensor`, *optional*): + Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not + provided, text embeddings will be generated from `prompt` input argument. + negative_prompt_embeds (`torch.FloatTensor`, *optional*): + Pre-generated negative text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt + weighting. If not provided, negative_prompt_embeds will be generated from `negative_prompt` input + argument. + lora_scale (`float`, *optional*): + A lora scale that will be applied to all LoRA layers of the text encoder if LoRA layers are loaded. + """ + # set lora scale so that monkey patched LoRA + # function of text encoder can correctly access it + if lora_scale is not None and isinstance(self, LoraLoaderMixin): + self._lora_scale = lora_scale + + if prompt is not None and isinstance(prompt, str): + batch_size = 1 + elif prompt is not None and isinstance(prompt, list): + batch_size = len(prompt) + else: + batch_size = prompt_embeds.shape[0] + + if prompt_embeds is None: + # textual inversion: procecss multi-vector tokens if necessary + if isinstance(self, TextualInversionLoaderMixin): + prompt = self.maybe_convert_prompt(prompt, self.tokenizer) + + text_inputs = self.tokenizer( + prompt, + padding="max_length", + max_length=self.tokenizer.model_max_length, + truncation=True, + return_tensors="pt", + ) + text_input_ids = text_inputs.input_ids + untruncated_ids = self.tokenizer(prompt, padding="longest", return_tensors="pt").input_ids + + if untruncated_ids.shape[-1] >= text_input_ids.shape[-1] and not torch.equal( + text_input_ids, untruncated_ids + ): + removed_text = self.tokenizer.batch_decode( + untruncated_ids[:, self.tokenizer.model_max_length - 1 : -1] + ) + logger.warning( + "The following part of your input was truncated because CLIP can only handle sequences up to" + f" {self.tokenizer.model_max_length} tokens: {removed_text}" + ) + + if hasattr(self.text_encoder.config, "use_attention_mask") and self.text_encoder.config.use_attention_mask: + attention_mask = text_inputs.attention_mask.to(device) + else: + attention_mask = None + + prompt_embeds = self.text_encoder( + text_input_ids.to(device), + attention_mask=attention_mask, + ) + prompt_embeds = prompt_embeds[0] + + if self.text_encoder is not None: + prompt_embeds_dtype = self.text_encoder.dtype + elif self.unet is not None: + prompt_embeds_dtype = self.unet.dtype + else: + prompt_embeds_dtype = prompt_embeds.dtype + + prompt_embeds = prompt_embeds.to(dtype=prompt_embeds_dtype, device=device) + + bs_embed, seq_len, _ = prompt_embeds.shape + # duplicate text embeddings for each generation per prompt, using mps friendly method + prompt_embeds = prompt_embeds.repeat(1, num_images_per_prompt, 1) + prompt_embeds = prompt_embeds.view(bs_embed * num_images_per_prompt, seq_len, -1) + + # get unconditional embeddings for classifier free guidance + if do_classifier_free_guidance and 
negative_prompt_embeds is None: + uncond_tokens: List[str] + if negative_prompt is None: + uncond_tokens = [""] * batch_size + elif prompt is not None and type(prompt) is not type(negative_prompt): + raise TypeError( + f"`negative_prompt` should be the same type to `prompt`, but got {type(negative_prompt)} !=" + f" {type(prompt)}." + ) + elif isinstance(negative_prompt, str): + uncond_tokens = [negative_prompt] + elif batch_size != len(negative_prompt): + raise ValueError( + f"`negative_prompt`: {negative_prompt} has batch size {len(negative_prompt)}, but `prompt`:" + f" {prompt} has batch size {batch_size}. Please make sure that passed `negative_prompt` matches" + " the batch size of `prompt`." + ) + else: + uncond_tokens = negative_prompt + + # textual inversion: procecss multi-vector tokens if necessary + if isinstance(self, TextualInversionLoaderMixin): + uncond_tokens = self.maybe_convert_prompt(uncond_tokens, self.tokenizer) + + max_length = prompt_embeds.shape[1] + uncond_input = self.tokenizer( + uncond_tokens, + padding="max_length", + max_length=max_length, + truncation=True, + return_tensors="pt", + ) + + if hasattr(self.text_encoder.config, "use_attention_mask") and self.text_encoder.config.use_attention_mask: + attention_mask = uncond_input.attention_mask.to(device) + else: + attention_mask = None + + negative_prompt_embeds = self.text_encoder( + uncond_input.input_ids.to(device), + attention_mask=attention_mask, + ) + negative_prompt_embeds = negative_prompt_embeds[0] + + if do_classifier_free_guidance: + # duplicate unconditional embeddings for each generation per prompt, using mps friendly method + seq_len = negative_prompt_embeds.shape[1] + + negative_prompt_embeds = negative_prompt_embeds.to(dtype=prompt_embeds_dtype, device=device) + + negative_prompt_embeds = negative_prompt_embeds.repeat(1, num_images_per_prompt, 1) + negative_prompt_embeds = negative_prompt_embeds.view(batch_size * num_images_per_prompt, seq_len, -1) + + # For classifier free guidance, we need to do two forward passes. 
+ # Here we concatenate the unconditional and text embeddings into a single batch + # to avoid doing two forward passes + prompt_embeds = torch.cat([negative_prompt_embeds, prompt_embeds]) + + return prompt_embeds + def unet_forward_with_cached_hidden_states( self, @@ -256,7 +433,7 @@ def new_forward( device, dtype = hidden_states.device, hidden_states.dtype weights = torch.ones(batch_size, d_model, device=device, dtype=dtype) - + print(weights.shape) out_pos = self.old_forward(hidden_states) out_neg = self.old_forward(hidden_states) @@ -265,7 +442,7 @@ def new_forward( cond_pos_hs = torch.cat([cond_hiddens, cached_pos_hs], dim=1) pos_weights = weights.clone().repeat(1, 1 + cached_pos_hs.shape[1] // d_model) pos_weights[:, d_model:] = pos_weight - attn_with_weights = CrossAttnProcessor() + attn_with_weights = FabricCrossAttnProcessor() out_pos = attn_with_weights( self, cond_hiddens, @@ -280,7 +457,7 @@ def new_forward( uncond_neg_hs = torch.cat([uncond_hiddens, cached_neg_hs], dim=1) neg_weights = weights.clone().repeat(1, 1 + cached_neg_hs.shape[1] // d_model) neg_weights[:, d_model:] = neg_weight - attn_with_weights = CrossAttnProcessor() + attn_with_weights = FabricCrossAttnProcessor() out_neg = attn_with_weights( self, uncond_hiddens, @@ -312,17 +489,6 @@ def preprocess_feedback_images(self, images, vae, device, dtype) -> torch.tensor latents = vae.config.scaling_factor * vae.encode(images_t).latent_dist.sample() return latents - def decode_latents(self, latents): - warnings.warn( - "The decode_latents method is deprecated and will be removed in a future version. Please" - " use VaeImageProcessor instead", - FutureWarning, - ) - latents = 1 / self.vae.config.scaling_factor * latents - image = self.vae.decode(latents, return_dict=False)[0] - # we always cast to float32 as this does not cause significant overhead and is compatible with bfloat16 - return image - def check_inputs( self, prompt, @@ -347,6 +513,7 @@ def check_inputs( raise ValueError(f"`disliked` has to be of type `list` but is {type(disliked)}") @torch.no_grad() + @replace_example_docstring(EXAMPLE_DOC_STRING) def __call__( self, prompt: Optional[Union[str, List[str]]] = "", @@ -367,6 +534,8 @@ def __call__( neg_bottleneck_scale: float = 1.0, output_type: Optional[str] = "pil", latents: Optional[torch.FloatTensor] = None, + pos_weights: Optional[tuple] = (.8,.8), + neg_weights: Optional[tuple] = (.5,.5), ): r""" Function invoked when calling the pipeline for generation. Generate a trajectory of images with binary @@ -400,11 +569,7 @@ def __call__( expense of slower inference. Examples: - >>> from diffusers import FabricPipeline >>> import torch >>> model_id = - "dreamlike-art/dreamlike-photoreal-2.0" >>> pipe = FabricPipeline(model_id, torch_dtype = torch.float16) - >>> pipe = pipe.to("cuda") >>> prompt = "a giant standing in a fantasy landscape best quality" >>> liked = - [] >>> disliked = [] >>> image = pipe(prompt, num_images=4, liked=liked,disliked=disliked).images - + Returns: [`~pipelines.fabric.FabricPipelineOutput`] or `tuple`: When returning a tuple, the first element is a list with the generated images, and the second element is a list of `bool`s denoting whether the corresponding @@ -446,9 +611,13 @@ def __call__( else: assert len(negative_prompt) == num_images - (cond_prompt_embs, uncond_prompt_embs, null_prompt_emb) = self.initialize_prompts( - prompt + negative_prompt + [""], device - ).split([num_images, num_images, batch_size * num_images]) + do_classifier_free_guidance = guidance_scale > 1. 
+ (cond_prompt_embs, uncond_prompt_embs, null_prompt_emb) = self._encode_prompt( + prompt, device, + num_images, + do_classifier_free_guidance, + negative_prompt, + ).split([num_images, num_images, 1]) batched_prompt_embd = torch.cat([cond_prompt_embs, uncond_prompt_embs], dim=0) @@ -509,7 +678,6 @@ def __call__( cached_neg_hs = None else: cached_pos_hs, cached_neg_hs = None, None - unet_out = self.unet_forward_with_cached_hidden_states( z_all, t, @@ -528,7 +696,7 @@ def __call__( if i == len(timesteps) - 1 or ((i + 1) > num_warmup_steps and (i + 1) % self.scheduler.order == 0): pbar.update() - y = self.decode_latents(latent_noise) + y = self.vae.decode(latent_noise / self.vae.config.scaling_factor, return_dict=False)[0] imgs = self.image_processor.postprocess(y, output_type=output_type) if not return_dict: @@ -536,8 +704,7 @@ def __call__( return FabricPipelineOutput(imgs, False) - @staticmethod - def image_to_tensor(image: Union[str, Image.Image], dtype): + def image_to_tensor(self, image: Union[str, Image.Image], dtype): """ Convert latent PIL image to a torch tensor for further processing. """ @@ -545,8 +712,6 @@ def image_to_tensor(image: Union[str, Image.Image], dtype): image = Image.open(image) if not image.mode == "RGB": image = image.convert("RGB") - image = image.resize((512, 512)) - image = np.array(image).astype(np.uint8) - image = (image / 127.5 - 1.0).astype(np.float32) - image = torch.from_numpy(image).permute(2, 0, 1) + image = self.image_processor.preprocess(image,height=512,width=512)[0] return image.type(dtype) + From 913c69ae65a38b354839ae0adb8837659b65458e Mon Sep 17 00:00:00 2001 From: shauray8 Date: Mon, 21 Aug 2023 13:04:30 +0530 Subject: [PATCH 78/98] working on encoding --- .../pipelines/fabric/pipeline_fabric.py | 54 +++++++++---------- 1 file changed, 27 insertions(+), 27 deletions(-) diff --git a/src/diffusers/pipelines/fabric/pipeline_fabric.py b/src/diffusers/pipelines/fabric/pipeline_fabric.py index 3efa2a829c56..bae9e2a3aacd 100644 --- a/src/diffusers/pipelines/fabric/pipeline_fabric.py +++ b/src/diffusers/pipelines/fabric/pipeline_fabric.py @@ -11,22 +11,20 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
-import warnings from typing import List, Optional, Union -import numpy as np import torch from packaging import version from PIL import Image from transformers import CLIPTextModel, CLIPTokenizer -from ...loaders import FromSingleFileMixin, LoraLoaderMixin, TextualInversionLoaderMixin from ...configuration_utils import FrozenDict from ...image_processor import VaeImageProcessor +from ...loaders import LoraLoaderMixin, TextualInversionLoaderMixin from ...models import AutoencoderKL, UNet2DConditionModel from ...models.attention import BasicTransformerBlock from ...models.attention_processor import LoRAAttnProcessor -from ...schedulers import KarrasDiffusionSchedulers,EulerAncestralDiscreteScheduler +from ...schedulers import EulerAncestralDiscreteScheduler, KarrasDiffusionSchedulers from ...utils import ( deprecate, logging, @@ -41,17 +39,20 @@ EXAMPLE_DOC_STRING = """ Examples: ```py - >>> from diffusers import FabricPipeline - >>> import torch - >>> model_id = "dreamlike-art/dreamlike-photoreal-2.0" - >>> pipe = FabricPipeline(model_id, torch_dtype = torch.float16) - >>> pipe = pipe.to("cuda") - >>> prompt = "a giant standing in a fantasy landscape best quality" - >>> liked = [] - >>> disliked = [] - >>> image = pipe(prompt, num_images=4, liked=liked,disliked=disliked).images[0] + >>> from diffusers import FabricPipeline + >>> import torch + + >>> model_id = "dreamlike-art/dreamlike-photoreal-2.0" + >>> pipe = FabricPipeline(model_id, torch_dtype=torch.float16) + >>> pipe = pipe.to("cuda") + >>> prompt = "a giant standing in a fantasy landscape best quality" + >>> liked = [] + >>> disliked = [] + >>> image = pipe(prompt, num_images=4, liked=liked, disliked=disliked).images[0] ``` """ + + class FabricCrossAttnProcessor: def __init__(self): self.attntion_probs = None @@ -145,7 +146,7 @@ def __init__( text_encoder: CLIPTextModel, tokenizer: CLIPTokenizer, unet: UNet2DConditionModel, - scheduler: KarrasDiffusionSchedulers, + scheduler: KarrasDiffusionSchedulers, requires_safety_checker: bool = True, ): super().__init__() @@ -235,7 +236,7 @@ def new_forward(self, hidden_states, *args, **kwargs): del module.attn1.old_forward return cached_hidden_states - + def _encode_prompt( self, prompt, @@ -397,7 +398,6 @@ def _encode_prompt( return prompt_embeds - def unet_forward_with_cached_hidden_states( self, z_all, @@ -513,7 +513,7 @@ def check_inputs( raise ValueError(f"`disliked` has to be of type `list` but is {type(disliked)}") @torch.no_grad() - @replace_example_docstring(EXAMPLE_DOC_STRING) + @replace_example_docstring(EXAMPLE_DOC_STRING) def __call__( self, prompt: Optional[Union[str, List[str]]] = "", @@ -534,8 +534,8 @@ def __call__( neg_bottleneck_scale: float = 1.0, output_type: Optional[str] = "pil", latents: Optional[torch.FloatTensor] = None, - pos_weights: Optional[tuple] = (.8,.8), - neg_weights: Optional[tuple] = (.5,.5), + pos_weights: Optional[tuple] = (0.8, 0.8), + neg_weights: Optional[tuple] = (0.5, 0.5), ): r""" Function invoked when calling the pipeline for generation. Generate a trajectory of images with binary @@ -569,7 +569,7 @@ def __call__( expense of slower inference. 
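The `pos_weights`/`neg_weights` tuples surfaced in the signature above feed the per-block schedule built in `unet_forward_with_cached_hidden_states`: a linear ramp across the down blocks, the ramp's end value at the mid block, and the mirrored ramp for the up blocks. A small sketch of that layout (the block count is illustrative; the pipeline derives it from `len(self.unet.down_blocks)`, and the signature defaults are the flat tuples `(0.8, 0.8)` and `(0.5, 0.5)`):

```py
import torch

pos_weights = (0.05, 0.8)  # e.g. weak feedback at the outer blocks, strong at the bottleneck
num_down_blocks = 4        # illustrative
local = torch.linspace(*pos_weights, steps=num_down_blocks + 1)[:-1].tolist()
per_block = local + [pos_weights[1]] + local[::-1]  # down blocks + mid block + up blocks
```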
Examples: - + Returns: [`~pipelines.fabric.FabricPipelineOutput`] or `tuple`: When returning a tuple, the first element is a list with the generated images, and the second element is a list of `bool`s denoting whether the corresponding @@ -596,11 +596,11 @@ def __call__( ) if isinstance(prompt, str) and prompt is not None: - batch_size = 1 + pass elif isinstance(prompt, list) and prompt is not None: - batch_size = len(prompt) + len(prompt) else: - batch_size = None + pass prompt = [prompt] * num_images @@ -611,9 +611,10 @@ def __call__( else: assert len(negative_prompt) == num_images - do_classifier_free_guidance = guidance_scale > 1. + do_classifier_free_guidance = guidance_scale > 1.0 (cond_prompt_embs, uncond_prompt_embs, null_prompt_emb) = self._encode_prompt( - prompt, device, + prompt, + device, num_images, do_classifier_free_guidance, negative_prompt, @@ -712,6 +713,5 @@ def image_to_tensor(self, image: Union[str, Image.Image], dtype): image = Image.open(image) if not image.mode == "RGB": image = image.convert("RGB") - image = self.image_processor.preprocess(image,height=512,width=512)[0] + image = self.image_processor.preprocess(image, height=512, width=512)[0] return image.type(dtype) - From 83f744bfb17d1d006fcb8df9c8648744bbd5fe80 Mon Sep 17 00:00:00 2001 From: shauray8 Date: Mon, 21 Aug 2023 13:12:38 +0530 Subject: [PATCH 79/98] doc fix --- docs/source/en/using-diffusers/controlling_generation.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/source/en/using-diffusers/controlling_generation.md b/docs/source/en/using-diffusers/controlling_generation.md index 2c3d79530ef0..b2dded7826c9 100644 --- a/docs/source/en/using-diffusers/controlling_generation.md +++ b/docs/source/en/using-diffusers/controlling_generation.md @@ -236,7 +236,7 @@ See [here](../api/pipelines/stable_diffusion/adapter) for more information on ho [Paper](https://arxiv.org/abs/2307.10159) -[DiffEdit](../api/pipelines/fabric) is a training-free +[Fabric](../api/pipelines/fabric) is a training-free approach applicable to a wide range of popular diffusion models, which exploits the self-attention layer present in the most widely used architectures to condition the diffusion process on a set of feedback images. From bda8c53b23ba0e8e32b0100312200b9a9898a967 Mon Sep 17 00:00:00 2001 From: shauray8 Date: Mon, 21 Aug 2023 17:24:28 +0530 Subject: [PATCH 80/98] more fixes --- .../pipelines/fabric/pipeline_fabric.py | 97 ++++++------------- 1 file changed, 31 insertions(+), 66 deletions(-) diff --git a/src/diffusers/pipelines/fabric/pipeline_fabric.py b/src/diffusers/pipelines/fabric/pipeline_fabric.py index bae9e2a3aacd..921fc6f84fec 100644 --- a/src/diffusers/pipelines/fabric/pipeline_fabric.py +++ b/src/diffusers/pipelines/fabric/pipeline_fabric.py @@ -1,4 +1,4 @@ -# Copyright 2023 The HuggingFace Team. All rights reserved. +# Copyright 2023 FABRIC authors and the HuggingFace Team. All rights reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. 
@@ -183,60 +183,7 @@ def __init__( self.vae_scale_factor = 2 ** (len(self.vae.config.block_out_channels) - 1) self.image_processor = VaeImageProcessor(vae_scale_factor=self.vae_scale_factor) - def initialize_prompts(self, prompts: List[str], device): - # Breaking into individual prompts feels memory efficient - prompt_embed_list = [] - for prompt in prompts: - prompt_tokens = self.tokenizer( - prompt, - return_tensors="pt", - max_length=self.tokenizer.model_max_length, - padding="max_length", - truncation=True, - ) - - attention_mask = ( - prompt_tokens.attention_mask.to(device) - if ( - hasattr(self.text_encoder.config, "use_attention_mask") - and self.text_encoder.config.use_attention_mask - ) - else None - ) - - prompt_embd = self.text_encoder( - input_ids=prompt_tokens.input_ids.to(device), - attention_mask=attention_mask, - ).last_hidden_state - - prompt_embed_list.append(prompt_embd) - - all_prompt_embed = torch.cat(prompt_embed_list, dim=0) - return all_prompt_embed - - def get_unet_hidden_states(self, z_all, t, prompt_embd): - cached_hidden_states = [] - for module in self.unet.modules(): - if isinstance(module, BasicTransformerBlock): - - def new_forward(self, hidden_states, *args, **kwargs): - cached_hidden_states.append(hidden_states.clone().detach().cpu()) - return self.old_forward(hidden_states, *args, **kwargs) - - module.attn1.old_forward = module.attn1.forward - module.attn1.forward = new_forward.__get__(module.attn1) - - # run forward pass to cache hidden states, output can be discarded - _ = self.unet(z_all, t, encoder_hidden_states=prompt_embd) - - # restore original forward pass - for module in self.unet.modules(): - if isinstance(module, BasicTransformerBlock): - module.attn1.forward = module.attn1.old_forward - del module.attn1.old_forward - - return cached_hidden_states - + # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion._encode_prompt def _encode_prompt( self, prompt, @@ -333,9 +280,6 @@ def _encode_prompt( prompt_embeds = prompt_embeds.to(dtype=prompt_embeds_dtype, device=device) bs_embed, seq_len, _ = prompt_embeds.shape - # duplicate text embeddings for each generation per prompt, using mps friendly method - prompt_embeds = prompt_embeds.repeat(1, num_images_per_prompt, 1) - prompt_embeds = prompt_embeds.view(bs_embed * num_images_per_prompt, seq_len, -1) # get unconditional embeddings for classifier free guidance if do_classifier_free_guidance and negative_prompt_embeds is None: @@ -348,7 +292,7 @@ def _encode_prompt( f" {type(prompt)}." ) elif isinstance(negative_prompt, str): - uncond_tokens = [negative_prompt] + uncond_tokens = [negative_prompt] + [""] elif batch_size != len(negative_prompt): raise ValueError( f"`negative_prompt`: {negative_prompt} has batch size {len(negative_prompt)}, but `prompt`:" @@ -356,7 +300,7 @@ def _encode_prompt( " the batch size of `prompt`." 
) else: - uncond_tokens = negative_prompt + uncond_tokens = negative_prompt + [""] # textual inversion: procecss multi-vector tokens if necessary if isinstance(self, TextualInversionLoaderMixin): @@ -384,20 +328,40 @@ def _encode_prompt( if do_classifier_free_guidance: # duplicate unconditional embeddings for each generation per prompt, using mps friendly method - seq_len = negative_prompt_embeds.shape[1] + negative_prompt_embeds.shape[1] negative_prompt_embeds = negative_prompt_embeds.to(dtype=prompt_embeds_dtype, device=device) - negative_prompt_embeds = negative_prompt_embeds.repeat(1, num_images_per_prompt, 1) - negative_prompt_embeds = negative_prompt_embeds.view(batch_size * num_images_per_prompt, seq_len, -1) - # For classifier free guidance, we need to do two forward passes. # Here we concatenate the unconditional and text embeddings into a single batch # to avoid doing two forward passes - prompt_embeds = torch.cat([negative_prompt_embeds, prompt_embeds]) + prompt_embeds = torch.cat([prompt_embeds, negative_prompt_embeds]) return prompt_embeds + def get_unet_hidden_states(self, z_all, t, prompt_embd): + cached_hidden_states = [] + for module in self.unet.modules(): + if isinstance(module, BasicTransformerBlock): + + def new_forward(self, hidden_states, *args, **kwargs): + cached_hidden_states.append(hidden_states.clone().detach().cpu()) + return self.old_forward(hidden_states, *args, **kwargs) + + module.attn1.old_forward = module.attn1.forward + module.attn1.forward = new_forward.__get__(module.attn1) + + # run forward pass to cache hidden states, output can be discarded + _ = self.unet(z_all, t, encoder_hidden_states=prompt_embd) + + # restore original forward pass + for module in self.unet.modules(): + if isinstance(module, BasicTransformerBlock): + module.attn1.forward = module.attn1.old_forward + del module.attn1.old_forward + + return cached_hidden_states + def unet_forward_with_cached_hidden_states( self, z_all, @@ -612,7 +576,8 @@ def __call__( assert len(negative_prompt) == num_images do_classifier_free_guidance = guidance_scale > 1.0 - (cond_prompt_embs, uncond_prompt_embs, null_prompt_emb) = self._encode_prompt( + + (cond_prompt_embs, uncond_prompt_embs, null_prompt_emb) = self.initialize_prompts( prompt, device, num_images, From ffe80d028a47ea2fdf5f387b3822f1d5e2a5f2db Mon Sep 17 00:00:00 2001 From: shauray8 Date: Tue, 22 Aug 2023 01:29:54 +0530 Subject: [PATCH 81/98] no more dependent on 512*512 --- .../pipelines/fabric/pipeline_fabric.py | 54 ++++++++++++++----- 1 file changed, 40 insertions(+), 14 deletions(-) diff --git a/src/diffusers/pipelines/fabric/pipeline_fabric.py b/src/diffusers/pipelines/fabric/pipeline_fabric.py index 921fc6f84fec..924a9b499d9d 100644 --- a/src/diffusers/pipelines/fabric/pipeline_fabric.py +++ b/src/diffusers/pipelines/fabric/pipeline_fabric.py @@ -28,6 +28,7 @@ from ...utils import ( deprecate, logging, + randn_tensor, replace_example_docstring, ) from ..pipeline_utils import DiffusionPipeline @@ -397,7 +398,6 @@ def new_forward( device, dtype = hidden_states.device, hidden_states.dtype weights = torch.ones(batch_size, d_model, device=device, dtype=dtype) - print(weights.shape) out_pos = self.old_forward(hidden_states) out_neg = self.old_forward(hidden_states) @@ -447,11 +447,12 @@ def new_forward( return out - def preprocess_feedback_images(self, images, vae, device, dtype) -> torch.tensor: - images_t = [self.image_to_tensor(img, dtype) for img in images] + def preprocess_feedback_images(self, images, vae, dim, device, dtype) -> 
torch.tensor: + images_t = [self.image_to_tensor(img, dim, dtype) for img in images] images_t = torch.stack(images_t).to(device) latents = vae.config.scaling_factor * vae.encode(images_t).latent_dist.sample() - return latents + + return torch.cat([latents], dim=0) def check_inputs( self, @@ -459,6 +460,8 @@ def check_inputs( negative_prompt=None, liked=None, disliked=None, + height=None, + width=None, ): if prompt is None: raise ValueError("Provide `prompt`. Cannot leave both `prompt` undefined.") @@ -476,6 +479,12 @@ def check_inputs( if disliked is not None and not isinstance(disliked, list): raise ValueError(f"`disliked` has to be of type `list` but is {type(disliked)}") + if height is not None and not isinstance(height, int): + raise ValueError(f"`height` has to be of type `int` but is {type(height)}") + + if width is not None and not isinstance(width, int): + raise ValueError(f"`width` has to be of type `int` but is {type(width)}") + @torch.no_grad() @replace_example_docstring(EXAMPLE_DOC_STRING) def __call__( @@ -485,6 +494,8 @@ def __call__( liked: Optional[Union[List[str], List[Image.Image]]] = [], disliked: Optional[Union[List[str], List[Image.Image]]] = [], generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None, + height: int = 512, + width: int = 512, return_dict: bool = True, num_images: int = 4, guidance_scale: float = 7.0, @@ -498,8 +509,6 @@ def __call__( neg_bottleneck_scale: float = 1.0, output_type: Optional[str] = "pil", latents: Optional[torch.FloatTensor] = None, - pos_weights: Optional[tuple] = (0.8, 0.8), - neg_weights: Optional[tuple] = (0.5, 0.5), ): r""" Function invoked when calling the pipeline for generation. Generate a trajectory of images with binary @@ -520,6 +529,10 @@ def __call__( generator (`torch.Generator` or `List[torch.Generator]` or `int`, *optional*): One or a list of [torch generator(s)](https://pytorch.org/docs/stable/generated/torch.Generator.html), can be int. to make generation deterministic. + height (`int`, *optional*, defaults to 512): + height of the generated image + width (`int`, *optional*, defaults to 512): + width of the generated image num_images (`int`, *optional*, defaults to 1): The number of images to generate per prompt. guidance_scale (`float`, *optional*, defaults to 7.5): @@ -531,6 +544,8 @@ def __call__( num_inference_steps (`int`, *optional*, defaults to 50): The number of denoising steps. More denoising steps usually lead to a higher quality image at the expense of slower inference. 
+ output_type (`str`, *optional*, defaults to "pil"): + defines the output type of generated image supports "np","pil" Examples: @@ -545,16 +560,28 @@ def __call__( device = self._execution_device dtype = self.unet.dtype - - latent_noise = torch.randn(num_images, 4, 64, 64, device=device, dtype=dtype) + if generator is None: + generator = torch.manual_seed(42) + shape = [ + num_images, + self.unet.config.in_channels, + height // self.vae_scale_factor, + width // self.vae_scale_factor, + ] + latent_noise = randn_tensor( + shape, + generator=generator, + device=device, + dtype=dtype, + ) positive_latents = ( - self.preprocess_feedback_images(liked, self.vae, device, dtype) + self.preprocess_feedback_images(liked, self.vae, (height, width), device, dtype) if liked and len(liked) > 0 else torch.tensor([], device=device, dtype=dtype) ) negative_latents = ( - self.preprocess_feedback_images(disliked, self.vae, device, dtype) + self.preprocess_feedback_images(disliked, self.vae, (height, width), device, dtype) if disliked and len(disliked) > 0 else torch.tensor([], device=device, dtype=dtype) ) @@ -577,7 +604,7 @@ def __call__( do_classifier_free_guidance = guidance_scale > 1.0 - (cond_prompt_embs, uncond_prompt_embs, null_prompt_emb) = self.initialize_prompts( + (cond_prompt_embs, uncond_prompt_embs, null_prompt_emb) = self._encode_prompt( prompt, device, num_images, @@ -589,7 +616,6 @@ def __call__( self.scheduler.set_timesteps(num_inference_steps, device=device) timesteps = self.scheduler.timesteps - latent_noise = latent_noise * self.scheduler.init_noise_sigma num_warmup_steps = len(timesteps) - num_inference_steps * self.scheduler.order @@ -670,7 +696,7 @@ def __call__( return FabricPipelineOutput(imgs, False) - def image_to_tensor(self, image: Union[str, Image.Image], dtype): + def image_to_tensor(self, image: Union[str, Image.Image], dim: tuple, dtype): """ Convert latent PIL image to a torch tensor for further processing. 
""" @@ -678,5 +704,5 @@ def image_to_tensor(self, image: Union[str, Image.Image], dtype): image = Image.open(image) if not image.mode == "RGB": image = image.convert("RGB") - image = self.image_processor.preprocess(image, height=512, width=512)[0] + image = self.image_processor.preprocess(image, height=dim[0], width=dim[1])[0] return image.type(dtype) From 4a1859af804251a690f8fb2caf3c13b983a5e5bc Mon Sep 17 00:00:00 2001 From: shauray8 Date: Tue, 22 Aug 2023 01:32:02 +0530 Subject: [PATCH 82/98] update docs --- docs/source/en/api/pipelines/fabric.md | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/docs/source/en/api/pipelines/fabric.md b/docs/source/en/api/pipelines/fabric.md index 183ec49e5069..9a93ba23563b 100644 --- a/docs/source/en/api/pipelines/fabric.md +++ b/docs/source/en/api/pipelines/fabric.md @@ -24,11 +24,8 @@ The original codebase can be found here: - *FABRIC*: [sd-fabric/fabric](https://github.com/sd-fabric/fabric) Available Checkpoints are: -- *dreamlike-photoreal-2.0 (512x512 resolution)* [dreamlike-art/dreamlike-photoreal-2.0](https://huggingface.co/dreamlike-art/dreamlike-photoreal-2.0) +- *dreamlike-photoreal-2.0* [dreamlike-art/dreamlike-photoreal-2.0](https://huggingface.co/dreamlike-art/dreamlike-photoreal-2.0) -[[autodoc]] FabricPipeline - - all - - __call__ ## Usage Example @@ -75,9 +72,13 @@ disliked = ["path/to/image"] image = pipe(prompt=prompt, negative_prompt=neg_prompt,liked=liked,disliked=disliked).images[0] ``` -Let's have a look at the images +Let's have a look at the images (*512X512*) | Without Feedback | With Feedback (1st image) | |---------------------|---------------------| | ![Image 1](https://drive.google.com/uc?export=view&id=12wxbikt7834eRTK40legR5PtJmFLNH34) | ![Feedback Image 1](https://drive.google.com/uc?export=view&id=1YcFPDHSRr2OE3hy-5lvr8An21Jum85D5) | + +[[autodoc]] FabricPipeline + - all + - __call__ From 34ea6df0b1af7cfd7b932e10ff19b7cd6ce41cd8 Mon Sep 17 00:00:00 2001 From: shauray8 Date: Tue, 22 Aug 2023 02:00:49 +0530 Subject: [PATCH 83/98] fixes --- .../pipelines/fabric/pipeline_fabric.py | 4 ++-- tests/pipelines/fabric/test_fabric.py | 20 +++++++++++-------- 2 files changed, 14 insertions(+), 10 deletions(-) diff --git a/src/diffusers/pipelines/fabric/pipeline_fabric.py b/src/diffusers/pipelines/fabric/pipeline_fabric.py index 924a9b499d9d..d5e773acce4b 100644 --- a/src/diffusers/pipelines/fabric/pipeline_fabric.py +++ b/src/diffusers/pipelines/fabric/pipeline_fabric.py @@ -562,12 +562,12 @@ def __call__( dtype = self.unet.dtype if generator is None: generator = torch.manual_seed(42) - shape = [ + shape = ( num_images, self.unet.config.in_channels, height // self.vae_scale_factor, width // self.vae_scale_factor, - ] + ) latent_noise = randn_tensor( shape, generator=generator, diff --git a/tests/pipelines/fabric/test_fabric.py b/tests/pipelines/fabric/test_fabric.py index 4e88fad71c07..69df600cdbc8 100644 --- a/tests/pipelines/fabric/test_fabric.py +++ b/tests/pipelines/fabric/test_fabric.py @@ -116,6 +116,8 @@ def get_dummy_inputs(self, device, seed=0): "num_images": 1, "num_inference_steps": 2, "output_type": "np", + "height":128, + "width":128, } return inputs @@ -158,10 +160,11 @@ def test_fabric_w_fb(self): assert image.shape == (1, 128, 128, 3) print(image_slice) expected_slice = np.array( - [0.77254902, 0.77647059, 0.78431373, 0.8, 0.78823529, 0.79607843, 0.78823529, 0.78823529, 0.78039216] - ) + [[0.46259943, 0.45826188, 0.4768875], + [0.4880805, 0.46087098, 0.5162324], + [0.5224824, 0.5005106, 
0.46634308]]).flatten() - assert np.abs(image_slice.flatten() / 128 - expected_slice).max() < 1e-2 + assert np.abs(image_slice.flatten() - expected_slice).max() < 1e-2 @require_torch_gpu @@ -175,12 +178,12 @@ def tearDown(self): def test_fabric(self): generator = torch.manual_seed(0) - pipe = FabricPipeline.from_pretrained("dreamlike-art/dreamlike-photoreal-2.0") + pipe = FabricPipeline.from_pretrained("dreamlike-art/dreamlike-photoreal-2.0",torch_dtype=torch.float16) pipe.to("cuda") prompt = "a photograph of an astronaut riding a horse" - images = pipe(prompt, random_seed=generator).images + images = pipe(prompt, random_seed=generator).images[0] for word, image in zip(prompt, images): expected_image = load_numpy( @@ -191,17 +194,18 @@ def test_fabric(self): def test_fabric_feedback(self): generator = torch.manual_seed(0) - pipe = FabricPipeline.from_pretrained("dreamlike-art/dreamlike-photoreal-2.0") + pipe = FabricPipeline.from_pretrained("dreamlike-art/dreamlike-photoreal-2.0",torch_dtype=float16) pipe.to("cuda") prompt = "a photograph of an astronaut riding a horse" - images = pipe(prompt, random_seed=generator).images + images = pipe(prompt, random_seed=generator).images[0] liked = [images] - images = pipe(prompt, random_seed=generator, liked=liked).images + images = pipe(prompt, random_seed=generator, liked=liked).images[0] for word, image in zip(prompt, images): expected_image = load_numpy( "https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main/dit/fabric_w_feedback.npy" ) assert np.abs((expected_image - np.array(image)).max()) < 1e-2 + From 2fe64ff04c3c7da32475a9ba3e05963f55861b60 Mon Sep 17 00:00:00 2001 From: shauray8 Date: Tue, 22 Aug 2023 17:31:12 +0530 Subject: [PATCH 84/98] test passing --- .../pipelines/fabric/pipeline_fabric.py | 95 +++++++++++-------- tests/pipelines/fabric/test_fabric.py | 59 +++++++----- 2 files changed, 90 insertions(+), 64 deletions(-) diff --git a/src/diffusers/pipelines/fabric/pipeline_fabric.py b/src/diffusers/pipelines/fabric/pipeline_fabric.py index d5e773acce4b..ca52d011d36e 100644 --- a/src/diffusers/pipelines/fabric/pipeline_fabric.py +++ b/src/diffusers/pipelines/fabric/pipeline_fabric.py @@ -229,11 +229,12 @@ def _encode_prompt( if prompt is not None and isinstance(prompt, str): batch_size = 1 + prompt = [prompt] elif prompt is not None and isinstance(prompt, list): batch_size = len(prompt) else: batch_size = prompt_embeds.shape[0] - + # prompt = [prompt] * num_images_per_prompt if prompt_embeds is None: # textual inversion: procecss multi-vector tokens if necessary if isinstance(self, TextualInversionLoaderMixin): @@ -281,7 +282,9 @@ def _encode_prompt( prompt_embeds = prompt_embeds.to(dtype=prompt_embeds_dtype, device=device) bs_embed, seq_len, _ = prompt_embeds.shape - + # duplicate text embeddings for each generation per prompt, using mps friendly method + prompt_embeds = prompt_embeds.repeat(1, num_images_per_prompt, 1) + prompt_embeds = prompt_embeds.view(bs_embed * num_images_per_prompt, seq_len, -1) # get unconditional embeddings for classifier free guidance if do_classifier_free_guidance and negative_prompt_embeds is None: uncond_tokens: List[str] @@ -329,15 +332,23 @@ def _encode_prompt( if do_classifier_free_guidance: # duplicate unconditional embeddings for each generation per prompt, using mps friendly method - negative_prompt_embeds.shape[1] + seq_len = negative_prompt_embeds.shape[1] negative_prompt_embeds = negative_prompt_embeds.to(dtype=prompt_embeds_dtype, device=device) + negative_prompt_embeds, 
null_embed = ( + negative_prompt_embeds[:batch_size], + negative_prompt_embeds[batch_size:], + ) + negative_prompt_embeds = negative_prompt_embeds.repeat(1, num_images_per_prompt, 1) + negative_prompt_embeds = negative_prompt_embeds.view(batch_size * num_images_per_prompt, seq_len, -1) + print(null_embed.shape) + print(negative_prompt_embeds.shape) + print(prompt_embeds.shape) # For classifier free guidance, we need to do two forward passes. # Here we concatenate the unconditional and text embeddings into a single batch # to avoid doing two forward passes - prompt_embeds = torch.cat([prompt_embeds, negative_prompt_embeds]) - + prompt_embeds = torch.cat([prompt_embeds, negative_prompt_embeds, null_embed]) return prompt_embeds def get_unet_hidden_states(self, z_all, t, prompt_embd): @@ -447,10 +458,10 @@ def new_forward( return out - def preprocess_feedback_images(self, images, vae, dim, device, dtype) -> torch.tensor: + def preprocess_feedback_images(self, images, vae, dim, device, dtype, generator) -> torch.tensor: images_t = [self.image_to_tensor(img, dim, dtype) for img in images] images_t = torch.stack(images_t).to(device) - latents = vae.config.scaling_factor * vae.encode(images_t).latent_dist.sample() + latents = vae.config.scaling_factor * vae.encode(images_t).latent_dist.sample(generator) return torch.cat([latents], dim=0) @@ -560,59 +571,64 @@ def __call__( device = self._execution_device dtype = self.unet.dtype - if generator is None: - generator = torch.manual_seed(42) + + if isinstance(prompt, str) and prompt is not None: + batch_size = 1 + elif isinstance(prompt, list) and prompt is not None: + batch_size = len(prompt) + else: + raise ValueError(f"`prompt` has to be of type `str` or `list` but is {type(prompt)}") + + if isinstance(negative_prompt, str): + negative_prompt = [negative_prompt] + elif isinstance(negative_prompt, list): + negative_prompt = negative_prompt + else: + assert len(negative_prompt) == batch_size + shape = ( - num_images, + batch_size * num_images, self.unet.config.in_channels, height // self.vae_scale_factor, width // self.vae_scale_factor, ) latent_noise = randn_tensor( shape, - generator=generator, device=device, dtype=dtype, + generator=generator, ) positive_latents = ( - self.preprocess_feedback_images(liked, self.vae, (height, width), device, dtype) + self.preprocess_feedback_images(liked, self.vae, (height, width), device, dtype, generator) if liked and len(liked) > 0 - else torch.tensor([], device=device, dtype=dtype) + else torch.tensor( + [], + device=device, + dtype=dtype, + ) ) negative_latents = ( - self.preprocess_feedback_images(disliked, self.vae, (height, width), device, dtype) + self.preprocess_feedback_images(disliked, self.vae, (height, width), device, dtype, generator) if disliked and len(disliked) > 0 - else torch.tensor([], device=device, dtype=dtype) + else torch.tensor( + [], + device=device, + dtype=dtype, + ) ) - if isinstance(prompt, str) and prompt is not None: - pass - elif isinstance(prompt, list) and prompt is not None: - len(prompt) - else: - pass - - prompt = [prompt] * num_images - - if isinstance(negative_prompt, str): - negative_prompt = [negative_prompt] * num_images - elif isinstance(negative_prompt, list): - negative_prompt = negative_prompt - else: - assert len(negative_prompt) == num_images - - do_classifier_free_guidance = guidance_scale > 1.0 + do_classifier_free_guidance = guidance_scale > 0.1 - (cond_prompt_embs, uncond_prompt_embs, null_prompt_emb) = self._encode_prompt( + (prompt_embs, null_prompt_emb) = 
self._encode_prompt( prompt, device, num_images, do_classifier_free_guidance, negative_prompt, - ).split([num_images, num_images, 1]) + ).split([(num_images + num_images) * batch_size, 1]) - batched_prompt_embd = torch.cat([cond_prompt_embs, uncond_prompt_embs], dim=0) + batched_prompt_embd = torch.cat([prompt_embs], dim=0) self.scheduler.set_timesteps(num_inference_steps, device=device) timesteps = self.scheduler.timesteps @@ -678,18 +694,21 @@ def __call__( cached_neg_hiddens=cached_neg_hs, pos_weights=pos_ws, neg_weights=neg_ws, - ).sample + )[0] noise_cond, noise_uncond = unet_out.chunk(2) guidance = noise_cond - noise_uncond noise_pred = noise_uncond + guidance_scale * guidance - latent_noise = self.scheduler.step(noise_pred, t, latent_noise).prev_sample + latent_noise = self.scheduler.step(noise_pred, t, latent_noise)[0] if i == len(timesteps) - 1 or ((i + 1) > num_warmup_steps and (i + 1) % self.scheduler.order == 0): pbar.update() y = self.vae.decode(latent_noise / self.vae.config.scaling_factor, return_dict=False)[0] - imgs = self.image_processor.postprocess(y, output_type=output_type) + imgs = self.image_processor.postprocess( + y, + output_type=output_type, + ) if not return_dict: return imgs diff --git a/tests/pipelines/fabric/test_fabric.py b/tests/pipelines/fabric/test_fabric.py index 69df600cdbc8..d3bfadd35c14 100644 --- a/tests/pipelines/fabric/test_fabric.py +++ b/tests/pipelines/fabric/test_fabric.py @@ -27,9 +27,10 @@ FabricPipeline, UNet2DConditionModel, ) -from diffusers.utils import load_numpy, slow +from diffusers.utils import load_numpy from diffusers.utils.testing_utils import ( enable_full_determinism, + nightly, require_torch_gpu, ) @@ -44,10 +45,8 @@ class FabricPipelineFastTests(PipelineTesterMixin, unittest.TestCase): pipeline_class = FabricPipeline params = TEXT_TO_IMAGE_PARAMS - { "negative_prompt_embeds", - "width", "prompt_embeds", "cross_attention_kwargs", - "height", "callback", "callback_steps", } @@ -108,7 +107,10 @@ def get_dummy_components(self): return components def get_dummy_inputs(self, device, seed=0): - generator = torch.manual_seed(seed) + if str(device).startswith("mps"): + generator = torch.manual_seed(seed) + else: + generator = torch.Generator(device=device).manual_seed(seed) inputs = { "prompt": "A painting of a squirrel eating a burger", "negative_prompt": "lowres, dark, cropped", @@ -116,8 +118,8 @@ def get_dummy_inputs(self, device, seed=0): "num_images": 1, "num_inference_steps": 2, "output_type": "np", - "height":128, - "width":128, + "height": 128, + "width": 128, } return inputs @@ -160,15 +162,18 @@ def test_fabric_w_fb(self): assert image.shape == (1, 128, 128, 3) print(image_slice) expected_slice = np.array( - [[0.46259943, 0.45826188, 0.4768875], - [0.4880805, 0.46087098, 0.5162324], - [0.5224824, 0.5005106, 0.46634308]]).flatten() + [ + [0.46259943, 0.45826188, 0.4768875], + [0.4880805, 0.46087098, 0.5162324], + [0.5224824, 0.5005106, 0.46634308], + ] + ).flatten() assert np.abs(image_slice.flatten() - expected_slice).max() < 1e-2 +@nightly @require_torch_gpu -@slow class FABRICPipelineIntegrationTests(unittest.TestCase): def tearDown(self): super().tearDown() @@ -178,34 +183,36 @@ def tearDown(self): def test_fabric(self): generator = torch.manual_seed(0) - pipe = FabricPipeline.from_pretrained("dreamlike-art/dreamlike-photoreal-2.0",torch_dtype=torch.float16) + pipe = FabricPipeline.from_pretrained("dreamlike-art/dreamlike-photoreal-2.0", torch_dtype=torch.float16) pipe.to("cuda") prompt = "a photograph of an astronaut 
riding a horse" + images = pipe(prompt, output_type="np", generator=generator, num_inference_steps=2).images - images = pipe(prompt, random_seed=generator).images[0] + images = images[0, -3:, -3:, -1].flatten() - for word, image in zip(prompt, images): - expected_image = load_numpy( - "https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main/dit/fabric_wo_feedback.npy" - ) - assert np.abs((expected_image - np.array(image)).max()) < 1e-2 + expected_image = load_numpy( + "https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main/dit/fabric_wo_feedback.npy" + ) + + self.assertTrue(np.allclose(images, expected_image, atol=1e-4)) def test_fabric_feedback(self): generator = torch.manual_seed(0) - pipe = FabricPipeline.from_pretrained("dreamlike-art/dreamlike-photoreal-2.0",torch_dtype=float16) + pipe = FabricPipeline.from_pretrained("dreamlike-art/dreamlike-photoreal-2.0", torch_dtype=torch.float16) pipe.to("cuda") prompt = "a photograph of an astronaut riding a horse" - images = pipe(prompt, random_seed=generator).images[0] + images = pipe(prompt, output_type="pil", generator=generator, num_inference_steps=2).images - liked = [images] - images = pipe(prompt, random_seed=generator, liked=liked).images[0] + liked = [images[0]] + images = pipe(prompt, output_type="np", generator=generator, num_inference_steps=2, liked=liked).images - for word, image in zip(prompt, images): - expected_image = load_numpy( - "https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main/dit/fabric_w_feedback.npy" - ) - assert np.abs((expected_image - np.array(image)).max()) < 1e-2 + images = images[0, -3:, -3:, -1].flatten() + + expected_image = load_numpy( + "https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main/dit/fabric_wo_feedback.npy" + ) + self.assertTrue(np.allclose(images, expected_image, atol=1e-4)) From ac517d3e17cba1d4c7956e6151325bd59d4149b9 Mon Sep 17 00:00:00 2001 From: shauray8 Date: Tue, 22 Aug 2023 17:32:19 +0530 Subject: [PATCH 85/98] remove comment --- src/diffusers/pipelines/fabric/pipeline_fabric.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/diffusers/pipelines/fabric/pipeline_fabric.py b/src/diffusers/pipelines/fabric/pipeline_fabric.py index ca52d011d36e..bc5d1d095bd4 100644 --- a/src/diffusers/pipelines/fabric/pipeline_fabric.py +++ b/src/diffusers/pipelines/fabric/pipeline_fabric.py @@ -64,7 +64,7 @@ def __call__( hidden_states, encoder_hidden_states=None, attention_mask=None, - weights=None, # shape: (batch_size, sequence_length) + weights=None, lora_scale=1.0, ): batch_size, sequence_length, _ = ( From 882f1a1f4189c9f4edb731399ddc98f6cf2158c0 Mon Sep 17 00:00:00 2001 From: shauray8 Date: Tue, 22 Aug 2023 21:02:39 +0530 Subject: [PATCH 86/98] fixes and migration --- docs/source/en/api/pipelines/fabric.md | 2 + src/diffusers/pipelines/__init__.py | 2 +- .../pipelines/stable_diffusion/README.md | 51 +++++++++++++++++++ .../pipelines/stable_diffusion/__init__.py | 1 + .../pipeline_fabric.py | 9 ++-- tests/pipelines/fabric/__init__.py | 0 .../test_fabric.py | 0 7 files changed, 58 insertions(+), 7 deletions(-) rename src/diffusers/pipelines/{fabric => stable_diffusion}/pipeline_fabric.py (99%) delete mode 100644 tests/pipelines/fabric/__init__.py rename tests/pipelines/{fabric => stable_diffusion}/test_fabric.py (100%) diff --git a/docs/source/en/api/pipelines/fabric.md b/docs/source/en/api/pipelines/fabric.md index 9a93ba23563b..e0b08e75bfd1 100644 --- 
a/docs/source/en/api/pipelines/fabric.md +++ b/docs/source/en/api/pipelines/fabric.md @@ -25,6 +25,8 @@ The original codebase can be found here: Available Checkpoints are: - *dreamlike-photoreal-2.0* [dreamlike-art/dreamlike-photoreal-2.0](https://huggingface.co/dreamlike-art/dreamlike-photoreal-2.0) +- *stable-diffusion-v1.5* [runwayml/stable-diffusion-v1-5](https://huggingface.co/runwayml/stable-diffusion-v1-5) +- *stable-diffusion-2-1* [stabilityai/stable-diffusion-2-1](https://huggingface.co/stabilityai/stable-diffusion-2-1) (may give unexpected results) ## Usage Example diff --git a/src/diffusers/pipelines/__init__.py b/src/diffusers/pipelines/__init__.py index a61b601c7e7c..29678450084e 100644 --- a/src/diffusers/pipelines/__init__.py +++ b/src/diffusers/pipelines/__init__.py @@ -60,7 +60,6 @@ IFPipeline, IFSuperResolutionPipeline, ) - from .fabric import FabricPipeline from .kandinsky import ( KandinskyCombinedPipeline, KandinskyImg2ImgCombinedPipeline, @@ -88,6 +87,7 @@ from .shap_e import ShapEImg2ImgPipeline, ShapEPipeline from .stable_diffusion import ( CycleDiffusionPipeline, + FabricPipeline, StableDiffusionAttendAndExcitePipeline, StableDiffusionDepth2ImgPipeline, StableDiffusionDiffEditPipeline, diff --git a/src/diffusers/pipelines/stable_diffusion/README.md b/src/diffusers/pipelines/stable_diffusion/README.md index 66df9a811afb..13d5a72dcb90 100644 --- a/src/diffusers/pipelines/stable_diffusion/README.md +++ b/src/diffusers/pipelines/stable_diffusion/README.md @@ -174,3 +174,54 @@ image = pipe( image.save("black_to_blue.png") ``` + +### FABRIC using Stable Diffusion + +```python +import requests +import torch +from PIL import Image +from io import BytesIO + +from diffusers import FabricPipeline + +# load the pipeline +# make sure you're logged in with `huggingface-cli login` +model_id_or_path = "runwayml/stable-diffusion-v1-5" +#can also be used with dreamlike-art/dreamlike-photoreal-2.0 +pipe = FabricPipeline.from_pretrained(model_id_or_path, ).to("cuda") + +# let's specify a prompt +prompt = "An astronaut riding an elephant" +negative_prompt = "lowres, cropped" + +# call the pipeline +image = pipe( + prompt=prompt, + negative_prompt=negative_prompt, + num_inference_steps=20, + generator=torch.manual_seed(12) +).images[0] + +image.save("horse_to_elephant.jpg") + +# let's try another example with feedback +url = "https://raw.githubusercontent.com/ChenWu98/cycle-diffusion/main/data/dalle2/A%20black%20colored%20car.png" +response = requests.get(url) +init_image = Image.open(BytesIO(response.content)).convert("RGB") + +prompt = "photo, A blue colored car, fish eye" +liked = [init_image] +## same goes with disliked + +# call the pipeline +torch.manual_seed(0) +image = pipe( + prompt=prompt, + negative_prompt=negative_prompt, + liked = liked, + num_inference_steps=20, +).images[0] + +image.save("black_to_blue.png") +``` diff --git a/src/diffusers/pipelines/stable_diffusion/__init__.py b/src/diffusers/pipelines/stable_diffusion/__init__.py index 1cef019e06a9..81717f24ef93 100644 --- a/src/diffusers/pipelines/stable_diffusion/__init__.py +++ b/src/diffusers/pipelines/stable_diffusion/__init__.py @@ -43,6 +43,7 @@ class StableDiffusionPipelineOutput(BaseOutput): from ...utils.dummy_torch_and_transformers_objects import * # noqa F403 else: from .pipeline_cycle_diffusion import CycleDiffusionPipeline + from .pipeline_fabric import FabricPipeline from .pipeline_stable_diffusion import StableDiffusionPipeline from .pipeline_stable_diffusion_attend_and_excite import 
StableDiffusionAttendAndExcitePipeline from .pipeline_stable_diffusion_gligen import StableDiffusionGLIGENPipeline diff --git a/src/diffusers/pipelines/fabric/pipeline_fabric.py b/src/diffusers/pipelines/stable_diffusion/pipeline_fabric.py similarity index 99% rename from src/diffusers/pipelines/fabric/pipeline_fabric.py rename to src/diffusers/pipelines/stable_diffusion/pipeline_fabric.py index bc5d1d095bd4..f39ffd57a16a 100644 --- a/src/diffusers/pipelines/fabric/pipeline_fabric.py +++ b/src/diffusers/pipelines/stable_diffusion/pipeline_fabric.py @@ -32,7 +32,7 @@ replace_example_docstring, ) from ..pipeline_utils import DiffusionPipeline -from . import FabricPipelineOutput +from . import StableDiffusionPipelineOutput logger = logging.get_logger(__name__) # pylint: disable=invalid-name @@ -184,7 +184,7 @@ def __init__( self.vae_scale_factor = 2 ** (len(self.vae.config.block_out_channels) - 1) self.image_processor = VaeImageProcessor(vae_scale_factor=self.vae_scale_factor) - # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion._encode_prompt + # Copied from diffusers.pipelines.stable_diffusion.pipelines.stable_diffusion._encode_prompt def _encode_prompt( self, prompt, @@ -341,9 +341,6 @@ def _encode_prompt( ) negative_prompt_embeds = negative_prompt_embeds.repeat(1, num_images_per_prompt, 1) negative_prompt_embeds = negative_prompt_embeds.view(batch_size * num_images_per_prompt, seq_len, -1) - print(null_embed.shape) - print(negative_prompt_embeds.shape) - print(prompt_embeds.shape) # For classifier free guidance, we need to do two forward passes. # Here we concatenate the unconditional and text embeddings into a single batch @@ -713,7 +710,7 @@ def __call__( if not return_dict: return imgs - return FabricPipelineOutput(imgs, False) + return StableDiffusionPipelineOutput(imgs, False) def image_to_tensor(self, image: Union[str, Image.Image], dim: tuple, dtype): """ diff --git a/tests/pipelines/fabric/__init__.py b/tests/pipelines/fabric/__init__.py deleted file mode 100644 index e69de29bb2d1..000000000000 diff --git a/tests/pipelines/fabric/test_fabric.py b/tests/pipelines/stable_diffusion/test_fabric.py similarity index 100% rename from tests/pipelines/fabric/test_fabric.py rename to tests/pipelines/stable_diffusion/test_fabric.py From b87efcc093b9a62357a756ce0686080d4ed962eb Mon Sep 17 00:00:00 2001 From: shauray8 Date: Tue, 22 Aug 2023 21:10:01 +0530 Subject: [PATCH 87/98] simpler tests --- tests/pipelines/stable_diffusion/test_fabric.py | 15 ++++++--------- 1 file changed, 6 insertions(+), 9 deletions(-) diff --git a/tests/pipelines/stable_diffusion/test_fabric.py b/tests/pipelines/stable_diffusion/test_fabric.py index d3bfadd35c14..6b3bc798ec4d 100644 --- a/tests/pipelines/stable_diffusion/test_fabric.py +++ b/tests/pipelines/stable_diffusion/test_fabric.py @@ -27,7 +27,6 @@ FabricPipeline, UNet2DConditionModel, ) -from diffusers.utils import load_numpy from diffusers.utils.testing_utils import ( enable_full_determinism, nightly, @@ -136,13 +135,12 @@ def test_fabric(self): output = pipe(**inputs) image = output.images image_slice = image[0, -3:, -3:, -1] - print(image_slice.flatten()) assert image.shape == (1, 128, 128, 3) expected_slice = np.array( [0.46241423, 0.45808375, 0.4768011, 0.48806447, 0.46090087, 0.5161956, 0.52250206, 0.50051796, 0.4663524] ) - assert np.abs(image_slice.flatten() - expected_slice).max() < 1e-2 + self.assertTrue(np.allclose(image_slice.flatten(), expected_slice, atol=1e-2)) def test_fabric_w_fb(self): device = "cpu" # ensure 
determinism for the device-dependent torch.Generator @@ -160,7 +158,6 @@ def test_fabric_w_fb(self): image_slice = output.images[0, -3:, -3:, -1] assert image.shape == (1, 128, 128, 3) - print(image_slice) expected_slice = np.array( [ [0.46259943, 0.45826188, 0.4768875], @@ -169,7 +166,7 @@ def test_fabric_w_fb(self): ] ).flatten() - assert np.abs(image_slice.flatten() - expected_slice).max() < 1e-2 + self.assertTrue(np.allclose(image_slice.flatten(), expected_slice, atol=1e-2)) @nightly @@ -191,8 +188,8 @@ def test_fabric(self): images = images[0, -3:, -3:, -1].flatten() - expected_image = load_numpy( - "https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main/dit/fabric_wo_feedback.npy" + expected_image = np.array( + [0.46241423, 0.45808375, 0.4768011, 0.48806447, 0.46090087, 0.5161956, 0.52250206, 0.50051796, 0.4663524] ) self.assertTrue(np.allclose(images, expected_image, atol=1e-4)) @@ -211,8 +208,8 @@ def test_fabric_feedback(self): images = images[0, -3:, -3:, -1].flatten() - expected_image = load_numpy( - "https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main/dit/fabric_wo_feedback.npy" + expected_image = np.array( + [0.46241423, 0.45808375, 0.4768011, 0.48806447, 0.46090087, 0.5161956, 0.52250206, 0.50051796, 0.4663524] ) self.assertTrue(np.allclose(images, expected_image, atol=1e-4)) From 330444005f5a7370d5670e1f35a4a9a097a3f574 Mon Sep 17 00:00:00 2001 From: shauray8 Date: Wed, 23 Aug 2023 00:57:49 +0530 Subject: [PATCH 88/98] doc changes --- docs/source/en/api/pipelines/fabric.md | 69 +++------------- src/diffusers/pipelines/fabric/__init__.py | 38 --------- .../stable_diffusion/pipeline_fabric.py | 80 +++++++++++-------- 3 files changed, 58 insertions(+), 129 deletions(-) delete mode 100644 src/diffusers/pipelines/fabric/__init__.py diff --git a/docs/source/en/api/pipelines/fabric.md b/docs/source/en/api/pipelines/fabric.md index e0b08e75bfd1..9478a9bcc720 100644 --- a/docs/source/en/api/pipelines/fabric.md +++ b/docs/source/en/api/pipelines/fabric.md @@ -10,69 +10,17 @@ an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express o specific language governing permissions and limitations under the License. --> -# FabricPipeline +# FABRIC [FABRIC: Personalizing Diffusion Models with Iterative Feedback](https://huggingface.co/papers/2307.10159) (FABRIC) is by Dimitri von Rütte, Elisabetta Fedele, Jonathan Thomm and Lukas Wolf -FABRIC is training-free approach that conditions the diffusion process on a set of feedback images, applicable to a wide range of popular diffusion models, created by the researchers and engineers from [ETH Zürich, Switzerland](https://github.com/sd-fabric). The [`FabricPipeline`] is capable of generating photo-realistic images given any text input using Stable Diffusion and finetune them on the basis of feedback. +FABRIC is a training-free approach that conditions the diffusion process on a set of feedback images, applicable to a wide range of popular diffusion models. It is created by researchers and engineers from [ETH Zürich, Switzerland](https://github.com/sd-fabric). The [`FabricPipeline`] can generate photo-realistic images given any text input using Stable Diffusion. -The abstract of the paper is the following: +The abstract from the paper is: *In an era where visual content generation is increasingly driven by machine learning, the integration of human feedback into generative models presents significant opportunities for enhancing user experience and output quality. 
This study explores strategies for incorporating iterative human feedback into the generative process of diffusion-based text-to-image models. We propose FABRIC, a training-free approach applicable to a wide range of popular diffusion models, which exploits the self-attention layer present in the most widely used architectures to condition the diffusion process on a set of feedback images. To ensure a rigorous assessment of our approach, we introduce a comprehensive evaluation methodology, offering a robust mechanism to quantify the performance of generative visual models that integrate human feedback. We show that generation results improve over multiple rounds of iterative feedback through exhaustive analysis, implicitly optimizing arbitrary user preferences. The potential applications of these findings extend to fields such as personalized content creation and customization* -The original codebase can be found here: -- *FABRIC*: [sd-fabric/fabric](https://github.com/sd-fabric/fabric) - -Available Checkpoints are: -- *dreamlike-photoreal-2.0* [dreamlike-art/dreamlike-photoreal-2.0](https://huggingface.co/dreamlike-art/dreamlike-photoreal-2.0) -- *stable-diffusion-v1.5* [runwayml/stable-diffusion-v1-5](https://huggingface.co/runwayml/stable-diffusion-v1-5) -- *stable-diffusion-2-1* [stabilityai/stable-diffusion-2-1](https://huggingface.co/stabilityai/stable-diffusion-2-1) (may give unexpected results) - - -## Usage Example - -Before using Fabric make sure to have `transformers`, `accelerate`, `huggingface_hub` installed. -You can install the libraries as follows: - -``` -pip install transformers -pip install accelerate -pip install huggingface_hub -``` - -### Text-to-Image - -You can use Fabric as follows for *text-to-image*: - -```py -from diffusers import FabricPipeline -import torch - -model_id = "dreamlike-art/dreamlike-photoreal-2.0" -pipe = StableDiffusionPipeline.from_pretrained(model_id, torch_dtype=torch.float16) -pipe = pipe.to("cuda") - -prompt = "Astronaut in a jungle, cold color palette, muted colors, detailed, 8k" -neg_prompt = "bad anatomy, cropped, lowres" -image = pipe(prompt=prompt, negative_prompt=neg_prompt).images[0] -``` - -You can use Fabric as follows for *text-to-image-with-feedback*: - -```py -from diffusers import FabricPipeline -import torch - -model_id = "dreamlike-art/dreamlike-photoreal-2.0" -pipe = StableDiffusionPipeline.from_pretrained(model_id, torch_dtype=torch.float16) -pipe = pipe.to("cuda") - -prompt = "Astronaut in a jungle, cold color palette, muted colors, detailed, 8k" -neg_prompt = "bad anatomy, cropped, lowres" -liked = ["path/to/image"] -disliked = ["path/to/image"] -image = pipe(prompt=prompt, negative_prompt=neg_prompt,liked=liked,disliked=disliked).images[0] -``` +The original codebase can be found at [sd-fabric/fabric](https://github.com/sd-fabric/fabric), and available checkpoints are [dreamlike-art/dreamlike-photoreal-2.0](https://huggingface.co/dreamlike-art/dreamlike-photoreal-2.0), [runwayml/stable-diffusion-v1-5](https://huggingface.co/runwayml/stable-diffusion-v1-5) and [stabilityai/stable-diffusion-2-1](https://huggingface.co/stabilityai/stable-diffusion-2-1) (may give unexpected results) Let's have a look at the images (*512X512*) @@ -80,7 +28,16 @@ Let's have a look at the images (*512X512*) |---------------------|---------------------| | ![Image 1](https://drive.google.com/uc?export=view&id=12wxbikt7834eRTK40legR5PtJmFLNH34) | ![Feedback Image 1](https://drive.google.com/uc?export=view&id=1YcFPDHSRr2OE3hy-5lvr8An21Jum85D5) | + + 
+Make sure to check out the Stable Diffusion [Tips](overview#tips) section to learn how to explore the tradeoff between scheduler speed and quality, and how to reuse pipeline components efficiently! + + [[autodoc]] FabricPipeline - all - __call__ + +## StableDiffusionPipelineOutput + +[[autodoc]] pipelines.stable_diffusion.StableDiffusionPipelineOutput diff --git a/src/diffusers/pipelines/fabric/__init__.py b/src/diffusers/pipelines/fabric/__init__.py deleted file mode 100644 index a202d7fbf512..000000000000 --- a/src/diffusers/pipelines/fabric/__init__.py +++ /dev/null @@ -1,38 +0,0 @@ -from dataclasses import dataclass -from typing import List, Optional, Union - -import numpy as np -import PIL - -from ...utils import ( - BaseOutput, - OptionalDependencyNotAvailable, - is_torch_available, -) - - -@dataclass -class FabricPipelineOutput(BaseOutput): - """ - Output class for Stable Diffusion pipelines. - - Args: - images (`List[PIL.Image.Image]` or `np.ndarray`) - List of denoised PIL images of length `batch_size` or numpy array of shape `(batch_size, height, width, - num_channels)`. PIL images or numpy array present the denoised images of the diffusion pipeline. - nsfw_content_detected (`Optional[List[bool]]`) - List of flags denoting whether the corresponding generated image likely represents "not-safe-for-work" - (nsfw) content, or `None` if safety checking could not be performed. - """ - - images: Union[List[PIL.Image.Image], np.ndarray] - nsfw_content_detected: Optional[List[bool]] - - -try: - if not is_torch_available(): - raise OptionalDependencyNotAvailable() -except OptionalDependencyNotAvailable: - from ...utils.dummy_torch_and_transformers_objects import * # noqa F403 -else: - from .pipeline_fabric import FabricPipeline diff --git a/src/diffusers/pipelines/stable_diffusion/pipeline_fabric.py b/src/diffusers/pipelines/stable_diffusion/pipeline_fabric.py index f39ffd57a16a..32a422c93806 100644 --- a/src/diffusers/pipelines/stable_diffusion/pipeline_fabric.py +++ b/src/diffusers/pipelines/stable_diffusion/pipeline_fabric.py @@ -118,27 +118,25 @@ def __call__( class FabricPipeline(DiffusionPipeline): r""" Pipeline for text-to-image generation using Stable Diffusion and conditioning the results using feedback images. - - This model inherits from [`DiffusionPipeline`]. Check the superclass documentation for the generic methods the - library implements for all the pipelines (such as downloading or saving, running on a particular device, etc.) + This model inherits from [`DiffusionPipeline`]. Check the superclass documentation for the generic methods + implemented for all pipelines (downloading, saving, running on a particular device, etc.). Args: vae ([`AutoencoderKL`]): Variational Auto-Encoder (VAE) Model to encode and decode images to and from latent representations. - text_encoder ([`CLIPTextModel`]): - Frozen text-encoder. Stable Diffusion uses the text portion of - [CLIP](https://huggingface.co/docs/transformers/model_doc/clip#transformers.CLIPTextModel), specifically - the [clip-vit-large-patch14](https://huggingface.co/openai/clip-vit-large-patch14) variant. - tokenizer (`CLIPTokenizer`): - Tokenizer of class - [CLIPTokenizer](https://huggingface.co/docs/transformers/v4.21.0/en/model_doc/clip#transformers.CLIPTokenizer). - unet ([`UNet2DConditionModel`]): Conditional U-Net architecture to denoise the encoded image latents. + text_encoder ([`~transformers.CLIPTextModel`]): + Frozen text-encoder ([clip-vit-large-patch14](https://huggingface.co/openai/clip-vit-large-patch14)). 
+ tokenizer ([`~transformers.CLIPTokenizer`]): + A `CLIPTokenizer` to tokenize text. + unet ([`UNet2DConditionModel`]): + A `UNet2DConditionModel` to denoise the encoded image latents. scheduler ([`EulerAncestralDiscreteScheduler`]): A scheduler to be used in combination with `unet` to denoise the encoded image latents. Can be one of [`DDIMScheduler`], [`LMSDiscreteScheduler`], or [`PNDMScheduler`]. safety_checker ([`StableDiffusionSafetyChecker`]): Classification module that estimates whether generated images could be considered offensive or harmful. - Please, refer to the [model card](https://huggingface.co/runwayml/stable-diffusion-v1-5) for details. + Please refer to the [model card](https://huggingface.co/runwayml/stable-diffusion-v1-5) for more details + about a model's potential harms. """ def __init__( @@ -508,6 +506,7 @@ def __call__( num_images: int = 4, guidance_scale: float = 7.0, num_inference_steps: int = 20, + output_type: Optional[str] = "pil", feedback_start_ratio: float = 0.33, feedback_end_ratio: float = 0.66, min_weight: float = 0.05, @@ -515,52 +514,63 @@ def __call__( neg_scale: float = 0.5, pos_bottleneck_scale: float = 1.0, neg_bottleneck_scale: float = 1.0, - output_type: Optional[str] = "pil", latents: Optional[torch.FloatTensor] = None, ): r""" - Function invoked when calling the pipeline for generation. Generate a trajectory of images with binary - feedback. The feedback can be given as a list of liked and disliked images. + The call function to the pipeline for generation. Generate a trajectory of images with binary feedback. The + feedback can be given as a list of liked and disliked images. Args: prompt (`str` or `List[str]`, *optional*): - The prompt or prompts to guide the image generation. If not defined, one has to pass `prompt_embeds`. + The prompt or prompts to guide image generation. If not defined, you need to pass `prompt_embeds` instead. negative_prompt (`str` or `List[str]`, *optional*): The prompt or prompts not to guide the image generation. If not defined, one has to pass `negative_prompt_embeds` instead. Ignored when not using guidance (i.e., ignored if `guidance_scale` is less than `1`). liked (`List[Image.Image]` or `List[str]`, *optional*): - Liked enables feedback through images, encourages images with liked features. + Encourages images with liked features. disliked (`List[Image.Image]` or `List[str]`, *optional*): - Disliked enables feedback through images, discourages images with disliked features. + Discourages images with disliked features. generator (`torch.Generator` or `List[torch.Generator]` or `int`, *optional*): - One or a list of [torch generator(s)](https://pytorch.org/docs/stable/generated/torch.Generator.html), - can be int. to make generation deterministic. + A [`torch.Generator`](https://pytorch.org/docs/stable/generated/torch.Generator.html) or an `int` to + make generation deterministic. height (`int`, *optional*, defaults to 512): - height of the generated image + Height of the generated image width (`int`, *optional*, defaults to 512): - width of the generated image - num_images (`int`, *optional*, defaults to 1): + Width of the generated image + num_images (`int`, *optional*, defaults to 4): The number of images to generate per prompt. - guidance_scale (`float`, *optional*, defaults to 7.5): - Guidance scale as defined in [Classifier-Free Diffusion Guidance](https://arxiv.org/abs/2207.12598). - `guidance_scale` is defined as `w` of equation 2. of [Imagen - Paper](https://arxiv.org/pdf/2205.11487.pdf). 
Guidance scale is enabled by setting `guidance_scale > - 1`. Higher guidance scale encourages to generate images that are closely linked to the text `prompt`, - usually at the expense of lower image quality. - num_inference_steps (`int`, *optional*, defaults to 50): + guidance_scale (`float`, *optional*, defaults to 7.0): + A higher guidance scale value encourages the model to generate images closely linked to the text + `prompt` at the expense of lower image quality. Guidance scale is enabled when `guidance_scale > 1`. + num_inference_steps (`int`, *optional*, defaults to 20): The number of denoising steps. More denoising steps usually lead to a higher quality image at the expense of slower inference. - output_type (`str`, *optional*, defaults to "pil"): - defines the output type of generated image supports "np","pil" + output_type (`str`, *optional*, defaults to `"pil"`): + The output format of the generated image. Choose between `PIL.Image` or `np.array`. + return_dict (`bool`, *optional*, defaults to `True`): + Whether or not to return a [`~pipelines.stable_diffusion.StableDiffusionPipelineOutput`] instead of a + plain tuple. + feedback_start_ratio (`float`, *optional*, defaults to `.33`): + Start point for providing feedback (between 0 and 1). + feedback_end_ratio (`float`, *optional*, defaults to `.66`): + End point for providing feedback (between 0 and 1). + min_weight (`float`, *optional*, defaults to `.05`): + Minimum weight for feedback. + max_weight (`float`, *optional*, defults tp `1.0`): + Maximum weight for feedback. + neg_scale (`float`, *optional*, defaults to `.5`): + Scale factor for negative feedback. Examples: Returns: - [`~pipelines.fabric.FabricPipelineOutput`] or `tuple`: When returning a tuple, the first element is a list - with the generated images, and the second element is a list of `bool`s denoting whether the corresponding - generated image likely represents "not-safe-for-work" (nsfw) content, according to the `safety_checker`. + [`~pipelines.fabric.FabricPipelineOutput`] or `tuple`: + If `return_dict` is `True`, [`~pipelines.fabric.FabricPipelineOutput`] is returned, otherwise a `tuple` + is returned where the first element is a list with the generated images and the second element is a + list of `bool`s indicating whether the corresponding generated image contains "not-safe-for-work" + (nsfw) content. 
""" From 4a2b84aad47273d005d4e06dec574d75b855ad12 Mon Sep 17 00:00:00 2001 From: shauray8 Date: Wed, 23 Aug 2023 10:30:20 +0530 Subject: [PATCH 89/98] green CI --- src/diffusers/pipelines/stable_diffusion/pipeline_fabric.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/diffusers/pipelines/stable_diffusion/pipeline_fabric.py b/src/diffusers/pipelines/stable_diffusion/pipeline_fabric.py index 32a422c93806..076666d68377 100644 --- a/src/diffusers/pipelines/stable_diffusion/pipeline_fabric.py +++ b/src/diffusers/pipelines/stable_diffusion/pipeline_fabric.py @@ -182,7 +182,7 @@ def __init__( self.vae_scale_factor = 2 ** (len(self.vae.config.block_out_channels) - 1) self.image_processor = VaeImageProcessor(vae_scale_factor=self.vae_scale_factor) - # Copied from diffusers.pipelines.stable_diffusion.pipelines.stable_diffusion._encode_prompt + # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline._encode_prompt def _encode_prompt( self, prompt, From b9002ee73ecca94c98605f3025d8586ac8feb5fd Mon Sep 17 00:00:00 2001 From: shauray8 Date: Wed, 23 Aug 2023 14:22:24 +0530 Subject: [PATCH 90/98] changes --- .../stable_diffusion/pipeline_fabric.py | 44 +- tests/pipelines/stable_diffusion/ff | 1377 +++++++++++++++++ .../pipelines/stable_diffusion/test_fabric.py | 26 + 3 files changed, 1434 insertions(+), 13 deletions(-) create mode 100644 tests/pipelines/stable_diffusion/ff diff --git a/src/diffusers/pipelines/stable_diffusion/pipeline_fabric.py b/src/diffusers/pipelines/stable_diffusion/pipeline_fabric.py index 076666d68377..535f403137ca 100644 --- a/src/diffusers/pipelines/stable_diffusion/pipeline_fabric.py +++ b/src/diffusers/pipelines/stable_diffusion/pipeline_fabric.py @@ -227,12 +227,11 @@ def _encode_prompt( if prompt is not None and isinstance(prompt, str): batch_size = 1 - prompt = [prompt] elif prompt is not None and isinstance(prompt, list): batch_size = len(prompt) else: batch_size = prompt_embeds.shape[0] - # prompt = [prompt] * num_images_per_prompt + if prompt_embeds is None: # textual inversion: procecss multi-vector tokens if necessary if isinstance(self, TextualInversionLoaderMixin): @@ -283,6 +282,7 @@ def _encode_prompt( # duplicate text embeddings for each generation per prompt, using mps friendly method prompt_embeds = prompt_embeds.repeat(1, num_images_per_prompt, 1) prompt_embeds = prompt_embeds.view(bs_embed * num_images_per_prompt, seq_len, -1) + # get unconditional embeddings for classifier free guidance if do_classifier_free_guidance and negative_prompt_embeds is None: uncond_tokens: List[str] @@ -294,7 +294,7 @@ def _encode_prompt( f" {type(prompt)}." ) elif isinstance(negative_prompt, str): - uncond_tokens = [negative_prompt] + [""] + uncond_tokens = [negative_prompt] elif batch_size != len(negative_prompt): raise ValueError( f"`negative_prompt`: {negative_prompt} has batch size {len(negative_prompt)}, but `prompt`:" @@ -302,7 +302,7 @@ def _encode_prompt( " the batch size of `prompt`." 
) else: - uncond_tokens = negative_prompt + [""] + uncond_tokens = negative_prompt # textual inversion: procecss multi-vector tokens if necessary if isinstance(self, TextualInversionLoaderMixin): @@ -333,17 +333,15 @@ def _encode_prompt( seq_len = negative_prompt_embeds.shape[1] negative_prompt_embeds = negative_prompt_embeds.to(dtype=prompt_embeds_dtype, device=device) - negative_prompt_embeds, null_embed = ( - negative_prompt_embeds[:batch_size], - negative_prompt_embeds[batch_size:], - ) + negative_prompt_embeds = negative_prompt_embeds.repeat(1, num_images_per_prompt, 1) negative_prompt_embeds = negative_prompt_embeds.view(batch_size * num_images_per_prompt, seq_len, -1) # For classifier free guidance, we need to do two forward passes. # Here we concatenate the unconditional and text embeddings into a single batch # to avoid doing two forward passes - prompt_embeds = torch.cat([prompt_embeds, negative_prompt_embeds, null_embed]) + prompt_embeds = torch.cat([negative_prompt_embeds, prompt_embeds]) + return prompt_embeds def get_unet_hidden_states(self, z_all, t, prompt_embd): @@ -587,7 +585,7 @@ def __call__( raise ValueError(f"`prompt` has to be of type `str` or `list` but is {type(prompt)}") if isinstance(negative_prompt, str): - negative_prompt = [negative_prompt] + negative_prompt = negative_prompt elif isinstance(negative_prompt, list): negative_prompt = negative_prompt else: @@ -627,15 +625,35 @@ def __call__( do_classifier_free_guidance = guidance_scale > 0.1 - (prompt_embs, null_prompt_emb) = self._encode_prompt( + (prompt_neg_embs, prompt_pos_embs) = self._encode_prompt( prompt, device, num_images, do_classifier_free_guidance, negative_prompt, - ).split([(num_images + num_images) * batch_size, 1]) + ).split([num_images * batch_size, num_images * batch_size]) + + batched_prompt_embd = torch.cat([prompt_pos_embs, prompt_neg_embs], dim=0) + + null_tokens = self.tokenizer( + [""], + return_tensors="pt", + max_length=self.tokenizer.model_max_length, + padding="max_length", + truncation=True, + ) + + if hasattr(self.text_encoder.config, "use_attention_mask") and self.text_encoder.config.use_attention_mask: + attention_mask = null_tokens.attention_mask.to(device) + else: + attention_mask = None + + null_prompt_emb = self.text_encoder( + input_ids=null_tokens.input_ids.to(device), + attention_mask=attention_mask, + ).last_hidden_state - batched_prompt_embd = torch.cat([prompt_embs], dim=0) + null_prompt_emb = null_prompt_emb.to(device=device, dtype=dtype) self.scheduler.set_timesteps(num_inference_steps, device=device) timesteps = self.scheduler.timesteps diff --git a/tests/pipelines/stable_diffusion/ff b/tests/pipelines/stable_diffusion/ff new file mode 100644 index 000000000000..90517feb020b --- /dev/null +++ b/tests/pipelines/stable_diffusion/ff @@ -0,0 +1,1377 @@ +============================= test session starts ============================== +platform linux -- Python 3.10.6, pytest-7.4.0, pluggy-1.2.0 +rootdir: /home/fractal/files/diffusers +plugins: hydra-core-1.3.2, xdist-3.3.1, anyio-3.7.1 +collected 22 items + +test_fabric.py FF.FFF + +=================================== FAILURES =================================== +_________ FabricPipelineFastTests.test_attention_slicing_forward_pass __________ + +self = +expected_max_diff = 0.001 + + def test_attention_slicing_forward_pass(self, expected_max_diff=1e-3): +> self._test_attention_slicing_forward_pass(expected_max_diff=expected_max_diff) + +../test_pipelines_common.py:650: +_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ 
_ _ _ _ _ _ _ _ _ _ _ _ _ +../test_pipelines_common.py:664: in _test_attention_slicing_forward_pass + output_without_slicing = pipe(**inputs)[0] +/home/fractal/.local/lib/python3.10/site-packages/torch/utils/_contextlib.py:115: in decorate_context + return func(*args, **kwargs) +_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ + +self = FabricPipeline { + "_class_name": "FabricPipeline", + "_diffusers_version": "0.21.0.dev0", + "scheduler": [ + "diffu... ], + "unet": [ + "diffusers", + "UNet2DConditionModel" + ], + "vae": [ + "diffusers", + "AutoencoderKL" + ] +} + +prompt = 'A painting of a squirrel eating a burger' +negative_prompt = ['lowres, dark, cropped'], liked = [], disliked = [] +generator = , height = 128 +width = 128, return_dict = True, num_images = 1, guidance_scale = 7.0 +num_inference_steps = 2, output_type = 'np', feedback_start_ratio = 0.33 +feedback_end_ratio = 0.66, min_weight = 0.05, max_weight = 0.8, neg_scale = 0.5 +pos_bottleneck_scale = 1.0, neg_bottleneck_scale = 1.0, latents = None + + @torch.no_grad() + @replace_example_docstring(EXAMPLE_DOC_STRING) + def __call__( + self, + prompt: Optional[Union[str, List[str]]] = "", + negative_prompt: Optional[Union[str, List[str]]] = "lowres, bad anatomy, bad hands, cropped, worst quality", + liked: Optional[Union[List[str], List[Image.Image]]] = [], + disliked: Optional[Union[List[str], List[Image.Image]]] = [], + generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None, + height: int = 512, + width: int = 512, + return_dict: bool = True, + num_images: int = 4, + guidance_scale: float = 7.0, + num_inference_steps: int = 20, + output_type: Optional[str] = "pil", + feedback_start_ratio: float = 0.33, + feedback_end_ratio: float = 0.66, + min_weight: float = 0.05, + max_weight: float = 0.8, + neg_scale: float = 0.5, + pos_bottleneck_scale: float = 1.0, + neg_bottleneck_scale: float = 1.0, + latents: Optional[torch.FloatTensor] = None, + ): + r""" + The call function to the pipeline for generation. Generate a trajectory of images with binary feedback. The + feedback can be given as a list of liked and disliked images. + + Args: + prompt (`str` or `List[str]`, *optional*): + The prompt or prompts to guide image generation. If not defined, you need to pass `prompt_embeds` + instead. + negative_prompt (`str` or `List[str]`, *optional*): + The prompt or prompts not to guide the image generation. If not defined, one has to pass + `negative_prompt_embeds` instead. Ignored when not using guidance (i.e., ignored if `guidance_scale` is + less than `1`). + liked (`List[Image.Image]` or `List[str]`, *optional*): + Encourages images with liked features. + disliked (`List[Image.Image]` or `List[str]`, *optional*): + Discourages images with disliked features. + generator (`torch.Generator` or `List[torch.Generator]` or `int`, *optional*): + A [`torch.Generator`](https://pytorch.org/docs/stable/generated/torch.Generator.html) or an `int` to + make generation deterministic. + height (`int`, *optional*, defaults to 512): + Height of the generated image + width (`int`, *optional*, defaults to 512): + Width of the generated image + num_images (`int`, *optional*, defaults to 4): + The number of images to generate per prompt. + guidance_scale (`float`, *optional*, defaults to 7.0): + A higher guidance scale value encourages the model to generate images closely linked to the text + `prompt` at the expense of lower image quality. Guidance scale is enabled when `guidance_scale > 1`. 
+ num_inference_steps (`int`, *optional*, defaults to 20): + The number of denoising steps. More denoising steps usually lead to a higher quality image at the + expense of slower inference. + output_type (`str`, *optional*, defaults to `"pil"`): + The output format of the generated image. Choose between `PIL.Image` or `np.array`. + return_dict (`bool`, *optional*, defaults to `True`): + Whether or not to return a [`~pipelines.stable_diffusion.StableDiffusionPipelineOutput`] instead of a + plain tuple. + feedback_start_ratio (`float`, *optional*, defaults to `.33`): + Start point for providing feedback (between 0 and 1). + feedback_end_ratio (`float`, *optional*, defaults to `.66`): + End point for providing feedback (between 0 and 1). + min_weight (`float`, *optional*, defaults to `.05`): + Minimum weight for feedback. + max_weight (`float`, *optional*, defults tp `1.0`): + Maximum weight for feedback. + neg_scale (`float`, *optional*, defaults to `.5`): + Scale factor for negative feedback. + + Examples: + + Returns: + [`~pipelines.fabric.FabricPipelineOutput`] or `tuple`: + If `return_dict` is `True`, [`~pipelines.fabric.FabricPipelineOutput`] is returned, otherwise a `tuple` + is returned where the first element is a list with the generated images and the second element is a + list of `bool`s indicating whether the corresponding generated image contains "not-safe-for-work" + (nsfw) content. + + """ + + self.check_inputs(prompt, negative_prompt, liked, disliked) + + device = self._execution_device + dtype = self.unet.dtype + + if isinstance(prompt, str) and prompt is not None: + batch_size = 1 + elif isinstance(prompt, list) and prompt is not None: + batch_size = len(prompt) + else: + raise ValueError(f"`prompt` has to be of type `str` or `list` but is {type(prompt)}") + + if isinstance(negative_prompt, str): + negative_prompt = [negative_prompt] + elif isinstance(negative_prompt, list): + negative_prompt = negative_prompt + else: + assert len(negative_prompt) == batch_size + + shape = ( + batch_size * num_images, + self.unet.config.in_channels, + height // self.vae_scale_factor, + width // self.vae_scale_factor, + ) + latent_noise = randn_tensor( + shape, + device=device, + dtype=dtype, + generator=generator, + ) + + positive_latents = ( + self.preprocess_feedback_images(liked, self.vae, (height, width), device, dtype, generator) + if liked and len(liked) > 0 + else torch.tensor( + [], + device=device, + dtype=dtype, + ) + ) + negative_latents = ( + self.preprocess_feedback_images(disliked, self.vae, (height, width), device, dtype, generator) + if disliked and len(disliked) > 0 + else torch.tensor( + [], + device=device, + dtype=dtype, + ) + ) + + do_classifier_free_guidance = guidance_scale > 0.1 + + prompt_embs = self._encode_prompt( + prompt, + device, + num_images, + do_classifier_free_guidance, + negative_prompt,) + + null_tokens = self.tokenizer( + [""], + return_tensors="pt", + max_length=self.tokenizer.model_max_length, + padding="max_length", + truncation=True, + ) + + if ( + hasattr(self.text_encoder.config, "use_attention_mask") + and self.text_encoder.config.use_attention_mask + ): + attention_mask = prompt_tokens.attention_mask.to(self.device) + else: + attention_mask = None + + null_prompt_emb = self.text_encoder( + input_ids=null_tokens.input_ids.to(self.device), + attention_mask=attention_mask, + ).last_hidden_state + + batched_prompt_embd = torch.cat([prompt_embs], dim=0) + + self.scheduler.set_timesteps(num_inference_steps, device=device) + timesteps = 
self.scheduler.timesteps + latent_noise = latent_noise * self.scheduler.init_noise_sigma + + num_warmup_steps = len(timesteps) - num_inference_steps * self.scheduler.order + + ref_start_idx = round(len(timesteps) * feedback_start_ratio) + ref_end_idx = round(len(timesteps) * feedback_end_ratio) + + with self.progress_bar(total=num_inference_steps) as pbar: + for i, t in enumerate(timesteps): + sigma = self.scheduler.sigma_t[t] if hasattr(self.scheduler, "sigma_t") else 0 + if hasattr(self.scheduler, "sigmas"): + sigma = self.scheduler.sigmas[i] + + alpha_hat = 1 / (sigma**2 + 1) + + z_single = self.scheduler.scale_model_input(latent_noise, t) + z_all = torch.cat([z_single] * 2, dim=0) + z_ref = torch.cat([positive_latents, negative_latents], dim=0) + + if i >= ref_start_idx and i <= ref_end_idx: + weight_factor = max_weight + else: + weight_factor = min_weight + + pos_ws = (weight_factor, weight_factor * pos_bottleneck_scale) + neg_ws = (weight_factor * neg_scale, weight_factor * neg_scale * neg_bottleneck_scale) + + if z_ref.size(0) > 0 and weight_factor > 0: + noise = torch.randn_like(z_ref) + if isinstance(self.scheduler, EulerAncestralDiscreteScheduler): + z_ref_noised = (alpha_hat**0.5 * z_ref + (1 - alpha_hat) ** 0.5 * noise).type(dtype) + else: + z_ref_noised = self.scheduler.add_noise(z_ref, noise, t) + + ref_prompt_embd = torch.cat( + [null_prompt_emb] * (len(positive_latents) + len(negative_latents)), dim=0 + ) + cached_hidden_states = self.get_unet_hidden_states(z_ref_noised, t, ref_prompt_embd) + + n_pos, n_neg = positive_latents.shape[0], negative_latents.shape[0] + cached_pos_hs, cached_neg_hs = [], [] + for hs in cached_hidden_states: + cached_pos, cached_neg = hs.split([n_pos, n_neg], dim=0) + cached_pos = cached_pos.view(1, -1, *cached_pos.shape[2:]).expand(num_images, -1, -1) + cached_neg = cached_neg.view(1, -1, *cached_neg.shape[2:]).expand(num_images, -1, -1) + cached_pos_hs.append(cached_pos) + cached_neg_hs.append(cached_neg) + + if n_pos == 0: + cached_pos_hs = None + if n_neg == 0: + cached_neg_hs = None + else: + cached_pos_hs, cached_neg_hs = None, None +> print("shapehape",cached_pos_hs[0].shape) +E TypeError: 'NoneType' object is not subscriptable + +../../../src/diffusers/pipelines/stable_diffusion/pipeline_fabric.py:717: TypeError +_______________________ FabricPipelineFastTests.test_cfg _______________________ + +self = + + def test_cfg(self): + sig = inspect.signature(self.pipeline_class.__call__) + + if "guidance_scale" not in sig.parameters: + return + + components = self.get_dummy_components() + pipe = self.pipeline_class(**components) + pipe = pipe.to(torch_device) + pipe.set_progress_bar_config(disable=None) + + inputs = self.get_dummy_inputs(torch_device) + + inputs["guidance_scale"] = 1.0 +> out_no_cfg = pipe(**inputs)[0] + +../test_pipelines_common.py:797: +_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ +/home/fractal/.local/lib/python3.10/site-packages/torch/utils/_contextlib.py:115: in decorate_context + return func(*args, **kwargs) +_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ + +self = FabricPipeline { + "_class_name": "FabricPipeline", + "_diffusers_version": "0.21.0.dev0", + "scheduler": [ + "diffu... 
], + "unet": [ + "diffusers", + "UNet2DConditionModel" + ], + "vae": [ + "diffusers", + "AutoencoderKL" + ] +} + +prompt = 'A painting of a squirrel eating a burger' +negative_prompt = ['lowres, dark, cropped'], liked = [], disliked = [] +generator = , height = 128 +width = 128, return_dict = True, num_images = 1, guidance_scale = 1.0 +num_inference_steps = 2, output_type = 'np', feedback_start_ratio = 0.33 +feedback_end_ratio = 0.66, min_weight = 0.05, max_weight = 0.8, neg_scale = 0.5 +pos_bottleneck_scale = 1.0, neg_bottleneck_scale = 1.0, latents = None + + @torch.no_grad() + @replace_example_docstring(EXAMPLE_DOC_STRING) + def __call__( + self, + prompt: Optional[Union[str, List[str]]] = "", + negative_prompt: Optional[Union[str, List[str]]] = "lowres, bad anatomy, bad hands, cropped, worst quality", + liked: Optional[Union[List[str], List[Image.Image]]] = [], + disliked: Optional[Union[List[str], List[Image.Image]]] = [], + generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None, + height: int = 512, + width: int = 512, + return_dict: bool = True, + num_images: int = 4, + guidance_scale: float = 7.0, + num_inference_steps: int = 20, + output_type: Optional[str] = "pil", + feedback_start_ratio: float = 0.33, + feedback_end_ratio: float = 0.66, + min_weight: float = 0.05, + max_weight: float = 0.8, + neg_scale: float = 0.5, + pos_bottleneck_scale: float = 1.0, + neg_bottleneck_scale: float = 1.0, + latents: Optional[torch.FloatTensor] = None, + ): + r""" + The call function to the pipeline for generation. Generate a trajectory of images with binary feedback. The + feedback can be given as a list of liked and disliked images. + + Args: + prompt (`str` or `List[str]`, *optional*): + The prompt or prompts to guide image generation. If not defined, you need to pass `prompt_embeds` + instead. + negative_prompt (`str` or `List[str]`, *optional*): + The prompt or prompts not to guide the image generation. If not defined, one has to pass + `negative_prompt_embeds` instead. Ignored when not using guidance (i.e., ignored if `guidance_scale` is + less than `1`). + liked (`List[Image.Image]` or `List[str]`, *optional*): + Encourages images with liked features. + disliked (`List[Image.Image]` or `List[str]`, *optional*): + Discourages images with disliked features. + generator (`torch.Generator` or `List[torch.Generator]` or `int`, *optional*): + A [`torch.Generator`](https://pytorch.org/docs/stable/generated/torch.Generator.html) or an `int` to + make generation deterministic. + height (`int`, *optional*, defaults to 512): + Height of the generated image + width (`int`, *optional*, defaults to 512): + Width of the generated image + num_images (`int`, *optional*, defaults to 4): + The number of images to generate per prompt. + guidance_scale (`float`, *optional*, defaults to 7.0): + A higher guidance scale value encourages the model to generate images closely linked to the text + `prompt` at the expense of lower image quality. Guidance scale is enabled when `guidance_scale > 1`. + num_inference_steps (`int`, *optional*, defaults to 20): + The number of denoising steps. More denoising steps usually lead to a higher quality image at the + expense of slower inference. + output_type (`str`, *optional*, defaults to `"pil"`): + The output format of the generated image. Choose between `PIL.Image` or `np.array`. + return_dict (`bool`, *optional*, defaults to `True`): + Whether or not to return a [`~pipelines.stable_diffusion.StableDiffusionPipelineOutput`] instead of a + plain tuple. 
+ feedback_start_ratio (`float`, *optional*, defaults to `.33`): + Start point for providing feedback (between 0 and 1). + feedback_end_ratio (`float`, *optional*, defaults to `.66`): + End point for providing feedback (between 0 and 1). + min_weight (`float`, *optional*, defaults to `.05`): + Minimum weight for feedback. + max_weight (`float`, *optional*, defults tp `1.0`): + Maximum weight for feedback. + neg_scale (`float`, *optional*, defaults to `.5`): + Scale factor for negative feedback. + + Examples: + + Returns: + [`~pipelines.fabric.FabricPipelineOutput`] or `tuple`: + If `return_dict` is `True`, [`~pipelines.fabric.FabricPipelineOutput`] is returned, otherwise a `tuple` + is returned where the first element is a list with the generated images and the second element is a + list of `bool`s indicating whether the corresponding generated image contains "not-safe-for-work" + (nsfw) content. + + """ + + self.check_inputs(prompt, negative_prompt, liked, disliked) + + device = self._execution_device + dtype = self.unet.dtype + + if isinstance(prompt, str) and prompt is not None: + batch_size = 1 + elif isinstance(prompt, list) and prompt is not None: + batch_size = len(prompt) + else: + raise ValueError(f"`prompt` has to be of type `str` or `list` but is {type(prompt)}") + + if isinstance(negative_prompt, str): + negative_prompt = [negative_prompt] + elif isinstance(negative_prompt, list): + negative_prompt = negative_prompt + else: + assert len(negative_prompt) == batch_size + + shape = ( + batch_size * num_images, + self.unet.config.in_channels, + height // self.vae_scale_factor, + width // self.vae_scale_factor, + ) + latent_noise = randn_tensor( + shape, + device=device, + dtype=dtype, + generator=generator, + ) + + positive_latents = ( + self.preprocess_feedback_images(liked, self.vae, (height, width), device, dtype, generator) + if liked and len(liked) > 0 + else torch.tensor( + [], + device=device, + dtype=dtype, + ) + ) + negative_latents = ( + self.preprocess_feedback_images(disliked, self.vae, (height, width), device, dtype, generator) + if disliked and len(disliked) > 0 + else torch.tensor( + [], + device=device, + dtype=dtype, + ) + ) + + do_classifier_free_guidance = guidance_scale > 0.1 + + prompt_embs = self._encode_prompt( + prompt, + device, + num_images, + do_classifier_free_guidance, + negative_prompt,) + + null_tokens = self.tokenizer( + [""], + return_tensors="pt", + max_length=self.tokenizer.model_max_length, + padding="max_length", + truncation=True, + ) + + if ( + hasattr(self.text_encoder.config, "use_attention_mask") + and self.text_encoder.config.use_attention_mask + ): + attention_mask = prompt_tokens.attention_mask.to(self.device) + else: + attention_mask = None + + null_prompt_emb = self.text_encoder( + input_ids=null_tokens.input_ids.to(self.device), + attention_mask=attention_mask, + ).last_hidden_state + + batched_prompt_embd = torch.cat([prompt_embs], dim=0) + + self.scheduler.set_timesteps(num_inference_steps, device=device) + timesteps = self.scheduler.timesteps + latent_noise = latent_noise * self.scheduler.init_noise_sigma + + num_warmup_steps = len(timesteps) - num_inference_steps * self.scheduler.order + + ref_start_idx = round(len(timesteps) * feedback_start_ratio) + ref_end_idx = round(len(timesteps) * feedback_end_ratio) + + with self.progress_bar(total=num_inference_steps) as pbar: + for i, t in enumerate(timesteps): + sigma = self.scheduler.sigma_t[t] if hasattr(self.scheduler, "sigma_t") else 0 + if hasattr(self.scheduler, "sigmas"): + sigma 
= self.scheduler.sigmas[i] + + alpha_hat = 1 / (sigma**2 + 1) + + z_single = self.scheduler.scale_model_input(latent_noise, t) + z_all = torch.cat([z_single] * 2, dim=0) + z_ref = torch.cat([positive_latents, negative_latents], dim=0) + + if i >= ref_start_idx and i <= ref_end_idx: + weight_factor = max_weight + else: + weight_factor = min_weight + + pos_ws = (weight_factor, weight_factor * pos_bottleneck_scale) + neg_ws = (weight_factor * neg_scale, weight_factor * neg_scale * neg_bottleneck_scale) + + if z_ref.size(0) > 0 and weight_factor > 0: + noise = torch.randn_like(z_ref) + if isinstance(self.scheduler, EulerAncestralDiscreteScheduler): + z_ref_noised = (alpha_hat**0.5 * z_ref + (1 - alpha_hat) ** 0.5 * noise).type(dtype) + else: + z_ref_noised = self.scheduler.add_noise(z_ref, noise, t) + + ref_prompt_embd = torch.cat( + [null_prompt_emb] * (len(positive_latents) + len(negative_latents)), dim=0 + ) + cached_hidden_states = self.get_unet_hidden_states(z_ref_noised, t, ref_prompt_embd) + + n_pos, n_neg = positive_latents.shape[0], negative_latents.shape[0] + cached_pos_hs, cached_neg_hs = [], [] + for hs in cached_hidden_states: + cached_pos, cached_neg = hs.split([n_pos, n_neg], dim=0) + cached_pos = cached_pos.view(1, -1, *cached_pos.shape[2:]).expand(num_images, -1, -1) + cached_neg = cached_neg.view(1, -1, *cached_neg.shape[2:]).expand(num_images, -1, -1) + cached_pos_hs.append(cached_pos) + cached_neg_hs.append(cached_neg) + + if n_pos == 0: + cached_pos_hs = None + if n_neg == 0: + cached_neg_hs = None + else: + cached_pos_hs, cached_neg_hs = None, None +> print("shapehape",cached_pos_hs[0].shape) +E TypeError: 'NoneType' object is not subscriptable + +../../../src/diffusers/pipelines/stable_diffusion/pipeline_fabric.py:717: TypeError +____________ FabricPipelineFastTests.test_cpu_offload_forward_pass _____________ + +self = +expected_max_diff = 0.0001 + + @unittest.skipIf( + torch_device != "cuda" or not is_accelerate_available() or is_accelerate_version("<", "0.14.0"), + reason="CPU offload is only available with CUDA and `accelerate v0.14.0` or higher", + ) + def test_cpu_offload_forward_pass(self, expected_max_diff=1e-4): + components = self.get_dummy_components() + pipe = self.pipeline_class(**components) + pipe.to(torch_device) + pipe.set_progress_bar_config(disable=None) + + inputs = self.get_dummy_inputs(torch_device) +> output_without_offload = pipe(**inputs)[0] + +../test_pipelines_common.py:688: +_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ +/home/fractal/.local/lib/python3.10/site-packages/torch/utils/_contextlib.py:115: in decorate_context + return func(*args, **kwargs) +_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ + +self = FabricPipeline { + "_class_name": "FabricPipeline", + "_diffusers_version": "0.21.0.dev0", + "scheduler": [ + "diffu... 
], + "unet": [ + "diffusers", + "UNet2DConditionModel" + ], + "vae": [ + "diffusers", + "AutoencoderKL" + ] +} + +prompt = 'A painting of a squirrel eating a burger' +negative_prompt = ['lowres, dark, cropped'], liked = [], disliked = [] +generator = , height = 128 +width = 128, return_dict = True, num_images = 1, guidance_scale = 7.0 +num_inference_steps = 2, output_type = 'np', feedback_start_ratio = 0.33 +feedback_end_ratio = 0.66, min_weight = 0.05, max_weight = 0.8, neg_scale = 0.5 +pos_bottleneck_scale = 1.0, neg_bottleneck_scale = 1.0, latents = None + + @torch.no_grad() + @replace_example_docstring(EXAMPLE_DOC_STRING) + def __call__( + self, + prompt: Optional[Union[str, List[str]]] = "", + negative_prompt: Optional[Union[str, List[str]]] = "lowres, bad anatomy, bad hands, cropped, worst quality", + liked: Optional[Union[List[str], List[Image.Image]]] = [], + disliked: Optional[Union[List[str], List[Image.Image]]] = [], + generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None, + height: int = 512, + width: int = 512, + return_dict: bool = True, + num_images: int = 4, + guidance_scale: float = 7.0, + num_inference_steps: int = 20, + output_type: Optional[str] = "pil", + feedback_start_ratio: float = 0.33, + feedback_end_ratio: float = 0.66, + min_weight: float = 0.05, + max_weight: float = 0.8, + neg_scale: float = 0.5, + pos_bottleneck_scale: float = 1.0, + neg_bottleneck_scale: float = 1.0, + latents: Optional[torch.FloatTensor] = None, + ): + r""" + The call function to the pipeline for generation. Generate a trajectory of images with binary feedback. The + feedback can be given as a list of liked and disliked images. + + Args: + prompt (`str` or `List[str]`, *optional*): + The prompt or prompts to guide image generation. If not defined, you need to pass `prompt_embeds` + instead. + negative_prompt (`str` or `List[str]`, *optional*): + The prompt or prompts not to guide the image generation. If not defined, one has to pass + `negative_prompt_embeds` instead. Ignored when not using guidance (i.e., ignored if `guidance_scale` is + less than `1`). + liked (`List[Image.Image]` or `List[str]`, *optional*): + Encourages images with liked features. + disliked (`List[Image.Image]` or `List[str]`, *optional*): + Discourages images with disliked features. + generator (`torch.Generator` or `List[torch.Generator]` or `int`, *optional*): + A [`torch.Generator`](https://pytorch.org/docs/stable/generated/torch.Generator.html) or an `int` to + make generation deterministic. + height (`int`, *optional*, defaults to 512): + Height of the generated image + width (`int`, *optional*, defaults to 512): + Width of the generated image + num_images (`int`, *optional*, defaults to 4): + The number of images to generate per prompt. + guidance_scale (`float`, *optional*, defaults to 7.0): + A higher guidance scale value encourages the model to generate images closely linked to the text + `prompt` at the expense of lower image quality. Guidance scale is enabled when `guidance_scale > 1`. + num_inference_steps (`int`, *optional*, defaults to 20): + The number of denoising steps. More denoising steps usually lead to a higher quality image at the + expense of slower inference. + output_type (`str`, *optional*, defaults to `"pil"`): + The output format of the generated image. Choose between `PIL.Image` or `np.array`. + return_dict (`bool`, *optional*, defaults to `True`): + Whether or not to return a [`~pipelines.stable_diffusion.StableDiffusionPipelineOutput`] instead of a + plain tuple. 
+ feedback_start_ratio (`float`, *optional*, defaults to `.33`): + Start point for providing feedback (between 0 and 1). + feedback_end_ratio (`float`, *optional*, defaults to `.66`): + End point for providing feedback (between 0 and 1). + min_weight (`float`, *optional*, defaults to `.05`): + Minimum weight for feedback. + max_weight (`float`, *optional*, defults tp `1.0`): + Maximum weight for feedback. + neg_scale (`float`, *optional*, defaults to `.5`): + Scale factor for negative feedback. + + Examples: + + Returns: + [`~pipelines.fabric.FabricPipelineOutput`] or `tuple`: + If `return_dict` is `True`, [`~pipelines.fabric.FabricPipelineOutput`] is returned, otherwise a `tuple` + is returned where the first element is a list with the generated images and the second element is a + list of `bool`s indicating whether the corresponding generated image contains "not-safe-for-work" + (nsfw) content. + + """ + + self.check_inputs(prompt, negative_prompt, liked, disliked) + + device = self._execution_device + dtype = self.unet.dtype + + if isinstance(prompt, str) and prompt is not None: + batch_size = 1 + elif isinstance(prompt, list) and prompt is not None: + batch_size = len(prompt) + else: + raise ValueError(f"`prompt` has to be of type `str` or `list` but is {type(prompt)}") + + if isinstance(negative_prompt, str): + negative_prompt = [negative_prompt] + elif isinstance(negative_prompt, list): + negative_prompt = negative_prompt + else: + assert len(negative_prompt) == batch_size + + shape = ( + batch_size * num_images, + self.unet.config.in_channels, + height // self.vae_scale_factor, + width // self.vae_scale_factor, + ) + latent_noise = randn_tensor( + shape, + device=device, + dtype=dtype, + generator=generator, + ) + + positive_latents = ( + self.preprocess_feedback_images(liked, self.vae, (height, width), device, dtype, generator) + if liked and len(liked) > 0 + else torch.tensor( + [], + device=device, + dtype=dtype, + ) + ) + negative_latents = ( + self.preprocess_feedback_images(disliked, self.vae, (height, width), device, dtype, generator) + if disliked and len(disliked) > 0 + else torch.tensor( + [], + device=device, + dtype=dtype, + ) + ) + + do_classifier_free_guidance = guidance_scale > 0.1 + + prompt_embs = self._encode_prompt( + prompt, + device, + num_images, + do_classifier_free_guidance, + negative_prompt,) + + null_tokens = self.tokenizer( + [""], + return_tensors="pt", + max_length=self.tokenizer.model_max_length, + padding="max_length", + truncation=True, + ) + + if ( + hasattr(self.text_encoder.config, "use_attention_mask") + and self.text_encoder.config.use_attention_mask + ): + attention_mask = prompt_tokens.attention_mask.to(self.device) + else: + attention_mask = None + + null_prompt_emb = self.text_encoder( + input_ids=null_tokens.input_ids.to(self.device), + attention_mask=attention_mask, + ).last_hidden_state + + batched_prompt_embd = torch.cat([prompt_embs], dim=0) + + self.scheduler.set_timesteps(num_inference_steps, device=device) + timesteps = self.scheduler.timesteps + latent_noise = latent_noise * self.scheduler.init_noise_sigma + + num_warmup_steps = len(timesteps) - num_inference_steps * self.scheduler.order + + ref_start_idx = round(len(timesteps) * feedback_start_ratio) + ref_end_idx = round(len(timesteps) * feedback_end_ratio) + + with self.progress_bar(total=num_inference_steps) as pbar: + for i, t in enumerate(timesteps): + sigma = self.scheduler.sigma_t[t] if hasattr(self.scheduler, "sigma_t") else 0 + if hasattr(self.scheduler, "sigmas"): + sigma 
= self.scheduler.sigmas[i] + + alpha_hat = 1 / (sigma**2 + 1) + + z_single = self.scheduler.scale_model_input(latent_noise, t) + z_all = torch.cat([z_single] * 2, dim=0) + z_ref = torch.cat([positive_latents, negative_latents], dim=0) + + if i >= ref_start_idx and i <= ref_end_idx: + weight_factor = max_weight + else: + weight_factor = min_weight + + pos_ws = (weight_factor, weight_factor * pos_bottleneck_scale) + neg_ws = (weight_factor * neg_scale, weight_factor * neg_scale * neg_bottleneck_scale) + + if z_ref.size(0) > 0 and weight_factor > 0: + noise = torch.randn_like(z_ref) + if isinstance(self.scheduler, EulerAncestralDiscreteScheduler): + z_ref_noised = (alpha_hat**0.5 * z_ref + (1 - alpha_hat) ** 0.5 * noise).type(dtype) + else: + z_ref_noised = self.scheduler.add_noise(z_ref, noise, t) + + ref_prompt_embd = torch.cat( + [null_prompt_emb] * (len(positive_latents) + len(negative_latents)), dim=0 + ) + cached_hidden_states = self.get_unet_hidden_states(z_ref_noised, t, ref_prompt_embd) + + n_pos, n_neg = positive_latents.shape[0], negative_latents.shape[0] + cached_pos_hs, cached_neg_hs = [], [] + for hs in cached_hidden_states: + cached_pos, cached_neg = hs.split([n_pos, n_neg], dim=0) + cached_pos = cached_pos.view(1, -1, *cached_pos.shape[2:]).expand(num_images, -1, -1) + cached_neg = cached_neg.view(1, -1, *cached_neg.shape[2:]).expand(num_images, -1, -1) + cached_pos_hs.append(cached_pos) + cached_neg_hs.append(cached_neg) + + if n_pos == 0: + cached_pos_hs = None + if n_neg == 0: + cached_neg_hs = None + else: + cached_pos_hs, cached_neg_hs = None, None +> print("shapehape",cached_pos_hs[0].shape) +E TypeError: 'NoneType' object is not subscriptable + +../../../src/diffusers/pipelines/stable_diffusion/pipeline_fabric.py:717: TypeError +__________ FabricPipelineFastTests.test_dict_tuple_outputs_equivalent __________ + +self = +expected_max_difference = 0.0001 + + def test_dict_tuple_outputs_equivalent(self, expected_max_difference=1e-4): + components = self.get_dummy_components() + pipe = self.pipeline_class(**components) + pipe.to(torch_device) + pipe.set_progress_bar_config(disable=None) + +> output = pipe(**self.get_dummy_inputs(torch_device))[0] + +../test_pipelines_common.py:518: +_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ +/home/fractal/.local/lib/python3.10/site-packages/torch/utils/_contextlib.py:115: in decorate_context + return func(*args, **kwargs) +_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ + +self = FabricPipeline { + "_class_name": "FabricPipeline", + "_diffusers_version": "0.21.0.dev0", + "scheduler": [ + "diffu... 
], + "unet": [ + "diffusers", + "UNet2DConditionModel" + ], + "vae": [ + "diffusers", + "AutoencoderKL" + ] +} + +prompt = 'A painting of a squirrel eating a burger' +negative_prompt = ['lowres, dark, cropped'], liked = [], disliked = [] +generator = , height = 128 +width = 128, return_dict = True, num_images = 1, guidance_scale = 7.0 +num_inference_steps = 2, output_type = 'np', feedback_start_ratio = 0.33 +feedback_end_ratio = 0.66, min_weight = 0.05, max_weight = 0.8, neg_scale = 0.5 +pos_bottleneck_scale = 1.0, neg_bottleneck_scale = 1.0, latents = None + + @torch.no_grad() + @replace_example_docstring(EXAMPLE_DOC_STRING) + def __call__( + self, + prompt: Optional[Union[str, List[str]]] = "", + negative_prompt: Optional[Union[str, List[str]]] = "lowres, bad anatomy, bad hands, cropped, worst quality", + liked: Optional[Union[List[str], List[Image.Image]]] = [], + disliked: Optional[Union[List[str], List[Image.Image]]] = [], + generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None, + height: int = 512, + width: int = 512, + return_dict: bool = True, + num_images: int = 4, + guidance_scale: float = 7.0, + num_inference_steps: int = 20, + output_type: Optional[str] = "pil", + feedback_start_ratio: float = 0.33, + feedback_end_ratio: float = 0.66, + min_weight: float = 0.05, + max_weight: float = 0.8, + neg_scale: float = 0.5, + pos_bottleneck_scale: float = 1.0, + neg_bottleneck_scale: float = 1.0, + latents: Optional[torch.FloatTensor] = None, + ): + r""" + The call function to the pipeline for generation. Generate a trajectory of images with binary feedback. The + feedback can be given as a list of liked and disliked images. + + Args: + prompt (`str` or `List[str]`, *optional*): + The prompt or prompts to guide image generation. If not defined, you need to pass `prompt_embeds` + instead. + negative_prompt (`str` or `List[str]`, *optional*): + The prompt or prompts not to guide the image generation. If not defined, one has to pass + `negative_prompt_embeds` instead. Ignored when not using guidance (i.e., ignored if `guidance_scale` is + less than `1`). + liked (`List[Image.Image]` or `List[str]`, *optional*): + Encourages images with liked features. + disliked (`List[Image.Image]` or `List[str]`, *optional*): + Discourages images with disliked features. + generator (`torch.Generator` or `List[torch.Generator]` or `int`, *optional*): + A [`torch.Generator`](https://pytorch.org/docs/stable/generated/torch.Generator.html) or an `int` to + make generation deterministic. + height (`int`, *optional*, defaults to 512): + Height of the generated image + width (`int`, *optional*, defaults to 512): + Width of the generated image + num_images (`int`, *optional*, defaults to 4): + The number of images to generate per prompt. + guidance_scale (`float`, *optional*, defaults to 7.0): + A higher guidance scale value encourages the model to generate images closely linked to the text + `prompt` at the expense of lower image quality. Guidance scale is enabled when `guidance_scale > 1`. + num_inference_steps (`int`, *optional*, defaults to 20): + The number of denoising steps. More denoising steps usually lead to a higher quality image at the + expense of slower inference. + output_type (`str`, *optional*, defaults to `"pil"`): + The output format of the generated image. Choose between `PIL.Image` or `np.array`. + return_dict (`bool`, *optional*, defaults to `True`): + Whether or not to return a [`~pipelines.stable_diffusion.StableDiffusionPipelineOutput`] instead of a + plain tuple. 
+ feedback_start_ratio (`float`, *optional*, defaults to `.33`): + Start point for providing feedback (between 0 and 1). + feedback_end_ratio (`float`, *optional*, defaults to `.66`): + End point for providing feedback (between 0 and 1). + min_weight (`float`, *optional*, defaults to `.05`): + Minimum weight for feedback. + max_weight (`float`, *optional*, defults tp `1.0`): + Maximum weight for feedback. + neg_scale (`float`, *optional*, defaults to `.5`): + Scale factor for negative feedback. + + Examples: + + Returns: + [`~pipelines.fabric.FabricPipelineOutput`] or `tuple`: + If `return_dict` is `True`, [`~pipelines.fabric.FabricPipelineOutput`] is returned, otherwise a `tuple` + is returned where the first element is a list with the generated images and the second element is a + list of `bool`s indicating whether the corresponding generated image contains "not-safe-for-work" + (nsfw) content. + + """ + + self.check_inputs(prompt, negative_prompt, liked, disliked) + + device = self._execution_device + dtype = self.unet.dtype + + if isinstance(prompt, str) and prompt is not None: + batch_size = 1 + elif isinstance(prompt, list) and prompt is not None: + batch_size = len(prompt) + else: + raise ValueError(f"`prompt` has to be of type `str` or `list` but is {type(prompt)}") + + if isinstance(negative_prompt, str): + negative_prompt = [negative_prompt] + elif isinstance(negative_prompt, list): + negative_prompt = negative_prompt + else: + assert len(negative_prompt) == batch_size + + shape = ( + batch_size * num_images, + self.unet.config.in_channels, + height // self.vae_scale_factor, + width // self.vae_scale_factor, + ) + latent_noise = randn_tensor( + shape, + device=device, + dtype=dtype, + generator=generator, + ) + + positive_latents = ( + self.preprocess_feedback_images(liked, self.vae, (height, width), device, dtype, generator) + if liked and len(liked) > 0 + else torch.tensor( + [], + device=device, + dtype=dtype, + ) + ) + negative_latents = ( + self.preprocess_feedback_images(disliked, self.vae, (height, width), device, dtype, generator) + if disliked and len(disliked) > 0 + else torch.tensor( + [], + device=device, + dtype=dtype, + ) + ) + + do_classifier_free_guidance = guidance_scale > 0.1 + + prompt_embs = self._encode_prompt( + prompt, + device, + num_images, + do_classifier_free_guidance, + negative_prompt,) + + null_tokens = self.tokenizer( + [""], + return_tensors="pt", + max_length=self.tokenizer.model_max_length, + padding="max_length", + truncation=True, + ) + + if ( + hasattr(self.text_encoder.config, "use_attention_mask") + and self.text_encoder.config.use_attention_mask + ): + attention_mask = prompt_tokens.attention_mask.to(self.device) + else: + attention_mask = None + + null_prompt_emb = self.text_encoder( + input_ids=null_tokens.input_ids.to(self.device), + attention_mask=attention_mask, + ).last_hidden_state + + batched_prompt_embd = torch.cat([prompt_embs], dim=0) + + self.scheduler.set_timesteps(num_inference_steps, device=device) + timesteps = self.scheduler.timesteps + latent_noise = latent_noise * self.scheduler.init_noise_sigma + + num_warmup_steps = len(timesteps) - num_inference_steps * self.scheduler.order + + ref_start_idx = round(len(timesteps) * feedback_start_ratio) + ref_end_idx = round(len(timesteps) * feedback_end_ratio) + + with self.progress_bar(total=num_inference_steps) as pbar: + for i, t in enumerate(timesteps): + sigma = self.scheduler.sigma_t[t] if hasattr(self.scheduler, "sigma_t") else 0 + if hasattr(self.scheduler, "sigmas"): + sigma 
= self.scheduler.sigmas[i] + + alpha_hat = 1 / (sigma**2 + 1) + + z_single = self.scheduler.scale_model_input(latent_noise, t) + z_all = torch.cat([z_single] * 2, dim=0) + z_ref = torch.cat([positive_latents, negative_latents], dim=0) + + if i >= ref_start_idx and i <= ref_end_idx: + weight_factor = max_weight + else: + weight_factor = min_weight + + pos_ws = (weight_factor, weight_factor * pos_bottleneck_scale) + neg_ws = (weight_factor * neg_scale, weight_factor * neg_scale * neg_bottleneck_scale) + + if z_ref.size(0) > 0 and weight_factor > 0: + noise = torch.randn_like(z_ref) + if isinstance(self.scheduler, EulerAncestralDiscreteScheduler): + z_ref_noised = (alpha_hat**0.5 * z_ref + (1 - alpha_hat) ** 0.5 * noise).type(dtype) + else: + z_ref_noised = self.scheduler.add_noise(z_ref, noise, t) + + ref_prompt_embd = torch.cat( + [null_prompt_emb] * (len(positive_latents) + len(negative_latents)), dim=0 + ) + cached_hidden_states = self.get_unet_hidden_states(z_ref_noised, t, ref_prompt_embd) + + n_pos, n_neg = positive_latents.shape[0], negative_latents.shape[0] + cached_pos_hs, cached_neg_hs = [], [] + for hs in cached_hidden_states: + cached_pos, cached_neg = hs.split([n_pos, n_neg], dim=0) + cached_pos = cached_pos.view(1, -1, *cached_pos.shape[2:]).expand(num_images, -1, -1) + cached_neg = cached_neg.view(1, -1, *cached_neg.shape[2:]).expand(num_images, -1, -1) + cached_pos_hs.append(cached_pos) + cached_neg_hs.append(cached_neg) + + if n_pos == 0: + cached_pos_hs = None + if n_neg == 0: + cached_neg_hs = None + else: + cached_pos_hs, cached_neg_hs = None, None +> print("shapehape",cached_pos_hs[0].shape) +E TypeError: 'NoneType' object is not subscriptable + +../../../src/diffusers/pipelines/stable_diffusion/pipeline_fabric.py:717: TypeError +_____________________ FabricPipelineFastTests.test_fabric ______________________ + +self = + + def test_fabric(self): + device = "cpu" # ensure determinism for the device-dependent torch.Generator + + components = self.get_dummy_components() + pipe = FabricPipeline(**components) + pipe = pipe.to(device) + + pipe.set_progress_bar_config(disable=True) + + inputs = self.get_dummy_inputs(device) +> output = pipe(**inputs) + +test_fabric.py:135: +_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ +/home/fractal/.local/lib/python3.10/site-packages/torch/utils/_contextlib.py:115: in decorate_context + return func(*args, **kwargs) +_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ + +self = FabricPipeline { + "_class_name": "FabricPipeline", + "_diffusers_version": "0.21.0.dev0", + "scheduler": [ + "diffu... 
], + "unet": [ + "diffusers", + "UNet2DConditionModel" + ], + "vae": [ + "diffusers", + "AutoencoderKL" + ] +} + +prompt = 'A painting of a squirrel eating a burger' +negative_prompt = ['lowres, dark, cropped'], liked = [], disliked = [] +generator = , height = 128 +width = 128, return_dict = True, num_images = 1, guidance_scale = 7.0 +num_inference_steps = 2, output_type = 'np', feedback_start_ratio = 0.33 +feedback_end_ratio = 0.66, min_weight = 0.05, max_weight = 0.8, neg_scale = 0.5 +pos_bottleneck_scale = 1.0, neg_bottleneck_scale = 1.0, latents = None + + @torch.no_grad() + @replace_example_docstring(EXAMPLE_DOC_STRING) + def __call__( + self, + prompt: Optional[Union[str, List[str]]] = "", + negative_prompt: Optional[Union[str, List[str]]] = "lowres, bad anatomy, bad hands, cropped, worst quality", + liked: Optional[Union[List[str], List[Image.Image]]] = [], + disliked: Optional[Union[List[str], List[Image.Image]]] = [], + generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None, + height: int = 512, + width: int = 512, + return_dict: bool = True, + num_images: int = 4, + guidance_scale: float = 7.0, + num_inference_steps: int = 20, + output_type: Optional[str] = "pil", + feedback_start_ratio: float = 0.33, + feedback_end_ratio: float = 0.66, + min_weight: float = 0.05, + max_weight: float = 0.8, + neg_scale: float = 0.5, + pos_bottleneck_scale: float = 1.0, + neg_bottleneck_scale: float = 1.0, + latents: Optional[torch.FloatTensor] = None, + ): + r""" + The call function to the pipeline for generation. Generate a trajectory of images with binary feedback. The + feedback can be given as a list of liked and disliked images. + + Args: + prompt (`str` or `List[str]`, *optional*): + The prompt or prompts to guide image generation. If not defined, you need to pass `prompt_embeds` + instead. + negative_prompt (`str` or `List[str]`, *optional*): + The prompt or prompts not to guide the image generation. If not defined, one has to pass + `negative_prompt_embeds` instead. Ignored when not using guidance (i.e., ignored if `guidance_scale` is + less than `1`). + liked (`List[Image.Image]` or `List[str]`, *optional*): + Encourages images with liked features. + disliked (`List[Image.Image]` or `List[str]`, *optional*): + Discourages images with disliked features. + generator (`torch.Generator` or `List[torch.Generator]` or `int`, *optional*): + A [`torch.Generator`](https://pytorch.org/docs/stable/generated/torch.Generator.html) or an `int` to + make generation deterministic. + height (`int`, *optional*, defaults to 512): + Height of the generated image + width (`int`, *optional*, defaults to 512): + Width of the generated image + num_images (`int`, *optional*, defaults to 4): + The number of images to generate per prompt. + guidance_scale (`float`, *optional*, defaults to 7.0): + A higher guidance scale value encourages the model to generate images closely linked to the text + `prompt` at the expense of lower image quality. Guidance scale is enabled when `guidance_scale > 1`. + num_inference_steps (`int`, *optional*, defaults to 20): + The number of denoising steps. More denoising steps usually lead to a higher quality image at the + expense of slower inference. + output_type (`str`, *optional*, defaults to `"pil"`): + The output format of the generated image. Choose between `PIL.Image` or `np.array`. + return_dict (`bool`, *optional*, defaults to `True`): + Whether or not to return a [`~pipelines.stable_diffusion.StableDiffusionPipelineOutput`] instead of a + plain tuple. 
+ feedback_start_ratio (`float`, *optional*, defaults to `.33`): + Start point for providing feedback (between 0 and 1). + feedback_end_ratio (`float`, *optional*, defaults to `.66`): + End point for providing feedback (between 0 and 1). + min_weight (`float`, *optional*, defaults to `.05`): + Minimum weight for feedback. + max_weight (`float`, *optional*, defults tp `1.0`): + Maximum weight for feedback. + neg_scale (`float`, *optional*, defaults to `.5`): + Scale factor for negative feedback. + + Examples: + + Returns: + [`~pipelines.fabric.FabricPipelineOutput`] or `tuple`: + If `return_dict` is `True`, [`~pipelines.fabric.FabricPipelineOutput`] is returned, otherwise a `tuple` + is returned where the first element is a list with the generated images and the second element is a + list of `bool`s indicating whether the corresponding generated image contains "not-safe-for-work" + (nsfw) content. + + """ + + self.check_inputs(prompt, negative_prompt, liked, disliked) + + device = self._execution_device + dtype = self.unet.dtype + + if isinstance(prompt, str) and prompt is not None: + batch_size = 1 + elif isinstance(prompt, list) and prompt is not None: + batch_size = len(prompt) + else: + raise ValueError(f"`prompt` has to be of type `str` or `list` but is {type(prompt)}") + + if isinstance(negative_prompt, str): + negative_prompt = [negative_prompt] + elif isinstance(negative_prompt, list): + negative_prompt = negative_prompt + else: + assert len(negative_prompt) == batch_size + + shape = ( + batch_size * num_images, + self.unet.config.in_channels, + height // self.vae_scale_factor, + width // self.vae_scale_factor, + ) + latent_noise = randn_tensor( + shape, + device=device, + dtype=dtype, + generator=generator, + ) + + positive_latents = ( + self.preprocess_feedback_images(liked, self.vae, (height, width), device, dtype, generator) + if liked and len(liked) > 0 + else torch.tensor( + [], + device=device, + dtype=dtype, + ) + ) + negative_latents = ( + self.preprocess_feedback_images(disliked, self.vae, (height, width), device, dtype, generator) + if disliked and len(disliked) > 0 + else torch.tensor( + [], + device=device, + dtype=dtype, + ) + ) + + do_classifier_free_guidance = guidance_scale > 0.1 + + prompt_embs = self._encode_prompt( + prompt, + device, + num_images, + do_classifier_free_guidance, + negative_prompt,) + + null_tokens = self.tokenizer( + [""], + return_tensors="pt", + max_length=self.tokenizer.model_max_length, + padding="max_length", + truncation=True, + ) + + if ( + hasattr(self.text_encoder.config, "use_attention_mask") + and self.text_encoder.config.use_attention_mask + ): + attention_mask = prompt_tokens.attention_mask.to(self.device) + else: + attention_mask = None + + null_prompt_emb = self.text_encoder( + input_ids=null_tokens.input_ids.to(self.device), + attention_mask=attention_mask, + ).last_hidden_state + + batched_prompt_embd = torch.cat([prompt_embs], dim=0) + + self.scheduler.set_timesteps(num_inference_steps, device=device) + timesteps = self.scheduler.timesteps + latent_noise = latent_noise * self.scheduler.init_noise_sigma + + num_warmup_steps = len(timesteps) - num_inference_steps * self.scheduler.order + + ref_start_idx = round(len(timesteps) * feedback_start_ratio) + ref_end_idx = round(len(timesteps) * feedback_end_ratio) + + with self.progress_bar(total=num_inference_steps) as pbar: + for i, t in enumerate(timesteps): + sigma = self.scheduler.sigma_t[t] if hasattr(self.scheduler, "sigma_t") else 0 + if hasattr(self.scheduler, "sigmas"): + sigma 
= self.scheduler.sigmas[i] + + alpha_hat = 1 / (sigma**2 + 1) + + z_single = self.scheduler.scale_model_input(latent_noise, t) + z_all = torch.cat([z_single] * 2, dim=0) + z_ref = torch.cat([positive_latents, negative_latents], dim=0) + + if i >= ref_start_idx and i <= ref_end_idx: + weight_factor = max_weight + else: + weight_factor = min_weight + + pos_ws = (weight_factor, weight_factor * pos_bottleneck_scale) + neg_ws = (weight_factor * neg_scale, weight_factor * neg_scale * neg_bottleneck_scale) + + if z_ref.size(0) > 0 and weight_factor > 0: + noise = torch.randn_like(z_ref) + if isinstance(self.scheduler, EulerAncestralDiscreteScheduler): + z_ref_noised = (alpha_hat**0.5 * z_ref + (1 - alpha_hat) ** 0.5 * noise).type(dtype) + else: + z_ref_noised = self.scheduler.add_noise(z_ref, noise, t) + + ref_prompt_embd = torch.cat( + [null_prompt_emb] * (len(positive_latents) + len(negative_latents)), dim=0 + ) + cached_hidden_states = self.get_unet_hidden_states(z_ref_noised, t, ref_prompt_embd) + + n_pos, n_neg = positive_latents.shape[0], negative_latents.shape[0] + cached_pos_hs, cached_neg_hs = [], [] + for hs in cached_hidden_states: + cached_pos, cached_neg = hs.split([n_pos, n_neg], dim=0) + cached_pos = cached_pos.view(1, -1, *cached_pos.shape[2:]).expand(num_images, -1, -1) + cached_neg = cached_neg.view(1, -1, *cached_neg.shape[2:]).expand(num_images, -1, -1) + cached_pos_hs.append(cached_pos) + cached_neg_hs.append(cached_neg) + + if n_pos == 0: + cached_pos_hs = None + if n_neg == 0: + cached_neg_hs = None + else: + cached_pos_hs, cached_neg_hs = None, None +> print("shapehape",cached_pos_hs[0].shape) +E TypeError: 'NoneType' object is not subscriptable + +../../../src/diffusers/pipelines/stable_diffusion/pipeline_fabric.py:717: TypeError +=============================== warnings summary =============================== +../../../../../.local/lib/python3.10/site-packages/accelerate/utils/dataclasses.py:29 + /home/fractal/.local/lib/python3.10/site-packages/accelerate/utils/dataclasses.py:29: DeprecationWarning: The distutils package is deprecated and slated for removal in Python 3.12. Use setuptools or check PEP 632 for potential alternatives + from distutils.util import strtobool + +-- Docs: https://docs.pytest.org/en/stable/how-to/capture-warnings.html +=========================== short test summary info ============================ +FAILED test_fabric.py::FabricPipelineFastTests::test_attention_slicing_forward_pass +FAILED test_fabric.py::FabricPipelineFastTests::test_cfg - TypeError: 'NoneTy... +FAILED test_fabric.py::FabricPipelineFastTests::test_cpu_offload_forward_pass +FAILED test_fabric.py::FabricPipelineFastTests::test_dict_tuple_outputs_equivalent +FAILED test_fabric.py::FabricPipelineFastTests::test_fabric - TypeError: 'Non... +!!!!!!!!!!!!!!!!!!!!!!!!!!!!!! KeyboardInterrupt !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!! 
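All five failures above share one root cause: the leftover debug statement `print("shapehape", cached_pos_hs[0].shape)` at `pipeline_fabric.py:717` runs even when `cached_pos_hs` has just been set to `None`, which is always the case in these fast tests because no `liked`/`disliked` feedback images are passed. Below is a minimal sketch of the kind of guard that avoids the `TypeError`; the helper name is hypothetical and this is not the patch's actual code — presumably the stray print is simply removed in a later commit.

```python
from typing import List, Optional

import torch


def debug_cached_feedback_shapes(
    cached_pos_hs: Optional[List[torch.Tensor]],
    cached_neg_hs: Optional[List[torch.Tensor]],
) -> None:
    # Both lists are set to None when no liked/disliked feedback images are given
    # (exactly the situation in the failing fast tests), so guard before indexing.
    if cached_pos_hs:
        print("first cached positive hidden state:", cached_pos_hs[0].shape)
    if cached_neg_hs:
        print("first cached negative hidden state:", cached_neg_hs[0].shape)
```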
+/home/fractal/.local/lib/python3.10/site-packages/urllib3/connection.py:174: KeyboardInterrupt +(to show a full traceback on KeyboardInterrupt use --full-trace) +============== 5 failed, 1 passed, 1 warning in 78.87s (0:01:18) =============== diff --git a/tests/pipelines/stable_diffusion/test_fabric.py b/tests/pipelines/stable_diffusion/test_fabric.py index 6b3bc798ec4d..7d0b94c81dd0 100644 --- a/tests/pipelines/stable_diffusion/test_fabric.py +++ b/tests/pipelines/stable_diffusion/test_fabric.py @@ -168,6 +168,32 @@ def test_fabric_w_fb(self): self.assertTrue(np.allclose(image_slice.flatten(), expected_slice, atol=1e-2)) + def test_monkey_patching(self): + # Create a sample model and module + device = "cpu" + torch.manual_seed(0) + z_all = torch.randn(2, 4, 64, 64) + t = 0 + prompt_embd = torch.randn(2, 77, 32) + cached_pos_hiddens = [torch.randn(1, 1024, 64)] * 6 + print(len(cached_pos_hiddens)) + self.get_dummy_inputs(device) + print(cached_pos_hiddens[0].shape) + # out = model.unet(z_all, t, encoder_hidden_states=prompt_embd) + components = self.get_dummy_components() + pipe = FabricPipeline(**components) + pipe = pipe.to(device) + pipeline = pipe.unet_forward_with_cached_hidden_states( + z_all, t, prompt_embd, cached_pos_hiddens=cached_pos_hiddens + )[0] + + image_slice = pipeline[0, -3:, -3:, -1].detach().numpy() + expected_slice = np.array( + [[-0.0590, 0.3149, -0.1035], [0.0016, -0.1665, 0.1026], [-0.0626, 0.0607, -0.1045]] + ).flatten() + + self.assertTrue(np.allclose(image_slice.flatten(), expected_slice, atol=1e-2)) + @nightly @require_torch_gpu From a75c41291d17c32123c2672450ffa17dad860c86 Mon Sep 17 00:00:00 2001 From: shauray8 Date: Wed, 23 Aug 2023 23:18:15 +0530 Subject: [PATCH 91/98] more docs --- docs/source/en/api/pipelines/fabric.md | 4 ++-- .../stable_diffusion/pipeline_fabric.py | 21 +++++++++---------- 2 files changed, 12 insertions(+), 13 deletions(-) diff --git a/docs/source/en/api/pipelines/fabric.md b/docs/source/en/api/pipelines/fabric.md index 9478a9bcc720..99ddf1b68c5f 100644 --- a/docs/source/en/api/pipelines/fabric.md +++ b/docs/source/en/api/pipelines/fabric.md @@ -12,7 +12,7 @@ specific language governing permissions and limitations under the License. # FABRIC -[FABRIC: Personalizing Diffusion Models with Iterative Feedback](https://huggingface.co/papers/2307.10159) (FABRIC) is by Dimitri von Rütte, Elisabetta Fedele, Jonathan Thomm and Lukas Wolf +[FABRIC: Personalizing Diffusion Models with Iterative Feedback](https://huggingface.co/papers/2307.10159) (FABRIC) is by Dimitri von Rütte, Elisabetta Fedele, Jonathan Thomm and Lukas Wolf. FABRIC is a training-free approach that conditions the diffusion process on a set of feedback images, applicable to a wide range of popular diffusion models. It is created by researchers and engineers from [ETH Zürich, Switzerland](https://github.com/sd-fabric). The [`FabricPipeline`] can generate photo-realistic images given any text input using Stable Diffusion. @@ -20,7 +20,7 @@ The abstract from the paper is: *In an era where visual content generation is increasingly driven by machine learning, the integration of human feedback into generative models presents significant opportunities for enhancing user experience and output quality. This study explores strategies for incorporating iterative human feedback into the generative process of diffusion-based text-to-image models. 
We propose FABRIC, a training-free approach applicable to a wide range of popular diffusion models, which exploits the self-attention layer present in the most widely used architectures to condition the diffusion process on a set of feedback images. To ensure a rigorous assessment of our approach, we introduce a comprehensive evaluation methodology, offering a robust mechanism to quantify the performance of generative visual models that integrate human feedback. We show that generation results improve over multiple rounds of iterative feedback through exhaustive analysis, implicitly optimizing arbitrary user preferences. The potential applications of these findings extend to fields such as personalized content creation and customization* -The original codebase can be found at [sd-fabric/fabric](https://github.com/sd-fabric/fabric), and available checkpoints are [dreamlike-art/dreamlike-photoreal-2.0](https://huggingface.co/dreamlike-art/dreamlike-photoreal-2.0), [runwayml/stable-diffusion-v1-5](https://huggingface.co/runwayml/stable-diffusion-v1-5) and [stabilityai/stable-diffusion-2-1](https://huggingface.co/stabilityai/stable-diffusion-2-1) (may give unexpected results) +The original codebase can be found at [sd-fabric/fabric](https://github.com/sd-fabric/fabric), and available checkpoints are [dreamlike-art/dreamlike-photoreal-2.0](https://huggingface.co/dreamlike-art/dreamlike-photoreal-2.0), [runwayml/stable-diffusion-v1-5](https://huggingface.co/runwayml/stable-diffusion-v1-5), and [stabilityai/stable-diffusion-2-1](https://huggingface.co/stabilityai/stable-diffusion-2-1) (may give unexpected results). Let's have a look at the images (*512X512*) diff --git a/src/diffusers/pipelines/stable_diffusion/pipeline_fabric.py b/src/diffusers/pipelines/stable_diffusion/pipeline_fabric.py index 535f403137ca..a78d38852a3d 100644 --- a/src/diffusers/pipelines/stable_diffusion/pipeline_fabric.py +++ b/src/diffusers/pipelines/stable_diffusion/pipeline_fabric.py @@ -47,8 +47,8 @@ >>> pipe = FabricPipeline(model_id, torch_dtype=torch.float16) >>> pipe = pipe.to("cuda") >>> prompt = "a giant standing in a fantasy landscape best quality" - >>> liked = [] - >>> disliked = [] + >>> liked = [] # list of images for positive feedback + >>> disliked = [] # list of images for negative feedback >>> image = pipe(prompt, num_images=4, liked=liked, disliked=disliked).images[0] ``` """ @@ -523,9 +523,8 @@ def __call__( The prompt or prompts to guide image generation. If not defined, you need to pass `prompt_embeds` instead. negative_prompt (`str` or `List[str]`, *optional*): - The prompt or prompts not to guide the image generation. If not defined, one has to pass - `negative_prompt_embeds` instead. Ignored when not using guidance (i.e., ignored if `guidance_scale` is - less than `1`). + The prompt or prompts to guide what to not include in image generation. If not defined, you need to + pass `negative_prompt_embeds` instead. Ignored when not using guidance (`guidance_scale < 1`). liked (`List[Image.Image]` or `List[str]`, *optional*): Encourages images with liked features. disliked (`List[Image.Image]` or `List[str]`, *optional*): @@ -534,9 +533,9 @@ def __call__( A [`torch.Generator`](https://pytorch.org/docs/stable/generated/torch.Generator.html) or an `int` to make generation deterministic. height (`int`, *optional*, defaults to 512): - Height of the generated image + Height of the generated image. width (`int`, *optional*, defaults to 512): - Width of the generated image + Width of the generated image. 
num_images (`int`, *optional*, defaults to 4): The number of images to generate per prompt. guidance_scale (`float`, *optional*, defaults to 7.0): @@ -565,10 +564,10 @@ def __call__( Returns: [`~pipelines.fabric.FabricPipelineOutput`] or `tuple`: - If `return_dict` is `True`, [`~pipelines.fabric.FabricPipelineOutput`] is returned, otherwise a `tuple` - is returned where the first element is a list with the generated images and the second element is a - list of `bool`s indicating whether the corresponding generated image contains "not-safe-for-work" - (nsfw) content. + If `return_dict` is `True`, [`~pipelines.stable_diffusion.StableDiffusionPipelineOutput`] is returned, + otherwise a `tuple` is returned where the first element is a list with the generated images and the + second element is a list of `bool`s indicating whether the corresponding generated image contains + "not-safe-for-work" (nsfw) content. """ From 40df49aa000509178e6474542ea6804df87481cf Mon Sep 17 00:00:00 2001 From: shauray8 Date: Fri, 25 Aug 2023 11:10:04 +0530 Subject: [PATCH 92/98] changes --- docs/source/en/api/pipelines/fabric.md | 2 +- .../using-diffusers/controlling_generation.md | 2 +- tests/pipelines/stable_diffusion/ff | 1377 ----------------- .../pipelines/stable_diffusion/test_fabric.py | 47 - 4 files changed, 2 insertions(+), 1426 deletions(-) delete mode 100644 tests/pipelines/stable_diffusion/ff diff --git a/docs/source/en/api/pipelines/fabric.md b/docs/source/en/api/pipelines/fabric.md index 99ddf1b68c5f..54c8853f4b07 100644 --- a/docs/source/en/api/pipelines/fabric.md +++ b/docs/source/en/api/pipelines/fabric.md @@ -26,7 +26,7 @@ Let's have a look at the images (*512X512*) | Without Feedback | With Feedback (1st image) | |---------------------|---------------------| -| ![Image 1](https://drive.google.com/uc?export=view&id=12wxbikt7834eRTK40legR5PtJmFLNH34) | ![Feedback Image 1](https://drive.google.com/uc?export=view&id=1YcFPDHSRr2OE3hy-5lvr8An21Jum85D5) | +| ![Image 1](https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/fabric_wo_feedback.jpg) | ![Feedback Image 1](https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/fabric_w_feedback.png) | diff --git a/docs/source/en/using-diffusers/controlling_generation.md b/docs/source/en/using-diffusers/controlling_generation.md index b2dded7826c9..25e4b0d699d3 100644 --- a/docs/source/en/using-diffusers/controlling_generation.md +++ b/docs/source/en/using-diffusers/controlling_generation.md @@ -62,7 +62,7 @@ For convenience, we provide a table to denote which methods are inference-only a | [Model Editing](#model-editing) | ✅ | ❌ | | | [DiffEdit](#diffedit) | ✅ | ❌ | | | [T2I-Adapter](#t2i-adapter) | ✅ | ❌ | | -| [Fabric](#fabric) | ✅ | ❌ | | +| [Fabric](#fabric) | ✅ | ❌ | | ## Instruct Pix2Pix [Paper](https://arxiv.org/abs/2211.09800) diff --git a/tests/pipelines/stable_diffusion/ff b/tests/pipelines/stable_diffusion/ff deleted file mode 100644 index 90517feb020b..000000000000 --- a/tests/pipelines/stable_diffusion/ff +++ /dev/null @@ -1,1377 +0,0 @@ -============================= test session starts ============================== -platform linux -- Python 3.10.6, pytest-7.4.0, pluggy-1.2.0 -rootdir: /home/fractal/files/diffusers -plugins: hydra-core-1.3.2, xdist-3.3.1, anyio-3.7.1 -collected 22 items - -test_fabric.py FF.FFF - -=================================== FAILURES =================================== -_________ FabricPipelineFastTests.test_attention_slicing_forward_pass __________ - 
-self = -expected_max_diff = 0.001 - - def test_attention_slicing_forward_pass(self, expected_max_diff=1e-3): -> self._test_attention_slicing_forward_pass(expected_max_diff=expected_max_diff) - -../test_pipelines_common.py:650: -_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ -../test_pipelines_common.py:664: in _test_attention_slicing_forward_pass - output_without_slicing = pipe(**inputs)[0] -/home/fractal/.local/lib/python3.10/site-packages/torch/utils/_contextlib.py:115: in decorate_context - return func(*args, **kwargs) -_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ - -self = FabricPipeline { - "_class_name": "FabricPipeline", - "_diffusers_version": "0.21.0.dev0", - "scheduler": [ - "diffu... ], - "unet": [ - "diffusers", - "UNet2DConditionModel" - ], - "vae": [ - "diffusers", - "AutoencoderKL" - ] -} - -prompt = 'A painting of a squirrel eating a burger' -negative_prompt = ['lowres, dark, cropped'], liked = [], disliked = [] -generator = , height = 128 -width = 128, return_dict = True, num_images = 1, guidance_scale = 7.0 -num_inference_steps = 2, output_type = 'np', feedback_start_ratio = 0.33 -feedback_end_ratio = 0.66, min_weight = 0.05, max_weight = 0.8, neg_scale = 0.5 -pos_bottleneck_scale = 1.0, neg_bottleneck_scale = 1.0, latents = None - - @torch.no_grad() - @replace_example_docstring(EXAMPLE_DOC_STRING) - def __call__( - self, - prompt: Optional[Union[str, List[str]]] = "", - negative_prompt: Optional[Union[str, List[str]]] = "lowres, bad anatomy, bad hands, cropped, worst quality", - liked: Optional[Union[List[str], List[Image.Image]]] = [], - disliked: Optional[Union[List[str], List[Image.Image]]] = [], - generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None, - height: int = 512, - width: int = 512, - return_dict: bool = True, - num_images: int = 4, - guidance_scale: float = 7.0, - num_inference_steps: int = 20, - output_type: Optional[str] = "pil", - feedback_start_ratio: float = 0.33, - feedback_end_ratio: float = 0.66, - min_weight: float = 0.05, - max_weight: float = 0.8, - neg_scale: float = 0.5, - pos_bottleneck_scale: float = 1.0, - neg_bottleneck_scale: float = 1.0, - latents: Optional[torch.FloatTensor] = None, - ): - r""" - The call function to the pipeline for generation. Generate a trajectory of images with binary feedback. The - feedback can be given as a list of liked and disliked images. - - Args: - prompt (`str` or `List[str]`, *optional*): - The prompt or prompts to guide image generation. If not defined, you need to pass `prompt_embeds` - instead. - negative_prompt (`str` or `List[str]`, *optional*): - The prompt or prompts not to guide the image generation. If not defined, one has to pass - `negative_prompt_embeds` instead. Ignored when not using guidance (i.e., ignored if `guidance_scale` is - less than `1`). - liked (`List[Image.Image]` or `List[str]`, *optional*): - Encourages images with liked features. - disliked (`List[Image.Image]` or `List[str]`, *optional*): - Discourages images with disliked features. - generator (`torch.Generator` or `List[torch.Generator]` or `int`, *optional*): - A [`torch.Generator`](https://pytorch.org/docs/stable/generated/torch.Generator.html) or an `int` to - make generation deterministic. 
- height (`int`, *optional*, defaults to 512): - Height of the generated image - width (`int`, *optional*, defaults to 512): - Width of the generated image - num_images (`int`, *optional*, defaults to 4): - The number of images to generate per prompt. - guidance_scale (`float`, *optional*, defaults to 7.0): - A higher guidance scale value encourages the model to generate images closely linked to the text - `prompt` at the expense of lower image quality. Guidance scale is enabled when `guidance_scale > 1`. - num_inference_steps (`int`, *optional*, defaults to 20): - The number of denoising steps. More denoising steps usually lead to a higher quality image at the - expense of slower inference. - output_type (`str`, *optional*, defaults to `"pil"`): - The output format of the generated image. Choose between `PIL.Image` or `np.array`. - return_dict (`bool`, *optional*, defaults to `True`): - Whether or not to return a [`~pipelines.stable_diffusion.StableDiffusionPipelineOutput`] instead of a - plain tuple. - feedback_start_ratio (`float`, *optional*, defaults to `.33`): - Start point for providing feedback (between 0 and 1). - feedback_end_ratio (`float`, *optional*, defaults to `.66`): - End point for providing feedback (between 0 and 1). - min_weight (`float`, *optional*, defaults to `.05`): - Minimum weight for feedback. - max_weight (`float`, *optional*, defults tp `1.0`): - Maximum weight for feedback. - neg_scale (`float`, *optional*, defaults to `.5`): - Scale factor for negative feedback. - - Examples: - - Returns: - [`~pipelines.fabric.FabricPipelineOutput`] or `tuple`: - If `return_dict` is `True`, [`~pipelines.fabric.FabricPipelineOutput`] is returned, otherwise a `tuple` - is returned where the first element is a list with the generated images and the second element is a - list of `bool`s indicating whether the corresponding generated image contains "not-safe-for-work" - (nsfw) content. 
- - """ - - self.check_inputs(prompt, negative_prompt, liked, disliked) - - device = self._execution_device - dtype = self.unet.dtype - - if isinstance(prompt, str) and prompt is not None: - batch_size = 1 - elif isinstance(prompt, list) and prompt is not None: - batch_size = len(prompt) - else: - raise ValueError(f"`prompt` has to be of type `str` or `list` but is {type(prompt)}") - - if isinstance(negative_prompt, str): - negative_prompt = [negative_prompt] - elif isinstance(negative_prompt, list): - negative_prompt = negative_prompt - else: - assert len(negative_prompt) == batch_size - - shape = ( - batch_size * num_images, - self.unet.config.in_channels, - height // self.vae_scale_factor, - width // self.vae_scale_factor, - ) - latent_noise = randn_tensor( - shape, - device=device, - dtype=dtype, - generator=generator, - ) - - positive_latents = ( - self.preprocess_feedback_images(liked, self.vae, (height, width), device, dtype, generator) - if liked and len(liked) > 0 - else torch.tensor( - [], - device=device, - dtype=dtype, - ) - ) - negative_latents = ( - self.preprocess_feedback_images(disliked, self.vae, (height, width), device, dtype, generator) - if disliked and len(disliked) > 0 - else torch.tensor( - [], - device=device, - dtype=dtype, - ) - ) - - do_classifier_free_guidance = guidance_scale > 0.1 - - prompt_embs = self._encode_prompt( - prompt, - device, - num_images, - do_classifier_free_guidance, - negative_prompt,) - - null_tokens = self.tokenizer( - [""], - return_tensors="pt", - max_length=self.tokenizer.model_max_length, - padding="max_length", - truncation=True, - ) - - if ( - hasattr(self.text_encoder.config, "use_attention_mask") - and self.text_encoder.config.use_attention_mask - ): - attention_mask = prompt_tokens.attention_mask.to(self.device) - else: - attention_mask = None - - null_prompt_emb = self.text_encoder( - input_ids=null_tokens.input_ids.to(self.device), - attention_mask=attention_mask, - ).last_hidden_state - - batched_prompt_embd = torch.cat([prompt_embs], dim=0) - - self.scheduler.set_timesteps(num_inference_steps, device=device) - timesteps = self.scheduler.timesteps - latent_noise = latent_noise * self.scheduler.init_noise_sigma - - num_warmup_steps = len(timesteps) - num_inference_steps * self.scheduler.order - - ref_start_idx = round(len(timesteps) * feedback_start_ratio) - ref_end_idx = round(len(timesteps) * feedback_end_ratio) - - with self.progress_bar(total=num_inference_steps) as pbar: - for i, t in enumerate(timesteps): - sigma = self.scheduler.sigma_t[t] if hasattr(self.scheduler, "sigma_t") else 0 - if hasattr(self.scheduler, "sigmas"): - sigma = self.scheduler.sigmas[i] - - alpha_hat = 1 / (sigma**2 + 1) - - z_single = self.scheduler.scale_model_input(latent_noise, t) - z_all = torch.cat([z_single] * 2, dim=0) - z_ref = torch.cat([positive_latents, negative_latents], dim=0) - - if i >= ref_start_idx and i <= ref_end_idx: - weight_factor = max_weight - else: - weight_factor = min_weight - - pos_ws = (weight_factor, weight_factor * pos_bottleneck_scale) - neg_ws = (weight_factor * neg_scale, weight_factor * neg_scale * neg_bottleneck_scale) - - if z_ref.size(0) > 0 and weight_factor > 0: - noise = torch.randn_like(z_ref) - if isinstance(self.scheduler, EulerAncestralDiscreteScheduler): - z_ref_noised = (alpha_hat**0.5 * z_ref + (1 - alpha_hat) ** 0.5 * noise).type(dtype) - else: - z_ref_noised = self.scheduler.add_noise(z_ref, noise, t) - - ref_prompt_embd = torch.cat( - [null_prompt_emb] * (len(positive_latents) + 
len(negative_latents)), dim=0 - ) - cached_hidden_states = self.get_unet_hidden_states(z_ref_noised, t, ref_prompt_embd) - - n_pos, n_neg = positive_latents.shape[0], negative_latents.shape[0] - cached_pos_hs, cached_neg_hs = [], [] - for hs in cached_hidden_states: - cached_pos, cached_neg = hs.split([n_pos, n_neg], dim=0) - cached_pos = cached_pos.view(1, -1, *cached_pos.shape[2:]).expand(num_images, -1, -1) - cached_neg = cached_neg.view(1, -1, *cached_neg.shape[2:]).expand(num_images, -1, -1) - cached_pos_hs.append(cached_pos) - cached_neg_hs.append(cached_neg) - - if n_pos == 0: - cached_pos_hs = None - if n_neg == 0: - cached_neg_hs = None - else: - cached_pos_hs, cached_neg_hs = None, None -> print("shapehape",cached_pos_hs[0].shape) -E TypeError: 'NoneType' object is not subscriptable - -../../../src/diffusers/pipelines/stable_diffusion/pipeline_fabric.py:717: TypeError -_______________________ FabricPipelineFastTests.test_cfg _______________________ - -self = - - def test_cfg(self): - sig = inspect.signature(self.pipeline_class.__call__) - - if "guidance_scale" not in sig.parameters: - return - - components = self.get_dummy_components() - pipe = self.pipeline_class(**components) - pipe = pipe.to(torch_device) - pipe.set_progress_bar_config(disable=None) - - inputs = self.get_dummy_inputs(torch_device) - - inputs["guidance_scale"] = 1.0 -> out_no_cfg = pipe(**inputs)[0] - -../test_pipelines_common.py:797: -_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ -/home/fractal/.local/lib/python3.10/site-packages/torch/utils/_contextlib.py:115: in decorate_context - return func(*args, **kwargs) -_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ - -self = FabricPipeline { - "_class_name": "FabricPipeline", - "_diffusers_version": "0.21.0.dev0", - "scheduler": [ - "diffu... ], - "unet": [ - "diffusers", - "UNet2DConditionModel" - ], - "vae": [ - "diffusers", - "AutoencoderKL" - ] -} - -prompt = 'A painting of a squirrel eating a burger' -negative_prompt = ['lowres, dark, cropped'], liked = [], disliked = [] -generator = , height = 128 -width = 128, return_dict = True, num_images = 1, guidance_scale = 1.0 -num_inference_steps = 2, output_type = 'np', feedback_start_ratio = 0.33 -feedback_end_ratio = 0.66, min_weight = 0.05, max_weight = 0.8, neg_scale = 0.5 -pos_bottleneck_scale = 1.0, neg_bottleneck_scale = 1.0, latents = None - - @torch.no_grad() - @replace_example_docstring(EXAMPLE_DOC_STRING) - def __call__( - self, - prompt: Optional[Union[str, List[str]]] = "", - negative_prompt: Optional[Union[str, List[str]]] = "lowres, bad anatomy, bad hands, cropped, worst quality", - liked: Optional[Union[List[str], List[Image.Image]]] = [], - disliked: Optional[Union[List[str], List[Image.Image]]] = [], - generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None, - height: int = 512, - width: int = 512, - return_dict: bool = True, - num_images: int = 4, - guidance_scale: float = 7.0, - num_inference_steps: int = 20, - output_type: Optional[str] = "pil", - feedback_start_ratio: float = 0.33, - feedback_end_ratio: float = 0.66, - min_weight: float = 0.05, - max_weight: float = 0.8, - neg_scale: float = 0.5, - pos_bottleneck_scale: float = 1.0, - neg_bottleneck_scale: float = 1.0, - latents: Optional[torch.FloatTensor] = None, - ): - r""" - The call function to the pipeline for generation. Generate a trajectory of images with binary feedback. The - feedback can be given as a list of liked and disliked images. 
- - Args: - prompt (`str` or `List[str]`, *optional*): - The prompt or prompts to guide image generation. If not defined, you need to pass `prompt_embeds` - instead. - negative_prompt (`str` or `List[str]`, *optional*): - The prompt or prompts not to guide the image generation. If not defined, one has to pass - `negative_prompt_embeds` instead. Ignored when not using guidance (i.e., ignored if `guidance_scale` is - less than `1`). - liked (`List[Image.Image]` or `List[str]`, *optional*): - Encourages images with liked features. - disliked (`List[Image.Image]` or `List[str]`, *optional*): - Discourages images with disliked features. - generator (`torch.Generator` or `List[torch.Generator]` or `int`, *optional*): - A [`torch.Generator`](https://pytorch.org/docs/stable/generated/torch.Generator.html) or an `int` to - make generation deterministic. - height (`int`, *optional*, defaults to 512): - Height of the generated image - width (`int`, *optional*, defaults to 512): - Width of the generated image - num_images (`int`, *optional*, defaults to 4): - The number of images to generate per prompt. - guidance_scale (`float`, *optional*, defaults to 7.0): - A higher guidance scale value encourages the model to generate images closely linked to the text - `prompt` at the expense of lower image quality. Guidance scale is enabled when `guidance_scale > 1`. - num_inference_steps (`int`, *optional*, defaults to 20): - The number of denoising steps. More denoising steps usually lead to a higher quality image at the - expense of slower inference. - output_type (`str`, *optional*, defaults to `"pil"`): - The output format of the generated image. Choose between `PIL.Image` or `np.array`. - return_dict (`bool`, *optional*, defaults to `True`): - Whether or not to return a [`~pipelines.stable_diffusion.StableDiffusionPipelineOutput`] instead of a - plain tuple. - feedback_start_ratio (`float`, *optional*, defaults to `.33`): - Start point for providing feedback (between 0 and 1). - feedback_end_ratio (`float`, *optional*, defaults to `.66`): - End point for providing feedback (between 0 and 1). - min_weight (`float`, *optional*, defaults to `.05`): - Minimum weight for feedback. - max_weight (`float`, *optional*, defults tp `1.0`): - Maximum weight for feedback. - neg_scale (`float`, *optional*, defaults to `.5`): - Scale factor for negative feedback. - - Examples: - - Returns: - [`~pipelines.fabric.FabricPipelineOutput`] or `tuple`: - If `return_dict` is `True`, [`~pipelines.fabric.FabricPipelineOutput`] is returned, otherwise a `tuple` - is returned where the first element is a list with the generated images and the second element is a - list of `bool`s indicating whether the corresponding generated image contains "not-safe-for-work" - (nsfw) content. 
- - """ - - self.check_inputs(prompt, negative_prompt, liked, disliked) - - device = self._execution_device - dtype = self.unet.dtype - - if isinstance(prompt, str) and prompt is not None: - batch_size = 1 - elif isinstance(prompt, list) and prompt is not None: - batch_size = len(prompt) - else: - raise ValueError(f"`prompt` has to be of type `str` or `list` but is {type(prompt)}") - - if isinstance(negative_prompt, str): - negative_prompt = [negative_prompt] - elif isinstance(negative_prompt, list): - negative_prompt = negative_prompt - else: - assert len(negative_prompt) == batch_size - - shape = ( - batch_size * num_images, - self.unet.config.in_channels, - height // self.vae_scale_factor, - width // self.vae_scale_factor, - ) - latent_noise = randn_tensor( - shape, - device=device, - dtype=dtype, - generator=generator, - ) - - positive_latents = ( - self.preprocess_feedback_images(liked, self.vae, (height, width), device, dtype, generator) - if liked and len(liked) > 0 - else torch.tensor( - [], - device=device, - dtype=dtype, - ) - ) - negative_latents = ( - self.preprocess_feedback_images(disliked, self.vae, (height, width), device, dtype, generator) - if disliked and len(disliked) > 0 - else torch.tensor( - [], - device=device, - dtype=dtype, - ) - ) - - do_classifier_free_guidance = guidance_scale > 0.1 - - prompt_embs = self._encode_prompt( - prompt, - device, - num_images, - do_classifier_free_guidance, - negative_prompt,) - - null_tokens = self.tokenizer( - [""], - return_tensors="pt", - max_length=self.tokenizer.model_max_length, - padding="max_length", - truncation=True, - ) - - if ( - hasattr(self.text_encoder.config, "use_attention_mask") - and self.text_encoder.config.use_attention_mask - ): - attention_mask = prompt_tokens.attention_mask.to(self.device) - else: - attention_mask = None - - null_prompt_emb = self.text_encoder( - input_ids=null_tokens.input_ids.to(self.device), - attention_mask=attention_mask, - ).last_hidden_state - - batched_prompt_embd = torch.cat([prompt_embs], dim=0) - - self.scheduler.set_timesteps(num_inference_steps, device=device) - timesteps = self.scheduler.timesteps - latent_noise = latent_noise * self.scheduler.init_noise_sigma - - num_warmup_steps = len(timesteps) - num_inference_steps * self.scheduler.order - - ref_start_idx = round(len(timesteps) * feedback_start_ratio) - ref_end_idx = round(len(timesteps) * feedback_end_ratio) - - with self.progress_bar(total=num_inference_steps) as pbar: - for i, t in enumerate(timesteps): - sigma = self.scheduler.sigma_t[t] if hasattr(self.scheduler, "sigma_t") else 0 - if hasattr(self.scheduler, "sigmas"): - sigma = self.scheduler.sigmas[i] - - alpha_hat = 1 / (sigma**2 + 1) - - z_single = self.scheduler.scale_model_input(latent_noise, t) - z_all = torch.cat([z_single] * 2, dim=0) - z_ref = torch.cat([positive_latents, negative_latents], dim=0) - - if i >= ref_start_idx and i <= ref_end_idx: - weight_factor = max_weight - else: - weight_factor = min_weight - - pos_ws = (weight_factor, weight_factor * pos_bottleneck_scale) - neg_ws = (weight_factor * neg_scale, weight_factor * neg_scale * neg_bottleneck_scale) - - if z_ref.size(0) > 0 and weight_factor > 0: - noise = torch.randn_like(z_ref) - if isinstance(self.scheduler, EulerAncestralDiscreteScheduler): - z_ref_noised = (alpha_hat**0.5 * z_ref + (1 - alpha_hat) ** 0.5 * noise).type(dtype) - else: - z_ref_noised = self.scheduler.add_noise(z_ref, noise, t) - - ref_prompt_embd = torch.cat( - [null_prompt_emb] * (len(positive_latents) + 
len(negative_latents)), dim=0 - ) - cached_hidden_states = self.get_unet_hidden_states(z_ref_noised, t, ref_prompt_embd) - - n_pos, n_neg = positive_latents.shape[0], negative_latents.shape[0] - cached_pos_hs, cached_neg_hs = [], [] - for hs in cached_hidden_states: - cached_pos, cached_neg = hs.split([n_pos, n_neg], dim=0) - cached_pos = cached_pos.view(1, -1, *cached_pos.shape[2:]).expand(num_images, -1, -1) - cached_neg = cached_neg.view(1, -1, *cached_neg.shape[2:]).expand(num_images, -1, -1) - cached_pos_hs.append(cached_pos) - cached_neg_hs.append(cached_neg) - - if n_pos == 0: - cached_pos_hs = None - if n_neg == 0: - cached_neg_hs = None - else: - cached_pos_hs, cached_neg_hs = None, None -> print("shapehape",cached_pos_hs[0].shape) -E TypeError: 'NoneType' object is not subscriptable - -../../../src/diffusers/pipelines/stable_diffusion/pipeline_fabric.py:717: TypeError -____________ FabricPipelineFastTests.test_cpu_offload_forward_pass _____________ - -self = -expected_max_diff = 0.0001 - - @unittest.skipIf( - torch_device != "cuda" or not is_accelerate_available() or is_accelerate_version("<", "0.14.0"), - reason="CPU offload is only available with CUDA and `accelerate v0.14.0` or higher", - ) - def test_cpu_offload_forward_pass(self, expected_max_diff=1e-4): - components = self.get_dummy_components() - pipe = self.pipeline_class(**components) - pipe.to(torch_device) - pipe.set_progress_bar_config(disable=None) - - inputs = self.get_dummy_inputs(torch_device) -> output_without_offload = pipe(**inputs)[0] - -../test_pipelines_common.py:688: -_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ -/home/fractal/.local/lib/python3.10/site-packages/torch/utils/_contextlib.py:115: in decorate_context - return func(*args, **kwargs) -_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ - -self = FabricPipeline { - "_class_name": "FabricPipeline", - "_diffusers_version": "0.21.0.dev0", - "scheduler": [ - "diffu... ], - "unet": [ - "diffusers", - "UNet2DConditionModel" - ], - "vae": [ - "diffusers", - "AutoencoderKL" - ] -} - -prompt = 'A painting of a squirrel eating a burger' -negative_prompt = ['lowres, dark, cropped'], liked = [], disliked = [] -generator = , height = 128 -width = 128, return_dict = True, num_images = 1, guidance_scale = 7.0 -num_inference_steps = 2, output_type = 'np', feedback_start_ratio = 0.33 -feedback_end_ratio = 0.66, min_weight = 0.05, max_weight = 0.8, neg_scale = 0.5 -pos_bottleneck_scale = 1.0, neg_bottleneck_scale = 1.0, latents = None - - @torch.no_grad() - @replace_example_docstring(EXAMPLE_DOC_STRING) - def __call__( - self, - prompt: Optional[Union[str, List[str]]] = "", - negative_prompt: Optional[Union[str, List[str]]] = "lowres, bad anatomy, bad hands, cropped, worst quality", - liked: Optional[Union[List[str], List[Image.Image]]] = [], - disliked: Optional[Union[List[str], List[Image.Image]]] = [], - generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None, - height: int = 512, - width: int = 512, - return_dict: bool = True, - num_images: int = 4, - guidance_scale: float = 7.0, - num_inference_steps: int = 20, - output_type: Optional[str] = "pil", - feedback_start_ratio: float = 0.33, - feedback_end_ratio: float = 0.66, - min_weight: float = 0.05, - max_weight: float = 0.8, - neg_scale: float = 0.5, - pos_bottleneck_scale: float = 1.0, - neg_bottleneck_scale: float = 1.0, - latents: Optional[torch.FloatTensor] = None, - ): - r""" - The call function to the pipeline for generation. 
Generate a trajectory of images with binary feedback. The - feedback can be given as a list of liked and disliked images. - - Args: - prompt (`str` or `List[str]`, *optional*): - The prompt or prompts to guide image generation. If not defined, you need to pass `prompt_embeds` - instead. - negative_prompt (`str` or `List[str]`, *optional*): - The prompt or prompts not to guide the image generation. If not defined, one has to pass - `negative_prompt_embeds` instead. Ignored when not using guidance (i.e., ignored if `guidance_scale` is - less than `1`). - liked (`List[Image.Image]` or `List[str]`, *optional*): - Encourages images with liked features. - disliked (`List[Image.Image]` or `List[str]`, *optional*): - Discourages images with disliked features. - generator (`torch.Generator` or `List[torch.Generator]` or `int`, *optional*): - A [`torch.Generator`](https://pytorch.org/docs/stable/generated/torch.Generator.html) or an `int` to - make generation deterministic. - height (`int`, *optional*, defaults to 512): - Height of the generated image - width (`int`, *optional*, defaults to 512): - Width of the generated image - num_images (`int`, *optional*, defaults to 4): - The number of images to generate per prompt. - guidance_scale (`float`, *optional*, defaults to 7.0): - A higher guidance scale value encourages the model to generate images closely linked to the text - `prompt` at the expense of lower image quality. Guidance scale is enabled when `guidance_scale > 1`. - num_inference_steps (`int`, *optional*, defaults to 20): - The number of denoising steps. More denoising steps usually lead to a higher quality image at the - expense of slower inference. - output_type (`str`, *optional*, defaults to `"pil"`): - The output format of the generated image. Choose between `PIL.Image` or `np.array`. - return_dict (`bool`, *optional*, defaults to `True`): - Whether or not to return a [`~pipelines.stable_diffusion.StableDiffusionPipelineOutput`] instead of a - plain tuple. - feedback_start_ratio (`float`, *optional*, defaults to `.33`): - Start point for providing feedback (between 0 and 1). - feedback_end_ratio (`float`, *optional*, defaults to `.66`): - End point for providing feedback (between 0 and 1). - min_weight (`float`, *optional*, defaults to `.05`): - Minimum weight for feedback. - max_weight (`float`, *optional*, defults tp `1.0`): - Maximum weight for feedback. - neg_scale (`float`, *optional*, defaults to `.5`): - Scale factor for negative feedback. - - Examples: - - Returns: - [`~pipelines.fabric.FabricPipelineOutput`] or `tuple`: - If `return_dict` is `True`, [`~pipelines.fabric.FabricPipelineOutput`] is returned, otherwise a `tuple` - is returned where the first element is a list with the generated images and the second element is a - list of `bool`s indicating whether the corresponding generated image contains "not-safe-for-work" - (nsfw) content. 
- - """ - - self.check_inputs(prompt, negative_prompt, liked, disliked) - - device = self._execution_device - dtype = self.unet.dtype - - if isinstance(prompt, str) and prompt is not None: - batch_size = 1 - elif isinstance(prompt, list) and prompt is not None: - batch_size = len(prompt) - else: - raise ValueError(f"`prompt` has to be of type `str` or `list` but is {type(prompt)}") - - if isinstance(negative_prompt, str): - negative_prompt = [negative_prompt] - elif isinstance(negative_prompt, list): - negative_prompt = negative_prompt - else: - assert len(negative_prompt) == batch_size - - shape = ( - batch_size * num_images, - self.unet.config.in_channels, - height // self.vae_scale_factor, - width // self.vae_scale_factor, - ) - latent_noise = randn_tensor( - shape, - device=device, - dtype=dtype, - generator=generator, - ) - - positive_latents = ( - self.preprocess_feedback_images(liked, self.vae, (height, width), device, dtype, generator) - if liked and len(liked) > 0 - else torch.tensor( - [], - device=device, - dtype=dtype, - ) - ) - negative_latents = ( - self.preprocess_feedback_images(disliked, self.vae, (height, width), device, dtype, generator) - if disliked and len(disliked) > 0 - else torch.tensor( - [], - device=device, - dtype=dtype, - ) - ) - - do_classifier_free_guidance = guidance_scale > 0.1 - - prompt_embs = self._encode_prompt( - prompt, - device, - num_images, - do_classifier_free_guidance, - negative_prompt,) - - null_tokens = self.tokenizer( - [""], - return_tensors="pt", - max_length=self.tokenizer.model_max_length, - padding="max_length", - truncation=True, - ) - - if ( - hasattr(self.text_encoder.config, "use_attention_mask") - and self.text_encoder.config.use_attention_mask - ): - attention_mask = prompt_tokens.attention_mask.to(self.device) - else: - attention_mask = None - - null_prompt_emb = self.text_encoder( - input_ids=null_tokens.input_ids.to(self.device), - attention_mask=attention_mask, - ).last_hidden_state - - batched_prompt_embd = torch.cat([prompt_embs], dim=0) - - self.scheduler.set_timesteps(num_inference_steps, device=device) - timesteps = self.scheduler.timesteps - latent_noise = latent_noise * self.scheduler.init_noise_sigma - - num_warmup_steps = len(timesteps) - num_inference_steps * self.scheduler.order - - ref_start_idx = round(len(timesteps) * feedback_start_ratio) - ref_end_idx = round(len(timesteps) * feedback_end_ratio) - - with self.progress_bar(total=num_inference_steps) as pbar: - for i, t in enumerate(timesteps): - sigma = self.scheduler.sigma_t[t] if hasattr(self.scheduler, "sigma_t") else 0 - if hasattr(self.scheduler, "sigmas"): - sigma = self.scheduler.sigmas[i] - - alpha_hat = 1 / (sigma**2 + 1) - - z_single = self.scheduler.scale_model_input(latent_noise, t) - z_all = torch.cat([z_single] * 2, dim=0) - z_ref = torch.cat([positive_latents, negative_latents], dim=0) - - if i >= ref_start_idx and i <= ref_end_idx: - weight_factor = max_weight - else: - weight_factor = min_weight - - pos_ws = (weight_factor, weight_factor * pos_bottleneck_scale) - neg_ws = (weight_factor * neg_scale, weight_factor * neg_scale * neg_bottleneck_scale) - - if z_ref.size(0) > 0 and weight_factor > 0: - noise = torch.randn_like(z_ref) - if isinstance(self.scheduler, EulerAncestralDiscreteScheduler): - z_ref_noised = (alpha_hat**0.5 * z_ref + (1 - alpha_hat) ** 0.5 * noise).type(dtype) - else: - z_ref_noised = self.scheduler.add_noise(z_ref, noise, t) - - ref_prompt_embd = torch.cat( - [null_prompt_emb] * (len(positive_latents) + 
len(negative_latents)), dim=0 - ) - cached_hidden_states = self.get_unet_hidden_states(z_ref_noised, t, ref_prompt_embd) - - n_pos, n_neg = positive_latents.shape[0], negative_latents.shape[0] - cached_pos_hs, cached_neg_hs = [], [] - for hs in cached_hidden_states: - cached_pos, cached_neg = hs.split([n_pos, n_neg], dim=0) - cached_pos = cached_pos.view(1, -1, *cached_pos.shape[2:]).expand(num_images, -1, -1) - cached_neg = cached_neg.view(1, -1, *cached_neg.shape[2:]).expand(num_images, -1, -1) - cached_pos_hs.append(cached_pos) - cached_neg_hs.append(cached_neg) - - if n_pos == 0: - cached_pos_hs = None - if n_neg == 0: - cached_neg_hs = None - else: - cached_pos_hs, cached_neg_hs = None, None -> print("shapehape",cached_pos_hs[0].shape) -E TypeError: 'NoneType' object is not subscriptable - -../../../src/diffusers/pipelines/stable_diffusion/pipeline_fabric.py:717: TypeError -__________ FabricPipelineFastTests.test_dict_tuple_outputs_equivalent __________ - -self = -expected_max_difference = 0.0001 - - def test_dict_tuple_outputs_equivalent(self, expected_max_difference=1e-4): - components = self.get_dummy_components() - pipe = self.pipeline_class(**components) - pipe.to(torch_device) - pipe.set_progress_bar_config(disable=None) - -> output = pipe(**self.get_dummy_inputs(torch_device))[0] - -../test_pipelines_common.py:518: -_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ -/home/fractal/.local/lib/python3.10/site-packages/torch/utils/_contextlib.py:115: in decorate_context - return func(*args, **kwargs) -_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ - -self = FabricPipeline { - "_class_name": "FabricPipeline", - "_diffusers_version": "0.21.0.dev0", - "scheduler": [ - "diffu... ], - "unet": [ - "diffusers", - "UNet2DConditionModel" - ], - "vae": [ - "diffusers", - "AutoencoderKL" - ] -} - -prompt = 'A painting of a squirrel eating a burger' -negative_prompt = ['lowres, dark, cropped'], liked = [], disliked = [] -generator = , height = 128 -width = 128, return_dict = True, num_images = 1, guidance_scale = 7.0 -num_inference_steps = 2, output_type = 'np', feedback_start_ratio = 0.33 -feedback_end_ratio = 0.66, min_weight = 0.05, max_weight = 0.8, neg_scale = 0.5 -pos_bottleneck_scale = 1.0, neg_bottleneck_scale = 1.0, latents = None - - @torch.no_grad() - @replace_example_docstring(EXAMPLE_DOC_STRING) - def __call__( - self, - prompt: Optional[Union[str, List[str]]] = "", - negative_prompt: Optional[Union[str, List[str]]] = "lowres, bad anatomy, bad hands, cropped, worst quality", - liked: Optional[Union[List[str], List[Image.Image]]] = [], - disliked: Optional[Union[List[str], List[Image.Image]]] = [], - generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None, - height: int = 512, - width: int = 512, - return_dict: bool = True, - num_images: int = 4, - guidance_scale: float = 7.0, - num_inference_steps: int = 20, - output_type: Optional[str] = "pil", - feedback_start_ratio: float = 0.33, - feedback_end_ratio: float = 0.66, - min_weight: float = 0.05, - max_weight: float = 0.8, - neg_scale: float = 0.5, - pos_bottleneck_scale: float = 1.0, - neg_bottleneck_scale: float = 1.0, - latents: Optional[torch.FloatTensor] = None, - ): - r""" - The call function to the pipeline for generation. Generate a trajectory of images with binary feedback. The - feedback can be given as a list of liked and disliked images. 
- - Args: - prompt (`str` or `List[str]`, *optional*): - The prompt or prompts to guide image generation. If not defined, you need to pass `prompt_embeds` - instead. - negative_prompt (`str` or `List[str]`, *optional*): - The prompt or prompts not to guide the image generation. If not defined, one has to pass - `negative_prompt_embeds` instead. Ignored when not using guidance (i.e., ignored if `guidance_scale` is - less than `1`). - liked (`List[Image.Image]` or `List[str]`, *optional*): - Encourages images with liked features. - disliked (`List[Image.Image]` or `List[str]`, *optional*): - Discourages images with disliked features. - generator (`torch.Generator` or `List[torch.Generator]` or `int`, *optional*): - A [`torch.Generator`](https://pytorch.org/docs/stable/generated/torch.Generator.html) or an `int` to - make generation deterministic. - height (`int`, *optional*, defaults to 512): - Height of the generated image - width (`int`, *optional*, defaults to 512): - Width of the generated image - num_images (`int`, *optional*, defaults to 4): - The number of images to generate per prompt. - guidance_scale (`float`, *optional*, defaults to 7.0): - A higher guidance scale value encourages the model to generate images closely linked to the text - `prompt` at the expense of lower image quality. Guidance scale is enabled when `guidance_scale > 1`. - num_inference_steps (`int`, *optional*, defaults to 20): - The number of denoising steps. More denoising steps usually lead to a higher quality image at the - expense of slower inference. - output_type (`str`, *optional*, defaults to `"pil"`): - The output format of the generated image. Choose between `PIL.Image` or `np.array`. - return_dict (`bool`, *optional*, defaults to `True`): - Whether or not to return a [`~pipelines.stable_diffusion.StableDiffusionPipelineOutput`] instead of a - plain tuple. - feedback_start_ratio (`float`, *optional*, defaults to `.33`): - Start point for providing feedback (between 0 and 1). - feedback_end_ratio (`float`, *optional*, defaults to `.66`): - End point for providing feedback (between 0 and 1). - min_weight (`float`, *optional*, defaults to `.05`): - Minimum weight for feedback. - max_weight (`float`, *optional*, defults tp `1.0`): - Maximum weight for feedback. - neg_scale (`float`, *optional*, defaults to `.5`): - Scale factor for negative feedback. - - Examples: - - Returns: - [`~pipelines.fabric.FabricPipelineOutput`] or `tuple`: - If `return_dict` is `True`, [`~pipelines.fabric.FabricPipelineOutput`] is returned, otherwise a `tuple` - is returned where the first element is a list with the generated images and the second element is a - list of `bool`s indicating whether the corresponding generated image contains "not-safe-for-work" - (nsfw) content. 
- - """ - - self.check_inputs(prompt, negative_prompt, liked, disliked) - - device = self._execution_device - dtype = self.unet.dtype - - if isinstance(prompt, str) and prompt is not None: - batch_size = 1 - elif isinstance(prompt, list) and prompt is not None: - batch_size = len(prompt) - else: - raise ValueError(f"`prompt` has to be of type `str` or `list` but is {type(prompt)}") - - if isinstance(negative_prompt, str): - negative_prompt = [negative_prompt] - elif isinstance(negative_prompt, list): - negative_prompt = negative_prompt - else: - assert len(negative_prompt) == batch_size - - shape = ( - batch_size * num_images, - self.unet.config.in_channels, - height // self.vae_scale_factor, - width // self.vae_scale_factor, - ) - latent_noise = randn_tensor( - shape, - device=device, - dtype=dtype, - generator=generator, - ) - - positive_latents = ( - self.preprocess_feedback_images(liked, self.vae, (height, width), device, dtype, generator) - if liked and len(liked) > 0 - else torch.tensor( - [], - device=device, - dtype=dtype, - ) - ) - negative_latents = ( - self.preprocess_feedback_images(disliked, self.vae, (height, width), device, dtype, generator) - if disliked and len(disliked) > 0 - else torch.tensor( - [], - device=device, - dtype=dtype, - ) - ) - - do_classifier_free_guidance = guidance_scale > 0.1 - - prompt_embs = self._encode_prompt( - prompt, - device, - num_images, - do_classifier_free_guidance, - negative_prompt,) - - null_tokens = self.tokenizer( - [""], - return_tensors="pt", - max_length=self.tokenizer.model_max_length, - padding="max_length", - truncation=True, - ) - - if ( - hasattr(self.text_encoder.config, "use_attention_mask") - and self.text_encoder.config.use_attention_mask - ): - attention_mask = prompt_tokens.attention_mask.to(self.device) - else: - attention_mask = None - - null_prompt_emb = self.text_encoder( - input_ids=null_tokens.input_ids.to(self.device), - attention_mask=attention_mask, - ).last_hidden_state - - batched_prompt_embd = torch.cat([prompt_embs], dim=0) - - self.scheduler.set_timesteps(num_inference_steps, device=device) - timesteps = self.scheduler.timesteps - latent_noise = latent_noise * self.scheduler.init_noise_sigma - - num_warmup_steps = len(timesteps) - num_inference_steps * self.scheduler.order - - ref_start_idx = round(len(timesteps) * feedback_start_ratio) - ref_end_idx = round(len(timesteps) * feedback_end_ratio) - - with self.progress_bar(total=num_inference_steps) as pbar: - for i, t in enumerate(timesteps): - sigma = self.scheduler.sigma_t[t] if hasattr(self.scheduler, "sigma_t") else 0 - if hasattr(self.scheduler, "sigmas"): - sigma = self.scheduler.sigmas[i] - - alpha_hat = 1 / (sigma**2 + 1) - - z_single = self.scheduler.scale_model_input(latent_noise, t) - z_all = torch.cat([z_single] * 2, dim=0) - z_ref = torch.cat([positive_latents, negative_latents], dim=0) - - if i >= ref_start_idx and i <= ref_end_idx: - weight_factor = max_weight - else: - weight_factor = min_weight - - pos_ws = (weight_factor, weight_factor * pos_bottleneck_scale) - neg_ws = (weight_factor * neg_scale, weight_factor * neg_scale * neg_bottleneck_scale) - - if z_ref.size(0) > 0 and weight_factor > 0: - noise = torch.randn_like(z_ref) - if isinstance(self.scheduler, EulerAncestralDiscreteScheduler): - z_ref_noised = (alpha_hat**0.5 * z_ref + (1 - alpha_hat) ** 0.5 * noise).type(dtype) - else: - z_ref_noised = self.scheduler.add_noise(z_ref, noise, t) - - ref_prompt_embd = torch.cat( - [null_prompt_emb] * (len(positive_latents) + 
len(negative_latents)), dim=0 - ) - cached_hidden_states = self.get_unet_hidden_states(z_ref_noised, t, ref_prompt_embd) - - n_pos, n_neg = positive_latents.shape[0], negative_latents.shape[0] - cached_pos_hs, cached_neg_hs = [], [] - for hs in cached_hidden_states: - cached_pos, cached_neg = hs.split([n_pos, n_neg], dim=0) - cached_pos = cached_pos.view(1, -1, *cached_pos.shape[2:]).expand(num_images, -1, -1) - cached_neg = cached_neg.view(1, -1, *cached_neg.shape[2:]).expand(num_images, -1, -1) - cached_pos_hs.append(cached_pos) - cached_neg_hs.append(cached_neg) - - if n_pos == 0: - cached_pos_hs = None - if n_neg == 0: - cached_neg_hs = None - else: - cached_pos_hs, cached_neg_hs = None, None -> print("shapehape",cached_pos_hs[0].shape) -E TypeError: 'NoneType' object is not subscriptable - -../../../src/diffusers/pipelines/stable_diffusion/pipeline_fabric.py:717: TypeError -_____________________ FabricPipelineFastTests.test_fabric ______________________ - -self = - - def test_fabric(self): - device = "cpu" # ensure determinism for the device-dependent torch.Generator - - components = self.get_dummy_components() - pipe = FabricPipeline(**components) - pipe = pipe.to(device) - - pipe.set_progress_bar_config(disable=True) - - inputs = self.get_dummy_inputs(device) -> output = pipe(**inputs) - -test_fabric.py:135: -_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ -/home/fractal/.local/lib/python3.10/site-packages/torch/utils/_contextlib.py:115: in decorate_context - return func(*args, **kwargs) -_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ - -self = FabricPipeline { - "_class_name": "FabricPipeline", - "_diffusers_version": "0.21.0.dev0", - "scheduler": [ - "diffu... ], - "unet": [ - "diffusers", - "UNet2DConditionModel" - ], - "vae": [ - "diffusers", - "AutoencoderKL" - ] -} - -prompt = 'A painting of a squirrel eating a burger' -negative_prompt = ['lowres, dark, cropped'], liked = [], disliked = [] -generator = , height = 128 -width = 128, return_dict = True, num_images = 1, guidance_scale = 7.0 -num_inference_steps = 2, output_type = 'np', feedback_start_ratio = 0.33 -feedback_end_ratio = 0.66, min_weight = 0.05, max_weight = 0.8, neg_scale = 0.5 -pos_bottleneck_scale = 1.0, neg_bottleneck_scale = 1.0, latents = None - - @torch.no_grad() - @replace_example_docstring(EXAMPLE_DOC_STRING) - def __call__( - self, - prompt: Optional[Union[str, List[str]]] = "", - negative_prompt: Optional[Union[str, List[str]]] = "lowres, bad anatomy, bad hands, cropped, worst quality", - liked: Optional[Union[List[str], List[Image.Image]]] = [], - disliked: Optional[Union[List[str], List[Image.Image]]] = [], - generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None, - height: int = 512, - width: int = 512, - return_dict: bool = True, - num_images: int = 4, - guidance_scale: float = 7.0, - num_inference_steps: int = 20, - output_type: Optional[str] = "pil", - feedback_start_ratio: float = 0.33, - feedback_end_ratio: float = 0.66, - min_weight: float = 0.05, - max_weight: float = 0.8, - neg_scale: float = 0.5, - pos_bottleneck_scale: float = 1.0, - neg_bottleneck_scale: float = 1.0, - latents: Optional[torch.FloatTensor] = None, - ): - r""" - The call function to the pipeline for generation. Generate a trajectory of images with binary feedback. The - feedback can be given as a list of liked and disliked images. - - Args: - prompt (`str` or `List[str]`, *optional*): - The prompt or prompts to guide image generation. 
If not defined, you need to pass `prompt_embeds` - instead. - negative_prompt (`str` or `List[str]`, *optional*): - The prompt or prompts not to guide the image generation. If not defined, one has to pass - `negative_prompt_embeds` instead. Ignored when not using guidance (i.e., ignored if `guidance_scale` is - less than `1`). - liked (`List[Image.Image]` or `List[str]`, *optional*): - Encourages images with liked features. - disliked (`List[Image.Image]` or `List[str]`, *optional*): - Discourages images with disliked features. - generator (`torch.Generator` or `List[torch.Generator]` or `int`, *optional*): - A [`torch.Generator`](https://pytorch.org/docs/stable/generated/torch.Generator.html) or an `int` to - make generation deterministic. - height (`int`, *optional*, defaults to 512): - Height of the generated image - width (`int`, *optional*, defaults to 512): - Width of the generated image - num_images (`int`, *optional*, defaults to 4): - The number of images to generate per prompt. - guidance_scale (`float`, *optional*, defaults to 7.0): - A higher guidance scale value encourages the model to generate images closely linked to the text - `prompt` at the expense of lower image quality. Guidance scale is enabled when `guidance_scale > 1`. - num_inference_steps (`int`, *optional*, defaults to 20): - The number of denoising steps. More denoising steps usually lead to a higher quality image at the - expense of slower inference. - output_type (`str`, *optional*, defaults to `"pil"`): - The output format of the generated image. Choose between `PIL.Image` or `np.array`. - return_dict (`bool`, *optional*, defaults to `True`): - Whether or not to return a [`~pipelines.stable_diffusion.StableDiffusionPipelineOutput`] instead of a - plain tuple. - feedback_start_ratio (`float`, *optional*, defaults to `.33`): - Start point for providing feedback (between 0 and 1). - feedback_end_ratio (`float`, *optional*, defaults to `.66`): - End point for providing feedback (between 0 and 1). - min_weight (`float`, *optional*, defaults to `.05`): - Minimum weight for feedback. - max_weight (`float`, *optional*, defults tp `1.0`): - Maximum weight for feedback. - neg_scale (`float`, *optional*, defaults to `.5`): - Scale factor for negative feedback. - - Examples: - - Returns: - [`~pipelines.fabric.FabricPipelineOutput`] or `tuple`: - If `return_dict` is `True`, [`~pipelines.fabric.FabricPipelineOutput`] is returned, otherwise a `tuple` - is returned where the first element is a list with the generated images and the second element is a - list of `bool`s indicating whether the corresponding generated image contains "not-safe-for-work" - (nsfw) content. 
- - """ - - self.check_inputs(prompt, negative_prompt, liked, disliked) - - device = self._execution_device - dtype = self.unet.dtype - - if isinstance(prompt, str) and prompt is not None: - batch_size = 1 - elif isinstance(prompt, list) and prompt is not None: - batch_size = len(prompt) - else: - raise ValueError(f"`prompt` has to be of type `str` or `list` but is {type(prompt)}") - - if isinstance(negative_prompt, str): - negative_prompt = [negative_prompt] - elif isinstance(negative_prompt, list): - negative_prompt = negative_prompt - else: - assert len(negative_prompt) == batch_size - - shape = ( - batch_size * num_images, - self.unet.config.in_channels, - height // self.vae_scale_factor, - width // self.vae_scale_factor, - ) - latent_noise = randn_tensor( - shape, - device=device, - dtype=dtype, - generator=generator, - ) - - positive_latents = ( - self.preprocess_feedback_images(liked, self.vae, (height, width), device, dtype, generator) - if liked and len(liked) > 0 - else torch.tensor( - [], - device=device, - dtype=dtype, - ) - ) - negative_latents = ( - self.preprocess_feedback_images(disliked, self.vae, (height, width), device, dtype, generator) - if disliked and len(disliked) > 0 - else torch.tensor( - [], - device=device, - dtype=dtype, - ) - ) - - do_classifier_free_guidance = guidance_scale > 0.1 - - prompt_embs = self._encode_prompt( - prompt, - device, - num_images, - do_classifier_free_guidance, - negative_prompt,) - - null_tokens = self.tokenizer( - [""], - return_tensors="pt", - max_length=self.tokenizer.model_max_length, - padding="max_length", - truncation=True, - ) - - if ( - hasattr(self.text_encoder.config, "use_attention_mask") - and self.text_encoder.config.use_attention_mask - ): - attention_mask = prompt_tokens.attention_mask.to(self.device) - else: - attention_mask = None - - null_prompt_emb = self.text_encoder( - input_ids=null_tokens.input_ids.to(self.device), - attention_mask=attention_mask, - ).last_hidden_state - - batched_prompt_embd = torch.cat([prompt_embs], dim=0) - - self.scheduler.set_timesteps(num_inference_steps, device=device) - timesteps = self.scheduler.timesteps - latent_noise = latent_noise * self.scheduler.init_noise_sigma - - num_warmup_steps = len(timesteps) - num_inference_steps * self.scheduler.order - - ref_start_idx = round(len(timesteps) * feedback_start_ratio) - ref_end_idx = round(len(timesteps) * feedback_end_ratio) - - with self.progress_bar(total=num_inference_steps) as pbar: - for i, t in enumerate(timesteps): - sigma = self.scheduler.sigma_t[t] if hasattr(self.scheduler, "sigma_t") else 0 - if hasattr(self.scheduler, "sigmas"): - sigma = self.scheduler.sigmas[i] - - alpha_hat = 1 / (sigma**2 + 1) - - z_single = self.scheduler.scale_model_input(latent_noise, t) - z_all = torch.cat([z_single] * 2, dim=0) - z_ref = torch.cat([positive_latents, negative_latents], dim=0) - - if i >= ref_start_idx and i <= ref_end_idx: - weight_factor = max_weight - else: - weight_factor = min_weight - - pos_ws = (weight_factor, weight_factor * pos_bottleneck_scale) - neg_ws = (weight_factor * neg_scale, weight_factor * neg_scale * neg_bottleneck_scale) - - if z_ref.size(0) > 0 and weight_factor > 0: - noise = torch.randn_like(z_ref) - if isinstance(self.scheduler, EulerAncestralDiscreteScheduler): - z_ref_noised = (alpha_hat**0.5 * z_ref + (1 - alpha_hat) ** 0.5 * noise).type(dtype) - else: - z_ref_noised = self.scheduler.add_noise(z_ref, noise, t) - - ref_prompt_embd = torch.cat( - [null_prompt_emb] * (len(positive_latents) + 
len(negative_latents)), dim=0 - ) - cached_hidden_states = self.get_unet_hidden_states(z_ref_noised, t, ref_prompt_embd) - - n_pos, n_neg = positive_latents.shape[0], negative_latents.shape[0] - cached_pos_hs, cached_neg_hs = [], [] - for hs in cached_hidden_states: - cached_pos, cached_neg = hs.split([n_pos, n_neg], dim=0) - cached_pos = cached_pos.view(1, -1, *cached_pos.shape[2:]).expand(num_images, -1, -1) - cached_neg = cached_neg.view(1, -1, *cached_neg.shape[2:]).expand(num_images, -1, -1) - cached_pos_hs.append(cached_pos) - cached_neg_hs.append(cached_neg) - - if n_pos == 0: - cached_pos_hs = None - if n_neg == 0: - cached_neg_hs = None - else: - cached_pos_hs, cached_neg_hs = None, None -> print("shapehape",cached_pos_hs[0].shape) -E TypeError: 'NoneType' object is not subscriptable - -../../../src/diffusers/pipelines/stable_diffusion/pipeline_fabric.py:717: TypeError -=============================== warnings summary =============================== -../../../../../.local/lib/python3.10/site-packages/accelerate/utils/dataclasses.py:29 - /home/fractal/.local/lib/python3.10/site-packages/accelerate/utils/dataclasses.py:29: DeprecationWarning: The distutils package is deprecated and slated for removal in Python 3.12. Use setuptools or check PEP 632 for potential alternatives - from distutils.util import strtobool - --- Docs: https://docs.pytest.org/en/stable/how-to/capture-warnings.html -=========================== short test summary info ============================ -FAILED test_fabric.py::FabricPipelineFastTests::test_attention_slicing_forward_pass -FAILED test_fabric.py::FabricPipelineFastTests::test_cfg - TypeError: 'NoneTy... -FAILED test_fabric.py::FabricPipelineFastTests::test_cpu_offload_forward_pass -FAILED test_fabric.py::FabricPipelineFastTests::test_dict_tuple_outputs_equivalent -FAILED test_fabric.py::FabricPipelineFastTests::test_fabric - TypeError: 'Non... -!!!!!!!!!!!!!!!!!!!!!!!!!!!!!! KeyboardInterrupt !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!! 
-/home/fractal/.local/lib/python3.10/site-packages/urllib3/connection.py:174: KeyboardInterrupt -(to show a full traceback on KeyboardInterrupt use --full-trace) -============== 5 failed, 1 passed, 1 warning in 78.87s (0:01:18) =============== diff --git a/tests/pipelines/stable_diffusion/test_fabric.py b/tests/pipelines/stable_diffusion/test_fabric.py index 7d0b94c81dd0..5ec05b918e54 100644 --- a/tests/pipelines/stable_diffusion/test_fabric.py +++ b/tests/pipelines/stable_diffusion/test_fabric.py @@ -176,9 +176,7 @@ def test_monkey_patching(self): t = 0 prompt_embd = torch.randn(2, 77, 32) cached_pos_hiddens = [torch.randn(1, 1024, 64)] * 6 - print(len(cached_pos_hiddens)) self.get_dummy_inputs(device) - print(cached_pos_hiddens[0].shape) # out = model.unet(z_all, t, encoder_hidden_states=prompt_embd) components = self.get_dummy_components() pipe = FabricPipeline(**components) @@ -194,48 +192,3 @@ def test_monkey_patching(self): self.assertTrue(np.allclose(image_slice.flatten(), expected_slice, atol=1e-2)) - -@nightly -@require_torch_gpu -class FABRICPipelineIntegrationTests(unittest.TestCase): - def tearDown(self): - super().tearDown() - gc.collect() - torch.cuda.empty_cache() - - def test_fabric(self): - generator = torch.manual_seed(0) - - pipe = FabricPipeline.from_pretrained("dreamlike-art/dreamlike-photoreal-2.0", torch_dtype=torch.float16) - pipe.to("cuda") - - prompt = "a photograph of an astronaut riding a horse" - images = pipe(prompt, output_type="np", generator=generator, num_inference_steps=2).images - - images = images[0, -3:, -3:, -1].flatten() - - expected_image = np.array( - [0.46241423, 0.45808375, 0.4768011, 0.48806447, 0.46090087, 0.5161956, 0.52250206, 0.50051796, 0.4663524] - ) - - self.assertTrue(np.allclose(images, expected_image, atol=1e-4)) - - def test_fabric_feedback(self): - generator = torch.manual_seed(0) - - pipe = FabricPipeline.from_pretrained("dreamlike-art/dreamlike-photoreal-2.0", torch_dtype=torch.float16) - pipe.to("cuda") - - prompt = "a photograph of an astronaut riding a horse" - images = pipe(prompt, output_type="pil", generator=generator, num_inference_steps=2).images - - liked = [images[0]] - images = pipe(prompt, output_type="np", generator=generator, num_inference_steps=2, liked=liked).images - - images = images[0, -3:, -3:, -1].flatten() - - expected_image = np.array( - [0.46241423, 0.45808375, 0.4768011, 0.48806447, 0.46090087, 0.5161956, 0.52250206, 0.50051796, 0.4663524] - ) - - self.assertTrue(np.allclose(images, expected_image, atol=1e-4)) From 8c3516bb04ec4abf578e8e8c6937739caa194f31 Mon Sep 17 00:00:00 2001 From: shauray8 Date: Fri, 25 Aug 2023 11:30:09 +0530 Subject: [PATCH 93/98] new images --- tests/pipelines/stable_diffusion/test_fabric.py | 4 ---- 1 file changed, 4 deletions(-) diff --git a/tests/pipelines/stable_diffusion/test_fabric.py b/tests/pipelines/stable_diffusion/test_fabric.py index 5ec05b918e54..5a062899c24f 100644 --- a/tests/pipelines/stable_diffusion/test_fabric.py +++ b/tests/pipelines/stable_diffusion/test_fabric.py @@ -13,7 +13,6 @@ # See the License for the specific language governing permissions and # limitations under the License. 
-import gc
 import unittest
 
 import numpy as np
@@ -29,8 +28,6 @@
 )
 from diffusers.utils.testing_utils import (
     enable_full_determinism,
-    nightly,
-    require_torch_gpu,
 )
 
 from ..pipeline_params import TEXT_TO_IMAGE_BATCH_PARAMS, TEXT_TO_IMAGE_IMAGE_PARAMS, TEXT_TO_IMAGE_PARAMS
@@ -191,4 +188,3 @@ def test_monkey_patching(self):
         ).flatten()
 
         self.assertTrue(np.allclose(image_slice.flatten(), expected_slice, atol=1e-2))
-

From 3a347677dc72d61bc171e1973c68b999b3940bbc Mon Sep 17 00:00:00 2001
From: shauray8
Date: Sat, 26 Aug 2023 23:35:13 +0530
Subject: [PATCH 94/98] to community examples

---
 examples/community/README.md                  |  65 ++++++
 .../community}/pipeline_fabric.py             |  24 +--
 src/diffusers/__init__.py                     |   1 -
 src/diffusers/pipelines/__init__.py           |   1 -
 .../pipelines/stable_diffusion/__init__.py    |   1 -
 .../pipelines/stable_diffusion/test_fabric.py | 190 ------------------
 6 files changed, 77 insertions(+), 205 deletions(-)
 rename {src/diffusers/pipelines/stable_diffusion => examples/community}/pipeline_fabric.py (97%)
 delete mode 100644 tests/pipelines/stable_diffusion/test_fabric.py

diff --git a/examples/community/README.md b/examples/community/README.md
index 921160e451e1..9715324d9fa7 100755
--- a/examples/community/README.md
+++ b/examples/community/README.md
@@ -41,6 +41,7 @@ If a community doesn't work as expected, please open an issue and ping the autho
 | IADB Pipeline | Implementation of [Iterative α-(de)Blending: a Minimalist Deterministic Diffusion Model](https://arxiv.org/abs/2305.03486) | [IADB Pipeline](#iadb-pipeline) | - | [Thomas Chambon](https://github.com/tchambon)
 | Zero1to3 Pipeline | Implementation of [Zero-1-to-3: Zero-shot One Image to 3D Object](https://arxiv.org/abs/2303.11328) | [Zero1to3 Pipeline](#Zero1to3-pipeline) | - | [Xin Kong](https://github.com/kxhit)
 | Stable Diffusion XL Long Weighted Prompt Pipeline | A pipeline support unlimited length of prompt and negative prompt, use A1111 style of prompt weighting | [Stable Diffusion XL Long Weighted Prompt Pipeline](#stable-diffusion-xl-long-weighted-prompt-pipeline) | - | [Andrew Zhu](https://xhinker.medium.com/) |
+| FABRIC - Stable Diffusion with feedback Pipeline | A pipeline that supports feedback from liked and disliked images | [Stable Diffusion Fabric Pipeline](#stable-diffusion-fabric-pipeline) | - | [Shauray Singh](https://shauray8.github.io/about_shauray/) |
 
 To load a custom pipeline you just need to pass the `custom_pipeline` argument to `DiffusionPipeline`, as one of the files in `diffusers/examples/community`. Feel free to send a PR with your own pipelines, we will merge them quickly.
 
@@ -1889,3 +1890,67 @@ for obj in range(bs):
 ```
 
+### Stable diffusion fabric pipeline
+
+FABRIC is an approach applicable to a wide range of popular diffusion models, which exploits
+the self-attention layers present in the most widely used architectures to condition
+the diffusion process on a set of feedback images.
+
+
+```python
+import requests
+import torch
+from PIL import Image
+from io import BytesIO
+
+from diffusers import DiffusionPipeline
+
+# load the pipeline
+# make sure you're logged in with `huggingface-cli login`
+model_id_or_path = "runwayml/stable-diffusion-v1-5"
+# can also be used with dreamlike-art/dreamlike-photoreal-2.0
+pipe = DiffusionPipeline.from_pretrained(model_id_or_path, torch_dtype=torch.float16, custom_pipeline="pipeline_fabric").to("cuda")
+
+# let's specify a prompt
+prompt = "An astronaut riding an elephant"
+negative_prompt = "lowres, cropped"
+
+# call the pipeline
+image = pipe(
+    prompt=prompt,
+    negative_prompt=negative_prompt,
+    num_inference_steps=20,
+    generator=torch.manual_seed(12),
+).images[0]
+
+image.save("horse_to_elephant.jpg")
+
+# let's try another example with feedback
+url = "https://raw.githubusercontent.com/ChenWu98/cycle-diffusion/main/data/dalle2/A%20black%20colored%20car.png"
+response = requests.get(url)
+init_image = Image.open(BytesIO(response.content)).convert("RGB")
+
+prompt = "photo, A blue colored car, fish eye"
+liked = [init_image]
+# same goes for disliked images
+
+# call the pipeline
+torch.manual_seed(0)
+image = pipe(
+    prompt=prompt,
+    negative_prompt=negative_prompt,
+    liked=liked,
+    num_inference_steps=20,
+).images[0]
+
+image.save("black_to_blue.png")
+```
+
+The original codebase can be found at [sd-fabric/fabric](https://github.com/sd-fabric/fabric), and available checkpoints are [dreamlike-art/dreamlike-photoreal-2.0](https://huggingface.co/dreamlike-art/dreamlike-photoreal-2.0), [runwayml/stable-diffusion-v1-5](https://huggingface.co/runwayml/stable-diffusion-v1-5), and [stabilityai/stable-diffusion-2-1](https://huggingface.co/stabilityai/stable-diffusion-2-1) (may give unexpected results).
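+
+Under the hood, the pipeline reuses the U-Net's self-attention layers: hidden states computed for the
+feedback images are cached and then appended to the attention keys/values while generating, so the new
+image can be pulled toward (or pushed away from) those features. The snippet below is only a rough,
+single-head sketch of that idea with made-up tensor shapes and a hypothetical `weight` factor — it is not
+the pipeline's actual attention processor.
+
+```python
+import math
+
+import torch
+
+# Illustrative shapes only (all made up): 2 generated samples, 64 tokens, hidden size 32.
+hidden_states = torch.randn(2, 64, 32)  # current self-attention input for the generated latents
+liked_hidden = torch.randn(1, 64, 32)   # cached hidden states from a "liked" feedback image
+weight = 1.5                            # > 1 emphasizes the feedback tokens, < 1 suppresses them (disliked)
+
+# Append the cached feedback states to the keys/values; the queries stay unchanged.
+extended = torch.cat([hidden_states, liked_hidden.expand(hidden_states.size(0), -1, -1)], dim=1)
+
+# Plain scaled dot-product attention (no projections or heads, for brevity).
+scores = torch.einsum("bqd,bkd->bqk", hidden_states, extended) / math.sqrt(hidden_states.size(-1))
+# Bias the scores of the appended tokens: adding log(weight) scales their unnormalized attention by `weight`.
+scores[:, :, hidden_states.size(1):] += math.log(weight)
+out = torch.einsum("bqk,bkd->bqd", scores.softmax(dim=-1), extended)
+print(out.shape)  # torch.Size([2, 64, 32])
+```
+
+The real pipeline applies this kind of weighting only between `feedback_start_ratio` and `feedback_end_ratio`
+of the denoising schedule, with separate weights for liked and disliked images.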
+ +Let's have a look at the images (*512X512*) + +| Without Feedback | With Feedback (1st image) | +|---------------------|---------------------| +| ![Image 1](https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/fabric_wo_feedback.jpg) | ![Feedback Image 1](https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/fabric_w_feedback.png) | + diff --git a/src/diffusers/pipelines/stable_diffusion/pipeline_fabric.py b/examples/community/pipeline_fabric.py similarity index 97% rename from src/diffusers/pipelines/stable_diffusion/pipeline_fabric.py rename to examples/community/pipeline_fabric.py index a78d38852a3d..59fb996417a2 100644 --- a/src/diffusers/pipelines/stable_diffusion/pipeline_fabric.py +++ b/examples/community/pipeline_fabric.py @@ -18,21 +18,21 @@ from PIL import Image from transformers import CLIPTextModel, CLIPTokenizer -from ...configuration_utils import FrozenDict -from ...image_processor import VaeImageProcessor -from ...loaders import LoraLoaderMixin, TextualInversionLoaderMixin -from ...models import AutoencoderKL, UNet2DConditionModel -from ...models.attention import BasicTransformerBlock -from ...models.attention_processor import LoRAAttnProcessor -from ...schedulers import EulerAncestralDiscreteScheduler, KarrasDiffusionSchedulers -from ...utils import ( +from diffusers.configuration_utils import FrozenDict +from diffusers.image_processor import VaeImageProcessor +from diffusers.loaders import LoraLoaderMixin, TextualInversionLoaderMixin +from diffusers import AutoencoderKL, UNet2DConditionModel +from diffusers.models.attention import BasicTransformerBlock +from diffusers.models.attention_processor import LoRAAttnProcessor +from diffusers.schedulers import EulerAncestralDiscreteScheduler, KarrasDiffusionSchedulers +from diffusers.utils import ( deprecate, logging, randn_tensor, replace_example_docstring, ) -from ..pipeline_utils import DiffusionPipeline -from . 
import StableDiffusionPipelineOutput +from diffusion.pipelines.pipeline_utils import DiffusionPipeline +from diffusers.pipelines.stable_diffusion import StableDiffusionPipelineOutput logger = logging.get_logger(__name__) # pylint: disable=invalid-name @@ -40,11 +40,11 @@ EXAMPLE_DOC_STRING = """ Examples: ```py - >>> from diffusers import FabricPipeline + >>> from diffusers import DiffusionPipeline >>> import torch >>> model_id = "dreamlike-art/dreamlike-photoreal-2.0" - >>> pipe = FabricPipeline(model_id, torch_dtype=torch.float16) + >>> pipe = DiffusionPipeline(model_id, torch_dtype=torch.float16, custom_pipeline="pipeline_fabric") >>> pipe = pipe.to("cuda") >>> prompt = "a giant standing in a fantasy landscape best quality" >>> liked = [] # list of images for positive feedback diff --git a/src/diffusers/__init__.py b/src/diffusers/__init__.py index d3716c5d932a..7780976d4d9f 100644 --- a/src/diffusers/__init__.py +++ b/src/diffusers/__init__.py @@ -135,7 +135,6 @@ AltDiffusionPipeline, AudioLDMPipeline, CycleDiffusionPipeline, - FabricPipeline, IFImg2ImgPipeline, IFImg2ImgSuperResolutionPipeline, IFInpaintingPipeline, diff --git a/src/diffusers/pipelines/__init__.py b/src/diffusers/pipelines/__init__.py index 29678450084e..16153951126a 100644 --- a/src/diffusers/pipelines/__init__.py +++ b/src/diffusers/pipelines/__init__.py @@ -87,7 +87,6 @@ from .shap_e import ShapEImg2ImgPipeline, ShapEPipeline from .stable_diffusion import ( CycleDiffusionPipeline, - FabricPipeline, StableDiffusionAttendAndExcitePipeline, StableDiffusionDepth2ImgPipeline, StableDiffusionDiffEditPipeline, diff --git a/src/diffusers/pipelines/stable_diffusion/__init__.py b/src/diffusers/pipelines/stable_diffusion/__init__.py index 81717f24ef93..1cef019e06a9 100644 --- a/src/diffusers/pipelines/stable_diffusion/__init__.py +++ b/src/diffusers/pipelines/stable_diffusion/__init__.py @@ -43,7 +43,6 @@ class StableDiffusionPipelineOutput(BaseOutput): from ...utils.dummy_torch_and_transformers_objects import * # noqa F403 else: from .pipeline_cycle_diffusion import CycleDiffusionPipeline - from .pipeline_fabric import FabricPipeline from .pipeline_stable_diffusion import StableDiffusionPipeline from .pipeline_stable_diffusion_attend_and_excite import StableDiffusionAttendAndExcitePipeline from .pipeline_stable_diffusion_gligen import StableDiffusionGLIGENPipeline diff --git a/tests/pipelines/stable_diffusion/test_fabric.py b/tests/pipelines/stable_diffusion/test_fabric.py deleted file mode 100644 index 5a062899c24f..000000000000 --- a/tests/pipelines/stable_diffusion/test_fabric.py +++ /dev/null @@ -1,190 +0,0 @@ -# coding=utf-8 -# Copyright 2023 HuggingFace Inc. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
- -import unittest - -import numpy as np -import torch -from PIL import Image -from transformers import CLIPTextConfig, CLIPTextModel, CLIPTokenizer - -from diffusers import ( - AutoencoderKL, - EulerAncestralDiscreteScheduler, - FabricPipeline, - UNet2DConditionModel, -) -from diffusers.utils.testing_utils import ( - enable_full_determinism, -) - -from ..pipeline_params import TEXT_TO_IMAGE_BATCH_PARAMS, TEXT_TO_IMAGE_IMAGE_PARAMS, TEXT_TO_IMAGE_PARAMS -from ..test_pipelines_common import PipelineTesterMixin - - -enable_full_determinism() - - -class FabricPipelineFastTests(PipelineTesterMixin, unittest.TestCase): - pipeline_class = FabricPipeline - params = TEXT_TO_IMAGE_PARAMS - { - "negative_prompt_embeds", - "prompt_embeds", - "cross_attention_kwargs", - "callback", - "callback_steps", - } - batch_params = TEXT_TO_IMAGE_BATCH_PARAMS - image_params = TEXT_TO_IMAGE_IMAGE_PARAMS - required_optional_params = PipelineTesterMixin.required_optional_params - { - "latents", - "num_images_per_prompt", - "callback", - "callback_steps", - } - - def get_dummy_components(self): - torch.manual_seed(0) - unet = UNet2DConditionModel( - block_out_channels=(32, 64), - layers_per_block=2, - sample_size=32, - in_channels=4, - out_channels=4, - down_block_types=("DownBlock2D", "CrossAttnDownBlock2D"), - up_block_types=("CrossAttnUpBlock2D", "UpBlock2D"), - cross_attention_dim=32, - ) - torch.manual_seed(0) - scheduler = EulerAncestralDiscreteScheduler() - torch.manual_seed(0) - vae = AutoencoderKL( - block_out_channels=[32, 64], - in_channels=3, - out_channels=3, - down_block_types=["DownEncoderBlock2D", "DownEncoderBlock2D"], - up_block_types=["UpDecoderBlock2D", "UpDecoderBlock2D"], - latent_channels=4, - ) - torch.manual_seed(0) - text_encoder_config = CLIPTextConfig( - bos_token_id=0, - eos_token_id=2, - hidden_size=32, - intermediate_size=37, - layer_norm_eps=1e-05, - num_attention_heads=4, - num_hidden_layers=5, - pad_token_id=1, - vocab_size=1000, - ) - text_encoder = CLIPTextModel(text_encoder_config) - tokenizer = CLIPTokenizer.from_pretrained("hf-internal-testing/tiny-random-clip") - - components = { - "unet": unet, - "scheduler": scheduler, - "vae": vae, - "text_encoder": text_encoder, - "tokenizer": tokenizer, - } - return components - - def get_dummy_inputs(self, device, seed=0): - if str(device).startswith("mps"): - generator = torch.manual_seed(seed) - else: - generator = torch.Generator(device=device).manual_seed(seed) - inputs = { - "prompt": "A painting of a squirrel eating a burger", - "negative_prompt": "lowres, dark, cropped", - "generator": generator, - "num_images": 1, - "num_inference_steps": 2, - "output_type": "np", - "height": 128, - "width": 128, - } - return inputs - - def test_fabric(self): - device = "cpu" # ensure determinism for the device-dependent torch.Generator - - components = self.get_dummy_components() - pipe = FabricPipeline(**components) - pipe = pipe.to(device) - - pipe.set_progress_bar_config(disable=True) - - inputs = self.get_dummy_inputs(device) - output = pipe(**inputs) - image = output.images - image_slice = image[0, -3:, -3:, -1] - assert image.shape == (1, 128, 128, 3) - expected_slice = np.array( - [0.46241423, 0.45808375, 0.4768011, 0.48806447, 0.46090087, 0.5161956, 0.52250206, 0.50051796, 0.4663524] - ) - - self.assertTrue(np.allclose(image_slice.flatten(), expected_slice, atol=1e-2)) - - def test_fabric_w_fb(self): - device = "cpu" # ensure determinism for the device-dependent torch.Generator - - components = self.get_dummy_components() - pipe = 
FabricPipeline(**components) - pipe = pipe.to(device) - - pipe.set_progress_bar_config(disable=True) - - inputs = self.get_dummy_inputs(device) - inputs["liked"] = [Image.fromarray(np.ones((512, 512)))] - output = pipe(**inputs) - image = output.images - image_slice = output.images[0, -3:, -3:, -1] - - assert image.shape == (1, 128, 128, 3) - expected_slice = np.array( - [ - [0.46259943, 0.45826188, 0.4768875], - [0.4880805, 0.46087098, 0.5162324], - [0.5224824, 0.5005106, 0.46634308], - ] - ).flatten() - - self.assertTrue(np.allclose(image_slice.flatten(), expected_slice, atol=1e-2)) - - def test_monkey_patching(self): - # Create a sample model and module - device = "cpu" - torch.manual_seed(0) - z_all = torch.randn(2, 4, 64, 64) - t = 0 - prompt_embd = torch.randn(2, 77, 32) - cached_pos_hiddens = [torch.randn(1, 1024, 64)] * 6 - self.get_dummy_inputs(device) - # out = model.unet(z_all, t, encoder_hidden_states=prompt_embd) - components = self.get_dummy_components() - pipe = FabricPipeline(**components) - pipe = pipe.to(device) - pipeline = pipe.unet_forward_with_cached_hidden_states( - z_all, t, prompt_embd, cached_pos_hiddens=cached_pos_hiddens - )[0] - - image_slice = pipeline[0, -3:, -3:, -1].detach().numpy() - expected_slice = np.array( - [[-0.0590, 0.3149, -0.1035], [0.0016, -0.1665, 0.1026], [-0.0626, 0.0607, -0.1045]] - ).flatten() - - self.assertTrue(np.allclose(image_slice.flatten(), expected_slice, atol=1e-2)) From 4029b48f7248a4c5895858b23b26a11da2d5ead1 Mon Sep 17 00:00:00 2001 From: shauray8 Date: Sat, 26 Aug 2023 23:35:50 +0530 Subject: [PATCH 95/98] selete --- examples/community/pipeline_fabric.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/examples/community/pipeline_fabric.py b/examples/community/pipeline_fabric.py index 59fb996417a2..5bdd3ef26f24 100644 --- a/examples/community/pipeline_fabric.py +++ b/examples/community/pipeline_fabric.py @@ -14,16 +14,18 @@ from typing import List, Optional, Union import torch +from diffusion.pipelines.pipeline_utils import DiffusionPipeline from packaging import version from PIL import Image from transformers import CLIPTextModel, CLIPTokenizer +from diffusers import AutoencoderKL, UNet2DConditionModel from diffusers.configuration_utils import FrozenDict from diffusers.image_processor import VaeImageProcessor from diffusers.loaders import LoraLoaderMixin, TextualInversionLoaderMixin -from diffusers import AutoencoderKL, UNet2DConditionModel from diffusers.models.attention import BasicTransformerBlock from diffusers.models.attention_processor import LoRAAttnProcessor +from diffusers.pipelines.stable_diffusion import StableDiffusionPipelineOutput from diffusers.schedulers import EulerAncestralDiscreteScheduler, KarrasDiffusionSchedulers from diffusers.utils import ( deprecate, @@ -31,8 +33,6 @@ randn_tensor, replace_example_docstring, ) -from diffusion.pipelines.pipeline_utils import DiffusionPipeline -from diffusers.pipelines.stable_diffusion import StableDiffusionPipelineOutput logger = logging.get_logger(__name__) # pylint: disable=invalid-name From 40c097191bf3db3d25db9a03f2daba136619990b Mon Sep 17 00:00:00 2001 From: shauray8 Date: Sat, 26 Aug 2023 23:49:05 +0530 Subject: [PATCH 96/98] more fixes --- .../pipelines/stable_diffusion/README.md | 51 ------------------- .../dummy_torch_and_transformers_objects.py | 15 ------ 2 files changed, 66 deletions(-) diff --git a/src/diffusers/pipelines/stable_diffusion/README.md b/src/diffusers/pipelines/stable_diffusion/README.md index 13d5a72dcb90..66df9a811afb 
100644 --- a/src/diffusers/pipelines/stable_diffusion/README.md +++ b/src/diffusers/pipelines/stable_diffusion/README.md @@ -174,54 +174,3 @@ image = pipe( image.save("black_to_blue.png") ``` - -### FABRIC using Stable Diffusion - -```python -import requests -import torch -from PIL import Image -from io import BytesIO - -from diffusers import FabricPipeline - -# load the pipeline -# make sure you're logged in with `huggingface-cli login` -model_id_or_path = "runwayml/stable-diffusion-v1-5" -#can also be used with dreamlike-art/dreamlike-photoreal-2.0 -pipe = FabricPipeline.from_pretrained(model_id_or_path, ).to("cuda") - -# let's specify a prompt -prompt = "An astronaut riding an elephant" -negative_prompt = "lowres, cropped" - -# call the pipeline -image = pipe( - prompt=prompt, - negative_prompt=negative_prompt, - num_inference_steps=20, - generator=torch.manual_seed(12) -).images[0] - -image.save("horse_to_elephant.jpg") - -# let's try another example with feedback -url = "https://raw.githubusercontent.com/ChenWu98/cycle-diffusion/main/data/dalle2/A%20black%20colored%20car.png" -response = requests.get(url) -init_image = Image.open(BytesIO(response.content)).convert("RGB") - -prompt = "photo, A blue colored car, fish eye" -liked = [init_image] -## same goes with disliked - -# call the pipeline -torch.manual_seed(0) -image = pipe( - prompt=prompt, - negative_prompt=negative_prompt, - liked = liked, - num_inference_steps=20, -).images[0] - -image.save("black_to_blue.png") -``` diff --git a/src/diffusers/utils/dummy_torch_and_transformers_objects.py b/src/diffusers/utils/dummy_torch_and_transformers_objects.py index e17ee518fd82..719ea3e340f8 100644 --- a/src/diffusers/utils/dummy_torch_and_transformers_objects.py +++ b/src/diffusers/utils/dummy_torch_and_transformers_objects.py @@ -107,21 +107,6 @@ def from_pretrained(cls, *args, **kwargs): requires_backends(cls, ["torch", "transformers"]) -class FabricPipeline(metaclass=DummyObject): - _backends = ["torch", "transformers"] - - def __init__(self, *args, **kwargs): - requires_backends(self, ["torch", "transformers"]) - - @classmethod - def from_config(cls, *args, **kwargs): - requires_backends(cls, ["torch", "transformers"]) - - @classmethod - def from_pretrained(cls, *args, **kwargs): - requires_backends(cls, ["torch", "transformers"]) - - class IFImg2ImgPipeline(metaclass=DummyObject): _backends = ["torch", "transformers"] From f05becad4490a18ac8ea2b90aa947154186dce66 Mon Sep 17 00:00:00 2001 From: shauray8 Date: Sun, 27 Aug 2023 23:08:55 +0530 Subject: [PATCH 97/98] changes --- docs/source/en/_toctree.yml | 2 -- docs/source/en/api/pipelines/fabric.md | 43 -------------------------- examples/community/README.md | 2 ++ examples/community/pipeline_fabric.py | 2 +- 4 files changed, 3 insertions(+), 46 deletions(-) delete mode 100644 docs/source/en/api/pipelines/fabric.md diff --git a/docs/source/en/_toctree.yml b/docs/source/en/_toctree.yml index 5c8727e29e1f..b079504f2cf4 100644 --- a/docs/source/en/_toctree.yml +++ b/docs/source/en/_toctree.yml @@ -214,8 +214,6 @@ title: DiffEdit - local: api/pipelines/dit title: DiT - - local: api/pipelines/fabric - title: FABRIC - local: api/pipelines/pix2pix title: InstructPix2Pix - local: api/pipelines/kandinsky diff --git a/docs/source/en/api/pipelines/fabric.md b/docs/source/en/api/pipelines/fabric.md deleted file mode 100644 index 54c8853f4b07..000000000000 --- a/docs/source/en/api/pipelines/fabric.md +++ /dev/null @@ -1,43 +0,0 @@ - - -# FABRIC - -[FABRIC: Personalizing Diffusion Models with 
Iterative Feedback](https://huggingface.co/papers/2307.10159) (FABRIC) is by Dimitri von Rütte, Elisabetta Fedele, Jonathan Thomm and Lukas Wolf. - -FABRIC is a training-free approach that conditions the diffusion process on a set of feedback images, applicable to a wide range of popular diffusion models. It is created by researchers and engineers from [ETH Zürich, Switzerland](https://github.com/sd-fabric). The [`FabricPipeline`] can generate photo-realistic images given any text input using Stable Diffusion. - -The abstract from the paper is: - -*In an era where visual content generation is increasingly driven by machine learning, the integration of human feedback into generative models presents significant opportunities for enhancing user experience and output quality. This study explores strategies for incorporating iterative human feedback into the generative process of diffusion-based text-to-image models. We propose FABRIC, a training-free approach applicable to a wide range of popular diffusion models, which exploits the self-attention layer present in the most widely used architectures to condition the diffusion process on a set of feedback images. To ensure a rigorous assessment of our approach, we introduce a comprehensive evaluation methodology, offering a robust mechanism to quantify the performance of generative visual models that integrate human feedback. We show that generation results improve over multiple rounds of iterative feedback through exhaustive analysis, implicitly optimizing arbitrary user preferences. The potential applications of these findings extend to fields such as personalized content creation and customization* - -The original codebase can be found at [sd-fabric/fabric](https://github.com/sd-fabric/fabric), and available checkpoints are [dreamlike-art/dreamlike-photoreal-2.0](https://huggingface.co/dreamlike-art/dreamlike-photoreal-2.0), [runwayml/stable-diffusion-v1-5](https://huggingface.co/runwayml/stable-diffusion-v1-5), and [stabilityai/stable-diffusion-2-1](https://huggingface.co/stabilityai/stable-diffusion-2-1) (may give unexpected results). - -Let's have a look at the images (*512X512*) - -| Without Feedback | With Feedback (1st image) | -|---------------------|---------------------| -| ![Image 1](https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/fabric_wo_feedback.jpg) | ![Feedback Image 1](https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/fabric_w_feedback.png) | - - - -Make sure to check out the Stable Diffusion [Tips](overview#tips) section to learn how to explore the tradeoff between scheduler speed and quality, and how to reuse pipeline components efficiently! 
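The feedback loop itself is short. Below is a minimal sketch of two rounds, assuming the community-pipeline loading pattern (`custom_pipeline="pipeline_fabric"`) and the `liked`/`disliked` keyword arguments shown in the README and example docstring updated in this series; checkpoint, prompt, and step counts are taken from those examples.

```python
import torch
from diffusers import DiffusionPipeline

# Load FABRIC as a community pipeline (file name as used in this series: pipeline_fabric.py).
pipe = DiffusionPipeline.from_pretrained(
    "dreamlike-art/dreamlike-photoreal-2.0",
    torch_dtype=torch.float16,
    custom_pipeline="pipeline_fabric",
).to("cuda")

prompt = "a giant standing in a fantasy landscape best quality"
liked = []     # images that received positive feedback
disliked = []  # images that received negative feedback

# Round 1: plain text-to-image, no feedback yet.
image = pipe(
    prompt=prompt,
    num_inference_steps=20,
    generator=torch.manual_seed(0),
).images[0]

# Round 2: feed the first result back in as positive feedback and generate again.
liked.append(image)
image = pipe(
    prompt=prompt,
    liked=liked,
    disliked=disliked,
    num_inference_steps=20,
).images[0]
```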
- - - -[[autodoc]] FabricPipeline - - all - - __call__ - -## StableDiffusionPipelineOutput - -[[autodoc]] pipelines.stable_diffusion.StableDiffusionPipelineOutput diff --git a/examples/community/README.md b/examples/community/README.md index c3ef7a46ba06..c24ba98f658a 100755 --- a/examples/community/README.md +++ b/examples/community/README.md @@ -2013,6 +2013,8 @@ image = pipe( image.save("black_to_blue.png") ``` +*With enough feedbacks you can create very similar high quality images.* + The original codebase can be found at [sd-fabric/fabric](https://github.com/sd-fabric/fabric), and available checkpoints are [dreamlike-art/dreamlike-photoreal-2.0](https://huggingface.co/dreamlike-art/dreamlike-photoreal-2.0), [runwayml/stable-diffusion-v1-5](https://huggingface.co/runwayml/stable-diffusion-v1-5), and [stabilityai/stable-diffusion-2-1](https://huggingface.co/stabilityai/stable-diffusion-2-1) (may give unexpected results). Let's have a look at the images (*512X512*) diff --git a/examples/community/pipeline_fabric.py b/examples/community/pipeline_fabric.py index 5bdd3ef26f24..456e69cade13 100644 --- a/examples/community/pipeline_fabric.py +++ b/examples/community/pipeline_fabric.py @@ -14,7 +14,6 @@ from typing import List, Optional, Union import torch -from diffusion.pipelines.pipeline_utils import DiffusionPipeline from packaging import version from PIL import Image from transformers import CLIPTextModel, CLIPTokenizer @@ -25,6 +24,7 @@ from diffusers.loaders import LoraLoaderMixin, TextualInversionLoaderMixin from diffusers.models.attention import BasicTransformerBlock from diffusers.models.attention_processor import LoRAAttnProcessor +from diffusers.pipelines.pipeline_utils import DiffusionPipeline from diffusers.pipelines.stable_diffusion import StableDiffusionPipelineOutput from diffusers.schedulers import EulerAncestralDiscreteScheduler, KarrasDiffusionSchedulers from diffusers.utils import ( From aa96ed55a72e69bd39872cd0ab8e0ffaa3fce861 Mon Sep 17 00:00:00 2001 From: shauray8 Date: Sun, 27 Aug 2023 23:28:04 +0530 Subject: [PATCH 98/98] fix --- examples/community/README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples/community/README.md b/examples/community/README.md index c24ba98f658a..4f29383946b1 100755 --- a/examples/community/README.md +++ b/examples/community/README.md @@ -1976,7 +1976,7 @@ from diffusers import Diffusionpipeline # make sure you're logged in with `huggingface-cli login` model_id_or_path = "runwayml/stable-diffusion-v1-5" #can also be used with dreamlike-art/dreamlike-photoreal-2.0 -pipe = FabricPipeline.from_pretrained(model_id_or_path, torch_dtype=torch.float16, custom_pipeline="pipeline_fabric").to("cuda") +pipe = DiffusionPipeline.from_pretrained(model_id_or_path, torch_dtype=torch.float16, custom_pipeline="pipeline_fabric").to("cuda") # let's specify a prompt prompt = "An astronaut riding an elephant"