From 9a70a3181f2b65a5d6e1bcffe3b5014a8a742eab Mon Sep 17 00:00:00 2001 From: Steven Liu Date: Tue, 26 Sep 2023 09:22:14 -0700 Subject: [PATCH 1/7] start --- docs/source/en/using-diffusers/inpaint.md | 198 +++++++++++++--------- 1 file changed, 119 insertions(+), 79 deletions(-) diff --git a/docs/source/en/using-diffusers/inpaint.md b/docs/source/en/using-diffusers/inpaint.md index 7f10e43243a3..bf536c7d0943 100644 --- a/docs/source/en/using-diffusers/inpaint.md +++ b/docs/source/en/using-diffusers/inpaint.md @@ -10,115 +10,155 @@ an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express o specific language governing permissions and limitations under the License. --> -# Text-guided image-inpainting +# Inpainting [[open-in-colab]] -The [`StableDiffusionInpaintPipeline`] allows you to edit specific parts of an image by providing a mask and a text prompt. It uses a version of Stable Diffusion, like [`runwayml/stable-diffusion-inpainting`](https://huggingface.co/runwayml/stable-diffusion-inpainting) specifically trained for inpainting tasks. +Inpainting replaces or edits specified areas of an image. This makes it a useful tool for restoring old images, removing defects and artifacts, or even replacing areas of an image with something entirely new. Inpainting relies on a mask image to determine which regions of an image to fill in; the area to replace is represented by white pixels and the area to keep is represented by black pixels. The mask determines which areas containing the missing pixels to fill in with the prompt. -Get started by loading an instance of the [`StableDiffusionInpaintPipeline`]: +With 🤗 Diffusers, here is how you can quickly start inpainting: -```python -import PIL -import requests -import torch -from io import BytesIO +1. Load an inpainting checkpoint with the [`AutoPipelineForInpainting`] class. This'll automatically detect the appropriate pipeline class to load based on the checkpoint: -from diffusers import StableDiffusionInpaintPipeline +```py +from diffusers import AutoPipelineForInpainting +from diffusers.utils import load_image +import torch -pipeline = StableDiffusionInpaintPipeline.from_pretrained( - "runwayml/stable-diffusion-inpainting", - torch_dtype=torch.float16, - use_safetensors=True, - variant="fp16", -) -pipeline = pipeline.to("cuda") +pipeline = AutoPipelineForInpainting.from_pretrained( + "kandinsky-community/kandinsky-2-2-decoder-inpaint", torch_dtype=torch.float16 +).to("cuda") +pipeline.enable_model_cpu_offload() +pipeline.enable_xformers_memory_efficient_attention() ``` -Download an image and a mask of a dog which you'll eventually replace: + -```python -def download_image(url): - response = requests.get(url) - return PIL.Image.open(BytesIO(response.content)).convert("RGB") +You'll notice throughout the guide, we use [`~DiffusionPipeline.enable_model_cpu_offload`] and [`~DiffusionPipeline.enable_xformers_memory_efficient_attention`], to save memory and increase inference speed. If you're using PyTorch 2.0, then you don't need to call [`~DiffusionPipeline.enable_xformers_memory_efficient_attention`] on your pipeline because it'll already be using PyTorch 2.0's native [scaled-dot product attention](/optimization/torch2.0#scaled-dot-product-attention). + -img_url = "https://raw.githubusercontent.com/CompVis/latent-diffusion/main/data/inpainting_examples/overture-creations-5sI6fQgYIuo.png" -mask_url = "https://raw.githubusercontent.com/CompVis/latent-diffusion/main/data/inpainting_examples/overture-creations-5sI6fQgYIuo_mask.png" +2. 
Load your base image and mask image: -init_image = download_image(img_url).resize((512, 512)) -mask_image = download_image(mask_url).resize((512, 512)) +```py +init_image = load_image("https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/inpaint.png").convert("RGB") +mask_image = load_image("https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/inpaint_mask.png").convert("RGB") ``` -Now you can create a prompt to replace the mask with something else: +3. Create a prompt to inpaint the image with and pass it to the pipeline with the base and mask images: -```python -prompt = "Face of a yellow cat, high resolution, sitting on a park bench" -image = pipeline(prompt=prompt, image=init_image, mask_image=mask_image).images[0] +```py +prompt = "a black cat with glowing eyes, cute, adorable, disney, pixar, highly detailed, 8k" +negative_prompt = "bad anatomy, deformed, ugly, disfigured" +image = pipeline(prompt=prompt, negative_prompt=negative_prompt, image=init_image, mask_image=mask_image).images[0] +image ``` -`image` | `mask_image` | `prompt` | output | -:-------------------------:|:-------------------------:|:-------------------------:|-------------------------:| -drawing | drawing | ***Face of a yellow cat, high resolution, sitting on a park bench*** | drawing | +
+
+ +
+*(figure: base image | generated image)*
+
+
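+
+If you don't already have a mask, you can also sketch one yourself. The following is a minimal sketch using Pillow; the canvas reuses the base image's size, and the rectangle coordinates are arbitrary placeholders you'd adapt to whatever region you want to replace:
+
+```py
+from PIL import Image, ImageDraw
+
+# start from an all-black mask the same size as the base image (black = keep)
+mask = Image.new("L", init_image.size, 0)
+
+# paint the region to replace in white (white = inpaint)
+draw = ImageDraw.Draw(mask)
+draw.rectangle((100, 100, 400, 400), fill=255)
+
+# use it in place of the downloaded mask
+mask_image = mask.convert("RGB")
+```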
+## Popular models - +[Stable Diffusion v1.5](https://huggingface.co/runwayml/stable-diffusion-v1-5), [Stable Diffusion XL (SDXL)](https://huggingface.co/stabilityai/stable-diffusion-xl-base-1.0), and [Kandinsky 2.2](https://huggingface.co/kandinsky-community/kandinsky-2-2-decoder-inpaint) are among the most popular models for inpainting. SDXL typically produces higher resolution images than Stable Diffusion v1.5, and Kandinsky 2.2 is also capable of generating high-quality images thanks to an image prior model that creates better embeddings. -A previous experimental implementation of inpainting used a different, lower-quality process. To ensure backwards compatibility, loading a pretrained pipeline that doesn't contain the new model will still apply the old inpainting method. +### Stable Diffusion v1.5 - +Stable Diffusion v1.5 is a latent diffusion model finetuned on 512x512 images. It is a good starting point for inpainting because it is relatively fast and produces images with good quality. To use this model for inpainting, you'll need to pass a prompt, base image, and mask image to the pipeline: -Check out the Spaces below to try out image inpainting yourself! +```py +from diffusers import AutoPipelineForInpainting +from diffusers.utils import load_image +import torch + +pipeline = AutoPipelineForInpainting.from_pretrained( + "runwayml/stable-diffusion-v1-5", torch_dtype=torch.float16, variant="fp16" +).to("cuda") +pipeline.enable_model_cpu_offload() +pipeline.enable_xformers_memory_efficient_attention() - +# load base and mask image +init_image = load_image("https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/inpaint.png").convert("RGB") +mask_image = load_image("https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/inpaint_mask.png").convert("RGB") + +generator = torch.Generator("cuda").manual_seed(13) +prompt = "concept art digital painting of a fantasy castle, inspired by lord of the rings, deviantart, 8k" +image = pipeline(prompt=prompt, image=init_image, mask_image=mask_image, generator=generator).images[0] +image +``` -## Preserving the Unmasked Area of the Image +### Stable Diffusion XL (SDXL) -Generally speaking, [`StableDiffusionInpaintPipeline`] (and other inpainting pipelines) will change the unmasked part of the image as well. If this behavior is undesirable, you can force the unmasked area to remain the same as follows: +SDXL is a larger and more powerful version of Stable Diffusion v1.5. It employs a two-stage model process (though each model can also be used alone); the base model generates an image, and a refiner model takes that image and further enhances the details and quality of it. Take a look at the [SDXL](sdxl) guide for a more comprehensive guide on how to use SDXL and configure it's parameters. 
-```python -import PIL -import numpy as np +```py +from diffusers import AutoPipelineForInpainting +from diffusers.utils import load_image import torch -from diffusers import StableDiffusionInpaintPipeline +pipeline = AutoPipelineForInpainting.from_pretrained( + "stabilityai/stable-diffusion-xl-base-1.0", torch_dtype=torch.float16, variant="fp16" +).to("cuda") +pipeline.enable_model_cpu_offload() +pipeline.enable_xformers_memory_efficient_attention() + +# load base and mask image +init_image = load_image("https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/inpaint.png").convert("RGB") +mask_image = load_image("https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/inpaint_mask.png").convert("RGB") + +generator = torch.Generator("cuda").manual_seed(13) +prompt = "concept art digital painting of a fantasy castle, inspired by lord of the rings, deviantart, 8k" +image = pipeline(prompt=prompt, image=init_image, mask_image=mask_image, generator=generator).images[0] +image +``` + +### Kandinsky 2.2 + +The Kandinsky model family is similar to SDXL in the sense that it uses two models; the image prior model generates image embeddings, and the diffusion model uses these embeddings to generate higher quality images. You can load the image prior and diffusion model separately, but the easiest way to use Kandinsky 2.2 is to load it into the [`AutoPipelineForInpainting`] class which uses the [`KandinskyV22InpaintCombinedPipeline`] under the hood. + +```py +from diffusers import AutoPipelineForInpainting from diffusers.utils import load_image +import torch + +pipeline = AutoPipelineForInpainting.from_pretrained( + "kandinsky-community/kandinsky-2-2-decoder-inpaint", torch_dtype=torch.float16 +).to("cuda") +pipeline.enable_model_cpu_offload() +pipeline.enable_xformers_memory_efficient_attention() + +# load base and mask image +init_image = load_image("https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/inpaint.png").convert("RGB") +mask_image = load_image("https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/inpaint_mask.png").convert("RGB") -device = "cuda" -pipeline = StableDiffusionInpaintPipeline.from_pretrained( - "runwayml/stable-diffusion-inpainting", - torch_dtype=torch.float16, -) -pipeline = pipeline.to(device) - -img_url = "https://raw.githubusercontent.com/CompVis/latent-diffusion/main/data/inpainting_examples/overture-creations-5sI6fQgYIuo.png" -mask_url = "https://raw.githubusercontent.com/CompVis/latent-diffusion/main/data/inpainting_examples/overture-creations-5sI6fQgYIuo_mask.png" - -init_image = load_image(img_url).resize((512, 512)) -mask_image = load_image(mask_url).resize((512, 512)) - -prompt = "Face of a yellow cat, high resolution, sitting on a park bench" -repainted_image = pipeline(prompt=prompt, image=init_image, mask_image=mask_image).images[0] -repainted_image.save("repainted_image.png") - -# Convert mask to grayscale NumPy array -mask_image_arr = np.array(mask_image.convert("L")) -# Add a channel dimension to the end of the grayscale mask -mask_image_arr = mask_image_arr[:, :, None] -# Binarize the mask: 1s correspond to the pixels which are repainted -mask_image_arr = mask_image_arr.astype(np.float32) / 255.0 -mask_image_arr[mask_image_arr < 0.5] = 0 -mask_image_arr[mask_image_arr >= 0.5] = 1 - -# Take the masked pixels from the repainted image and the unmasked pixels from the initial image -unmasked_unchanged_image_arr = (1 - 
mask_image_arr) * init_image + mask_image_arr * repainted_image -unmasked_unchanged_image = PIL.Image.fromarray(unmasked_unchanged_image_arr.round().astype("uint8")) -unmasked_unchanged_image.save("force_unmasked_unchanged.png") +generator = torch.Generator("cuda").manual_seed(13) +prompt = "concept art digital painting of a fantasy castle, inspired by lord of the rings, deviantart, 8k" +image = pipeline(prompt=prompt, image=init_image, mask_image=mask_image, generator=generator).images[0] +image ``` -Forcing the unmasked portion of the image to remain the same might result in some weird transitions between the unmasked and masked areas, since the model will typically change the masked and unmasked areas to make the transition more natural. +
+
+ +
+*(figure: base image | Stable Diffusion v1.5 | Stable Diffusion XL | Kandinsky 2.2)*
+
+
From 93c0aafeb6b283a2102c4595c2dae592f216a686 Mon Sep 17 00:00:00 2001 From: Steven Liu Date: Wed, 27 Sep 2023 13:35:09 -0700 Subject: [PATCH 2/7] finish draft --- docs/source/en/using-diffusers/img2img.md | 8 +- docs/source/en/using-diffusers/inpaint.md | 387 +++++++++++++++++++++- 2 files changed, 377 insertions(+), 18 deletions(-) diff --git a/docs/source/en/using-diffusers/img2img.md b/docs/source/en/using-diffusers/img2img.md index 82aa328d2b9c..c0bf4dc52153 100644 --- a/docs/source/en/using-diffusers/img2img.md +++ b/docs/source/en/using-diffusers/img2img.md @@ -33,7 +33,7 @@ pipeline.enable_xformers_memory_efficient_attention() -You'll notice throughout the guide, we use [`~DiffusionPipeline.enable_model_cpu_offload`] and [`~DiffusionPipeline.enable_xformers_memory_efficient_attention`], to save memory and increase inference speed. If you're using PyTorch 2.0, then you don't need to call [`~DiffusionPipeline.enable_xformers_memory_efficient_attention`] on your pipeline because it'll already be using PyTorch 2.0's native [scaled-dot product attention](/optimization/torch2.0#scaled-dot-product-attention). +You'll notice throughout the guide, we use [`~DiffusionPipeline.enable_model_cpu_offload`] and [`~DiffusionPipeline.enable_xformers_memory_efficient_attention`], to save memory and increase inference speed. If you're using PyTorch 2.0, then you don't need to call [`~DiffusionPipeline.enable_xformers_memory_efficient_attention`] on your pipeline because it'll already be using PyTorch 2.0's native [scaled-dot product attention](../optimization/torch2.0#scaled-dot-product-attention). @@ -590,17 +590,17 @@ image ## Optimize -Running diffusion models is computationally expensive and intensive, but with a few optimization tricks, it is entirely possible to run them on consumer and free-tier GPUs. For example, you can use a more memory-efficient form of attention such as PyTorch 2.0's [scaled-dot product attention](optimization/torch2.0#scaled-dot-product-attention) or [xFormers](optimization/xformers) (you can use one or the other, but there's no need to use both). You can also offload the model to the GPU while the other pipeline components wait on the CPU. +Running diffusion models is computationally expensive and intensive, but with a few optimization tricks, it is entirely possible to run them on consumer and free-tier GPUs. For example, you can use a more memory-efficient form of attention such as PyTorch 2.0's [scaled-dot product attention](../optimization/torch2.0#scaled-dot-product-attention) or [xFormers](../optimization/xformers) (you can use one or the other, but there's no need to use both). You can also offload the model to the GPU while the other pipeline components wait on the CPU. ```diff + pipeline.enable_model_cpu_offload() + pipeline.enable_xformers_memory_efficient_attention() ``` -With [`torch.compile`](optimization/torch2.0#torch.compile), you can boost your inference speed even more by wrapping your UNet with it: +With [`torch.compile`](../optimization/torch2.0#torch.compile), you can boost your inference speed even more by wrapping your UNet with it: ```py pipe.unet = torch.compile(pipe.unet, mode="reduce-overhead", fullgraph=True) ``` -To learn more, take a look at the [Reduce memory usage](optimization/memory) and [Torch 2.0](optimization/torch2.0) guides. +To learn more, take a look at the [Reduce memory usage](../optimization/memory) and [Torch 2.0](../optimization/torch2.0) guides. 
diff --git a/docs/source/en/using-diffusers/inpaint.md b/docs/source/en/using-diffusers/inpaint.md index bf536c7d0943..7f390bd20d6b 100644 --- a/docs/source/en/using-diffusers/inpaint.md +++ b/docs/source/en/using-diffusers/inpaint.md @@ -14,7 +14,7 @@ specific language governing permissions and limitations under the License. [[open-in-colab]] -Inpainting replaces or edits specified areas of an image. This makes it a useful tool for restoring old images, removing defects and artifacts, or even replacing areas of an image with something entirely new. Inpainting relies on a mask image to determine which regions of an image to fill in; the area to replace is represented by white pixels and the area to keep is represented by black pixels. The mask determines which areas containing the missing pixels to fill in with the prompt. +Inpainting replaces or edits specified areas of an image. This makes it a useful tool for restoring old images, removing defects and artifacts, or even replacing areas of an image with something entirely new. Inpainting relies on a mask to determine which regions of an image to fill in; the area to replace is represented by white pixels and the area to keep is represented by black pixels. Areas containing the missing pixels are filled in by the prompt. With 🤗 Diffusers, here is how you can quickly start inpainting: @@ -34,11 +34,11 @@ pipeline.enable_xformers_memory_efficient_attention() -You'll notice throughout the guide, we use [`~DiffusionPipeline.enable_model_cpu_offload`] and [`~DiffusionPipeline.enable_xformers_memory_efficient_attention`], to save memory and increase inference speed. If you're using PyTorch 2.0, then you don't need to call [`~DiffusionPipeline.enable_xformers_memory_efficient_attention`] on your pipeline because it'll already be using PyTorch 2.0's native [scaled-dot product attention](/optimization/torch2.0#scaled-dot-product-attention). +You'll notice throughout the guide, we use [`~DiffusionPipeline.enable_model_cpu_offload`] and [`~DiffusionPipeline.enable_xformers_memory_efficient_attention`], to save memory and increase inference speed. If you're using PyTorch 2.0, then you don't need to call [`~DiffusionPipeline.enable_xformers_memory_efficient_attention`] on your pipeline because it'll already be using PyTorch 2.0's native [scaled-dot product attention](../optimization/torch2.0#scaled-dot-product-attention). -2. Load your base image and mask image: +2. Load the base and mask images: ```py init_image = load_image("https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/inpaint.png").convert("RGB") @@ -71,7 +71,7 @@ image ### Stable Diffusion v1.5 -Stable Diffusion v1.5 is a latent diffusion model finetuned on 512x512 images. It is a good starting point for inpainting because it is relatively fast and produces images with good quality. To use this model for inpainting, you'll need to pass a prompt, base image, and mask image to the pipeline: +Stable Diffusion v1.5 is a latent diffusion model finetuned on 512x512 images. It is a good starting point for inpainting because it is relatively fast and generates good quality images. 
To use this model for inpainting, you'll need to pass a prompt, base and mask image to the pipeline: ```py from diffusers import AutoPipelineForInpainting @@ -79,7 +79,7 @@ from diffusers.utils import load_image import torch pipeline = AutoPipelineForInpainting.from_pretrained( - "runwayml/stable-diffusion-v1-5", torch_dtype=torch.float16, variant="fp16" + "runwayml/stable-diffusion-inpainting", torch_dtype=torch.float16, variant="fp16" ).to("cuda") pipeline.enable_model_cpu_offload() pipeline.enable_xformers_memory_efficient_attention() @@ -88,10 +88,9 @@ pipeline.enable_xformers_memory_efficient_attention() init_image = load_image("https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/inpaint.png").convert("RGB") mask_image = load_image("https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/inpaint_mask.png").convert("RGB") -generator = torch.Generator("cuda").manual_seed(13) -prompt = "concept art digital painting of a fantasy castle, inspired by lord of the rings, deviantart, 8k" +generator = torch.Generator("cuda").manual_seed(92) +prompt = "concept art digital painting of an elven castle, inspired by lord of the rings, highly detailed, 8k" image = pipeline(prompt=prompt, image=init_image, mask_image=mask_image, generator=generator).images[0] -image ``` ### Stable Diffusion XL (SDXL) @@ -113,10 +112,9 @@ pipeline.enable_xformers_memory_efficient_attention() init_image = load_image("https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/inpaint.png").convert("RGB") mask_image = load_image("https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/inpaint_mask.png").convert("RGB") -generator = torch.Generator("cuda").manual_seed(13) -prompt = "concept art digital painting of a fantasy castle, inspired by lord of the rings, deviantart, 8k" +generator = torch.Generator("cuda").manual_seed(92) +prompt = "concept art digital painting of an elven castle, inspired by lord of the rings, highly detailed, 8k" image = pipeline(prompt=prompt, image=init_image, mask_image=mask_image, generator=generator).images[0] -image ``` ### Kandinsky 2.2 @@ -138,10 +136,9 @@ pipeline.enable_xformers_memory_efficient_attention() init_image = load_image("https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/inpaint.png").convert("RGB") mask_image = load_image("https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/inpaint_mask.png").convert("RGB") -generator = torch.Generator("cuda").manual_seed(13) -prompt = "concept art digital painting of a fantasy castle, inspired by lord of the rings, deviantart, 8k" +generator = torch.Generator("cuda").manual_seed(92) +prompt = "concept art digital painting of an elven castle, inspired by lord of the rings, highly detailed, 8k" image = pipeline(prompt=prompt, image=init_image, mask_image=mask_image, generator=generator).images[0] -image ```
@@ -162,3 +159,365 @@ image
Kandinsky 2.2
+
+## Configure pipeline parameters
+
+Image generation features - like quality and "creativity" - are dependent on pipeline parameters. Knowing what these parameters do is important for getting the results you want. Let's take a look at the most important parameters and see how changing them affects the generated image.
+
+### Strength
+
+`strength` is a measure of how much noise is added to the base image which means it'll affect how similar the generated image is to it.
+
+* 📈 a high `strength` value means more noise is added to an image and the denoising process takes longer, but you'll get higher quality images that are more different from the base image
+* 📉 a low `strength` value means less noise is added to an image and the denoising process is faster, but the image quality may not be as great and the generated image resembles the base image more
+
+```py
+import torch
+from diffusers import AutoPipelineForInpainting
+from diffusers.utils import load_image
+
+pipeline = AutoPipelineForInpainting.from_pretrained(
+    "runwayml/stable-diffusion-inpainting", torch_dtype=torch.float16, variant="fp16"
+).to("cuda")
+pipeline.enable_model_cpu_offload()
+pipeline.enable_xformers_memory_efficient_attention()
+
+# load base and mask image
+init_image = load_image("https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/inpaint.png").convert("RGB")
+mask_image = load_image("https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/inpaint_mask.png").convert("RGB")
+
+prompt = "concept art digital painting of an elven castle, inspired by lord of the rings, highly detailed, 8k"
+image = pipeline(prompt=prompt, image=init_image, mask_image=mask_image, strength=0.6).images[0]
+```
+
+
+ +
+*(figure: strength = 0.6 | strength = 0.8 | strength = 1.0)*
+
+
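+
+To reproduce a side-by-side comparison like the one above, you can sweep `strength` in a loop. This is a sketch that reuses the `pipeline`, images, and `prompt` from the snippet above; note that the pipeline only runs roughly `int(num_inference_steps * strength)` denoising steps, so higher values also take longer:
+
+```py
+# generate one image per strength value, keeping everything else fixed
+images = []
+for strength in [0.6, 0.8, 1.0]:
+    image = pipeline(prompt=prompt, image=init_image, mask_image=mask_image, strength=strength).images[0]
+    images.append(image)
+```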
+
+### Guidance scale
+
+`guidance_scale` affects how aligned the text prompt and generated image are.
+
+* 📈 a high `guidance_scale` value means the prompt and generated image are closely aligned so the output is a stricter interpretation of the prompt
+* 📉 a low `guidance_scale` value means the prompt and generated image are more loosely aligned so the output may be more creative
+
+You can use `strength` and `guidance_scale` together for more granular control over how expressive the model is. For example, using high `strength` and `guidance_scale` values gives the model the most creative freedom.
+
+```py
+import torch
+from diffusers import AutoPipelineForInpainting
+from diffusers.utils import load_image
+
+pipeline = AutoPipelineForInpainting.from_pretrained(
+    "runwayml/stable-diffusion-inpainting", torch_dtype=torch.float16, variant="fp16"
+).to("cuda")
+pipeline.enable_model_cpu_offload()
+pipeline.enable_xformers_memory_efficient_attention()
+
+# load base and mask image
+init_image = load_image("https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/inpaint.png").convert("RGB")
+mask_image = load_image("https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/inpaint_mask.png").convert("RGB")
+
+prompt = "concept art digital painting of an elven castle, inspired by lord of the rings, highly detailed, 8k"
+image = pipeline(prompt=prompt, image=init_image, mask_image=mask_image, guidance_scale=2.5).images[0]
+```
+
+
+ +
+*(figure: guidance_scale = 2.5 | guidance_scale = 7.5 | guidance_scale = 12.5)*
+
+
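+
+As a sketch of how the two parameters compose (the values below are arbitrary examples, reusing the objects loaded above), a high `strength` paired with a high `guidance_scale` gives the most creative result, while low values for both produce a conservative edit:
+
+```py
+# most creative freedom: heavily repainted and tightly prompt-driven
+creative = pipeline(prompt=prompt, image=init_image, mask_image=mask_image, strength=1.0, guidance_scale=12.5).images[0]
+
+# conservative edit: stays close to the base image and interprets the prompt loosely
+conservative = pipeline(prompt=prompt, image=init_image, mask_image=mask_image, strength=0.6, guidance_scale=2.5).images[0]
+```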
+ +### Negative prompt + +A negative prompt performs the opposite function of a prompt; it guides the model away from generating certain things in an image. This is useful for quickly improving image quality and preventing the model from generating things you don't want. + +```py +import torch +from diffusers import AutoPipelineForInpainting +from diffusers.utils import load_image + +pipeline = AutoPipelineForInpainting.from_pretrained( + "runwayml/stable-diffusion-inpainting", torch_dtype=torch.float16, variant="fp16" +).to("cuda") +pipeline.enable_model_cpu_offload() +pipeline.enable_xformers_memory_efficient_attention() + +# load base and mask image +init_image = load_image("https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/inpaint.png").convert("RGB") +mask_image = load_image("https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/inpaint_mask.png").convert("RGB") + +prompt = "concept art digital painting of an elven castle, inspired by lord of the rings, highly detailed, 8k" +negative_prompt = "bad architecture, unstable, poor details, blurry" +image = pipeline(prompt=prompt, negative_prompt=negative_prompt, image=init_image, mask_image=mask_image).images[0] +image +``` + +
+ +
+ +## Chained inpainting pipelines + +[`AutoPipelineForInpainting`] can be chained with other 🤗 Diffusers pipelines to edit their outputs. + +### Text-to-image-to-inpaint + +Chaining a text-to-image and inpainting pipeline allows you to inpaint and edit the generated image without having to generate an entirely new one. + +Start by generating an image with the text-to-image pipeline: + +```py +import torch +from diffusers import AutoPipelineForText2Image, AutoPipelineForInpainting +from diffusers.utils import load_image + +pipeline = AutoPipelineForText2Image.from_pretrained( + "runwayml/stable-diffusion-v1-5", torch_dtype=torch.float16, variant="fp16", use_safetensors=True +).to("cuda") +pipeline.enable_model_cpu_offload() +pipeline.enable_xformers_memory_efficient_attention() + +image = pipeline("concept art digital painting of an elven castle, inspired by lord of the rings, highly detailed, 8k").images[0] +``` + +Load the mask image of the output from above: + +```py +mask_image = load_image("https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/inpaint_text-chain-mask.png").convert("RGB") +``` + +Let's replace the masked area with a waterfall: + +```py +pipeline = AutoPipelineForInpainting.from_pretrained( + "kandinsky-community/kandinsky-2-2-decoder-inpaint", torch_dtype=torch.float16, variant="fp16" +).to("cuda") +pipeline.enable_model_cpu_offload() +pipeline.enable_xformers_memory_efficient_attention() + +prompt = "digital painting of a fantasy waterfall, cloudy" +image = pipeline(prompt=prompt, image=image, mask_image=mask_image).images[0] +image +``` + +
+
+ +
+*(figure: text-to-image | inpaint)*
+
+
+
+
+### Image-to-image-to-inpaint
+
+You can also chain an inpainting pipeline before or after an image-to-image pipeline. Depending on the position (first or last) of the inpainting pipeline in the chain, you can use it to make sure the image is ready for the image-to-image pipeline or you can use it to fix and modify the output from the image-to-image pipeline. This example uses the inpainting pipeline last.
+
+Begin by generating an image with the image-to-image pipeline:
+
+```py
+import torch
+from diffusers import AutoPipelineForImage2Image, AutoPipelineForInpainting
+from diffusers.utils import load_image
+
+pipeline = AutoPipelineForImage2Image.from_pretrained(
+    "runwayml/stable-diffusion-v1-5", torch_dtype=torch.float16, variant="fp16"
+).to("cuda")
+pipeline.enable_model_cpu_offload()
+pipeline.enable_xformers_memory_efficient_attention()
+
+# prepare image
+init_image = load_image("https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/inpaint.png").convert("RGB")
+prompt = "concept art digital painting of an elven castle, inspired by lord of the rings, highly detailed, 8k"
+
+# pass prompt and image to pipeline
+generator = torch.Generator("cuda").manual_seed(41)
+image = pipeline(prompt, image=init_image, output_type="latent").images[0]
+```
+
+
+
+It is important to specify `output_type="latent"` in the pipeline to keep all the outputs in latent space to avoid an unnecessary decode-encode step. This only works if the chained pipelines are using the same VAE. For example, in the [Text-to-image-to-inpaint](#text-to-image-to-inpaint) section, Kandinsky 2.2 uses a different VAE class than the Stable Diffusion model so it won't work. But if you use Stable Diffusion v1.5 for both pipelines, then you can keep everything in latent space because they both use [`AutoencoderKL`].
+
+
+
+Let's inpaint the tree on the right side of the image with some mountains:
+
+```py
+pipeline = AutoPipelineForInpainting.from_pretrained(
+    "runwayml/stable-diffusion-inpainting", torch_dtype=torch.float16, variant="fp16"
+).to("cuda")
+pipeline.enable_model_cpu_offload()
+pipeline.enable_xformers_memory_efficient_attention()
+
+mask_image = load_image("https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/image-to-inpaint-chain-mask.png").convert("RGB")
+prompt = "digital painting of clouds above mountains, snowy, fantasy, soft"
+
+generator = torch.Generator("cuda").manual_seed(48)
+image = pipeline(prompt=prompt, image=image, mask_image=mask_image).images[0]
+```
+
+
+ +
+*(figure: initial image | image-to-image | inpaint)*
+
+
+
+Image-to-image and inpainting are actually very similar tasks. Image-to-image takes an existing image and generates a new one that resembles it. Inpainting also takes an existing image, but it only transforms the image region defined by the mask and the rest of the image is unchanged. You can think of inpainting as a more precise tool for making changes and image-to-image has a broader scope for making more sweeping changes.
+
+## Control image generation
+
+Getting an image to look exactly the way you want is challenging because the denoising process is random. While you can control certain aspects of generation by configuring parameters like `negative_prompt`, there are better and more efficient methods for controlling image generation.
+
+### Prompt weighting
+
+Prompt weighting provides a quantifiable way to scale the representation of concepts in a prompt. You can use it to increase or decrease the magnitude of the text embedding vector for each concept in the prompt, which subsequently affects how much each concept is generated. The [Compel](https://github.com/damian0815/compel) library offers an intuitive syntax for scaling the prompt weights and generating the embeddings. Learn how to create the embeddings in the [Prompt weighting](../using-diffusers/weighted_prompts) guide.
+
+Once you've generated the embeddings, pass them to the `prompt_embeds` (and `negative_prompt_embeds` if you're using a negative prompt) parameter in the [`AutoPipelineForInpainting`]. The embeddings replaces the `prompt` parameter:
+
+```py
+from diffusers import AutoPipelineForInpainting
+import torch
+
+pipeline = AutoPipelineForInpainting.from_pretrained(
+    "runwayml/stable-diffusion-inpainting", torch_dtype=torch.float16,
+).to("cuda")
+pipeline.enable_model_cpu_offload()
+pipeline.enable_xformers_memory_efficient_attention()
+
+image = pipeline(prompt_embeds=prompt_embeds, # generated from Compel
+    negative_prompt_embeds=negative_prompt_embeds, # generated from Compel
+    image=init_image,
+    mask_image=mask_image
+).images[0]
+```
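+
+The snippet above assumes `prompt_embeds` and `negative_prompt_embeds` already exist. Here is a rough sketch of how they could be generated first; the `++` weighting syntax is Compel's, but the prompts and weights are arbitrary examples:
+
+```py
+from compel import Compel
+
+# build the embeddings with the pipeline's own tokenizer and text encoder
+compel = Compel(tokenizer=pipeline.tokenizer, text_encoder=pipeline.text_encoder)
+
+# "++" upweights the preceding concept
+prompt_embeds = compel.build_conditioning_tensor("concept art digital painting of an elven castle++, inspired by lord of the rings")
+negative_prompt_embeds = compel.build_conditioning_tensor("bad architecture, unstable, poor details, blurry")
+```
+
+### ControlNet
+
+ControlNet models provide an even more flexible and accurate way to control how an image is generated, and it is used with other diffusion models like Stable Diffusion. The ControlNet accepts an additional conditioning image input to guide the diffusion model to preserve the features in it. 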
+ +For example, let's condition an image with a ControlNet pretrained on inpaint images: + +```py +from diffusers import ControlNetModel, StableDiffusionControlNetInpaintPipeline +from diffusers.utils import load_image +import torch +import numpy as np + +controlnet = ControlNetModel.from_pretrained("lllyasviel/control_v11p_sd15_inpaint", torch_dtype=torch.float16, variant="fp16") + +pipeline = StableDiffusionControlNetInpaintPipeline.from_pretrained( + "runwayml/stable-diffusion-inpainting", controlnet=controlnet, torch_dtype=torch.float16, variant="fp16" +).to("cuda") +pipeline.enable_model_cpu_offload() +pipeline.enable_xformers_memory_efficient_attention() + +# load base and mask image +init_image = load_image("https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/inpaint.png").convert("RGB") +mask_image = load_image("https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/inpaint_mask.png").convert("RGB") + +# prepare control image +def make_inpaint_condition(init_image, mask_image): + init_image = np.array(init_image.convert("RGB")).astype(np.float32) / 255.0 + mask_image = np.array(mask_image.convert("L")).astype(np.float32) / 255.0 + + assert init_image.shape[0:1] == mask_image.shape[0:1], "image and image_mask must have the same image size" + init_image[mask_image > 0.5] = -1.0 # set as masked pixel + init_image = np.expand_dims(init_image, 0).transpose(0, 3, 1, 2) + init_image = torch.from_numpy(init_image) + return init_image +control_image = make_inpaint_condition(init_image, mask_image) +``` + +Now generate an image from the base, mask and control images. You'll notice features of the base image are strongly preserved in the generated image. + +```py +prompt = "concept art digital painting of an elven castle, inspired by lord of the rings, highly detailed, 8k" +image = pipeline(prompt=prompt, image=init_image, mask_image=mask_image, control_image=control_image).images[0] +image +``` + +You can take this a step further and chain it with an image-to-image pipeline to apply a new style: + +```py +from diffusers import AutoPipelineForImage2Image + +pipeline = AutoPipelineForImage2Image.from_pretrained( + "nitrosocke/elden-ring-diffusion", torch_dtype=torch.float16, +).to("cuda") +pipeline.enable_model_cpu_offload() +pipeline.enable_xformers_memory_efficient_attention() + +prompt = "elden ring style castle" # include the token "elden ring style" in the prompt +negative_prompt = "bad architecture, deformed, disfigured, poor details" + +image = pipeline(prompt, negative_prompt=negative_prompt, image=image).images[0] +image +``` + +
+
+ +
+*(figure: initial image | ControlNet inpaint | image-to-image)*
+
+
+
+## Optimize
+
+It can be difficult and slow to run diffusion models if you're resource constrained, but it doesn't have to be with a few optimization tricks. One of the biggest (and easiest) optimizations you can enable is switching to a more memory-efficient attention. If you're using PyTorch 2.0, [scaled-dot product attention](../optimization/torch2.0#scaled-dot-product-attention) is automatically enabled and you don't need to do anything else. For non-PyTorch 2.0 users, you can install and use [xFormers](../optimization/xformers)'s implementation of memory-efficient attention. Both options will reduce memory usage and speed-up inference.
+
+You can also offload the model to the GPU to save even more memory:
+
+```diff
++ pipeline.enable_model_cpu_offload()
++ pipeline.enable_xformers_memory_efficient_attention()
+```
+
+Speed-up your inference code even more with [`torch.compile`](../optimization/torch2.0#torch.compile). You should wrap `torch.compile` around the most intensive component in the pipeline which is typically the UNet:
+
+```py
+pipe.unet = torch.compile(pipe.unet, mode="reduce-overhead", fullgraph=True)
+```
+
+Learn more by reading the [Reduce memory usage](../optimization/memory) and [Torch 2.0](../optimization/torch2.0) guides.
\ No newline at end of file

From b2c4e4d2ff3f91d63ead0c72423f9c631e6b5f66 Mon Sep 17 00:00:00 2001
From: Steven Liu
Date: Wed, 27 Sep 2023 13:41:06 -0700
Subject: [PATCH 3/7] add section

---
 docs/source/en/using-diffusers/inpaint.md | 44 +++++++++++++++++++++++
 1 file changed, 44 insertions(+)

diff --git a/docs/source/en/using-diffusers/inpaint.md b/docs/source/en/using-diffusers/inpaint.md
index 7f390bd20d6b..6386a1050f61 100644
--- a/docs/source/en/using-diffusers/inpaint.md
+++ b/docs/source/en/using-diffusers/inpaint.md
@@ -277,6 +277,50 @@ image
 
 
 
+## Preserve unmasked areas
+
+The [`StableDiffusionInpaintPipeline`] (and other inpainting pipelines) generally also changes the unmasked part of an image to create a more natural transition between the masked and unmasked region. If this behavior is undesirable, you can force the unmasked area to remain the same. However, forcing the unmasked portion of the image to remain the same might result in some weird transitions between the unmasked and masked areas. 
+ +```py +import PIL +import numpy as np +import torch + +from diffusers import AutoPipelineForInpainting +from diffusers.utils import load_image + +device = "cuda" +pipeline = AutoPipelineForInpainting.from_pretrained( + "runwayml/stable-diffusion-inpainting", + torch_dtype=torch.float16, +) +pipeline = pipeline.to(device) + +img_url = "https://raw.githubusercontent.com/CompVis/latent-diffusion/main/data/inpainting_examples/overture-creations-5sI6fQgYIuo.png" +mask_url = "https://raw.githubusercontent.com/CompVis/latent-diffusion/main/data/inpainting_examples/overture-creations-5sI6fQgYIuo_mask.png" + +init_image = load_image(img_url).resize((512, 512)) +mask_image = load_image(mask_url).resize((512, 512)) + +prompt = "Face of a yellow cat, high resolution, sitting on a park bench" +repainted_image = pipeline(prompt=prompt, image=init_image, mask_image=mask_image).images[0] +repainted_image.save("repainted_image.png") + +# Convert mask to grayscale NumPy array +mask_image_arr = np.array(mask_image.convert("L")) +# Add a channel dimension to the end of the grayscale mask +mask_image_arr = mask_image_arr[:, :, None] +# Binarize the mask: 1s correspond to the pixels which are repainted +mask_image_arr = mask_image_arr.astype(np.float32) / 255.0 +mask_image_arr[mask_image_arr < 0.5] = 0 +mask_image_arr[mask_image_arr >= 0.5] = 1 + +# Take the masked pixels from the repainted image and the unmasked pixels from the initial image +unmasked_unchanged_image_arr = (1 - mask_image_arr) * init_image + mask_image_arr * repainted_image +unmasked_unchanged_image = PIL.Image.fromarray(unmasked_unchanged_image_arr.round().astype("uint8")) +unmasked_unchanged_image.save("force_unmasked_unchanged.png") +``` + ## Chained inpainting pipelines [`AutoPipelineForInpainting`] can be chained with other 🤗 Diffusers pipelines to edit their outputs. From 69c93e77f08f2bbf201bf128c5c8b4af279713ab Mon Sep 17 00:00:00 2001 From: Steven Liu Date: Thu, 28 Sep 2023 12:02:54 -0700 Subject: [PATCH 4/7] edits --- docs/source/en/using-diffusers/inpaint.md | 73 +++++++++++------------ 1 file changed, 36 insertions(+), 37 deletions(-) diff --git a/docs/source/en/using-diffusers/inpaint.md b/docs/source/en/using-diffusers/inpaint.md index 6386a1050f61..0e0f7c1432fc 100644 --- a/docs/source/en/using-diffusers/inpaint.md +++ b/docs/source/en/using-diffusers/inpaint.md @@ -14,16 +14,16 @@ specific language governing permissions and limitations under the License. [[open-in-colab]] -Inpainting replaces or edits specified areas of an image. This makes it a useful tool for restoring old images, removing defects and artifacts, or even replacing areas of an image with something entirely new. Inpainting relies on a mask to determine which regions of an image to fill in; the area to replace is represented by white pixels and the area to keep is represented by black pixels. Areas containing the missing pixels are filled in by the prompt. +Inpainting replaces or edits specific areas of an image. This makes it a useful tool for image restoration like removing defects and artifacts, or even replacing an image area with something entirely new. Inpainting relies on a mask to determine which regions of an image to fill in; the area to inpaint is represented by white pixels and the area to keep is represented by black pixels. The white pixels are filled in by the prompt. With 🤗 Diffusers, here is how you can quickly start inpainting: 1. Load an inpainting checkpoint with the [`AutoPipelineForInpainting`] class. 
This'll automatically detect the appropriate pipeline class to load based on the checkpoint: ```py +import torch from diffusers import AutoPipelineForInpainting from diffusers.utils import load_image -import torch pipeline = AutoPipelineForInpainting.from_pretrained( "kandinsky-community/kandinsky-2-2-decoder-inpaint", torch_dtype=torch.float16 @@ -34,7 +34,7 @@ pipeline.enable_xformers_memory_efficient_attention() -You'll notice throughout the guide, we use [`~DiffusionPipeline.enable_model_cpu_offload`] and [`~DiffusionPipeline.enable_xformers_memory_efficient_attention`], to save memory and increase inference speed. If you're using PyTorch 2.0, then you don't need to call [`~DiffusionPipeline.enable_xformers_memory_efficient_attention`] on your pipeline because it'll already be using PyTorch 2.0's native [scaled-dot product attention](../optimization/torch2.0#scaled-dot-product-attention). +You'll notice throughout the guide, we use [`~DiffusionPipeline.enable_model_cpu_offload`] and [`~DiffusionPipeline.enable_xformers_memory_efficient_attention`], to save memory and increase inference speed. If you're using PyTorch 2.0, it's not necessary to call [`~DiffusionPipeline.enable_xformers_memory_efficient_attention`] on your pipeline because it'll already be using PyTorch 2.0's native [scaled-dot product attention](../optimization/torch2.0#scaled-dot-product-attention). @@ -51,7 +51,6 @@ mask_image = load_image("https://huggingface.co/datasets/huggingface/documentati prompt = "a black cat with glowing eyes, cute, adorable, disney, pixar, highly detailed, 8k" negative_prompt = "bad anatomy, deformed, ugly, disfigured" image = pipeline(prompt=prompt, negative_prompt=negative_prompt, image=init_image, mask_image=mask_image).images[0] -image ```
@@ -67,16 +66,16 @@ image ## Popular models -[Stable Diffusion v1.5](https://huggingface.co/runwayml/stable-diffusion-v1-5), [Stable Diffusion XL (SDXL)](https://huggingface.co/stabilityai/stable-diffusion-xl-base-1.0), and [Kandinsky 2.2](https://huggingface.co/kandinsky-community/kandinsky-2-2-decoder-inpaint) are among the most popular models for inpainting. SDXL typically produces higher resolution images than Stable Diffusion v1.5, and Kandinsky 2.2 is also capable of generating high-quality images thanks to an image prior model that creates better embeddings. +[Stable Diffusion v1.5](https://huggingface.co/runwayml/stable-diffusion-v1-5), [Stable Diffusion XL (SDXL)](https://huggingface.co/stabilityai/stable-diffusion-xl-base-1.0), and [Kandinsky 2.2](https://huggingface.co/kandinsky-community/kandinsky-2-2-decoder-inpaint) are among the most popular models for inpainting. SDXL typically produces higher resolution images than Stable Diffusion v1.5, and Kandinsky 2.2 is also capable of generating high-quality images thanks to an image prior model. ### Stable Diffusion v1.5 Stable Diffusion v1.5 is a latent diffusion model finetuned on 512x512 images. It is a good starting point for inpainting because it is relatively fast and generates good quality images. To use this model for inpainting, you'll need to pass a prompt, base and mask image to the pipeline: ```py +import torch from diffusers import AutoPipelineForInpainting from diffusers.utils import load_image -import torch pipeline = AutoPipelineForInpainting.from_pretrained( "runwayml/stable-diffusion-inpainting", torch_dtype=torch.float16, variant="fp16" @@ -95,12 +94,12 @@ image = pipeline(prompt=prompt, image=init_image, mask_image=mask_image, generat ### Stable Diffusion XL (SDXL) -SDXL is a larger and more powerful version of Stable Diffusion v1.5. It employs a two-stage model process (though each model can also be used alone); the base model generates an image, and a refiner model takes that image and further enhances the details and quality of it. Take a look at the [SDXL](sdxl) guide for a more comprehensive guide on how to use SDXL and configure it's parameters. +SDXL is a larger and more powerful version of Stable Diffusion v1.5. This model can follow a two-stage model process (though each model can also be used alone); the base model generates an image, and a refiner model takes that image and further enhances its details and quality. Take a look at the [SDXL](sdxl) guide for a more comprehensive guide on how to use SDXL and configure it's parameters. ```py +import torch from diffusers import AutoPipelineForInpainting from diffusers.utils import load_image -import torch pipeline = AutoPipelineForInpainting.from_pretrained( "stabilityai/stable-diffusion-xl-base-1.0", torch_dtype=torch.float16, variant="fp16" @@ -119,12 +118,12 @@ image = pipeline(prompt=prompt, image=init_image, mask_image=mask_image, generat ### Kandinsky 2.2 -The Kandinsky model family is similar to SDXL in the sense that it uses two models; the image prior model generates image embeddings, and the diffusion model uses these embeddings to generate higher quality images. You can load the image prior and diffusion model separately, but the easiest way to use Kandinsky 2.2 is to load it into the [`AutoPipelineForInpainting`] class which uses the [`KandinskyV22InpaintCombinedPipeline`] under the hood. 
+The Kandinsky model family is similar to SDXL in the sense that it uses two models; the image prior model generates image embeddings, and the diffusion model uses these embeddings to generate images. You can load the image prior and diffusion model separately, but the easiest way to use Kandinsky 2.2 is to load it into the [`AutoPipelineForInpainting`] class which uses the [`KandinskyV22InpaintCombinedPipeline`] under the hood.
 
 ```py
+import torch
 from diffusers import AutoPipelineForInpainting
 from diffusers.utils import load_image
-import torch
 
 pipeline = AutoPipelineForInpainting.from_pretrained(
     "kandinsky-community/kandinsky-2-2-decoder-inpaint", torch_dtype=torch.float16
@@ -162,11 +161,11 @@ image = pipeline(prompt=prompt, image=init_image, mask_image=mask_image, generat
 
 ## Configure pipeline parameters
 
-Image generation features - like quality and "creativity" - are dependent on pipeline parameters. Knowing what these parameters do is important for getting the results you want. Let's take a look at the most important parameters and see how changing them affects the generated image.
+Image features - like quality and "creativity" - are dependent on pipeline parameters. Knowing what these parameters do is important for getting the results you want. Let's take a look at the most important parameters and see how changing them affects the output.
 
 ### Strength
 
-`strength` is a measure of how much noise is added to the base image which means it'll affect how similar the generated image is to it.
+`strength` is a measure of how much noise is added to the base image, which influences how similar the output is to the base image.
 
 * 📈 a high `strength` value means more noise is added to an image and the denoising process takes longer, but you'll get higher quality images that are more different from the base image
 * 📉 a low `strength` value means less noise is added to an image and the denoising process is faster, but the image quality may not be as great and the generated image resembles the base image more
@@ -207,10 +206,10 @@ image = pipeline(prompt=prompt, image=init_image, mask_image=mask_image, strengt
 
 ### Guidance scale
 
-`guidance_scale` affects how aligned the text prompt and generated image are. 
+`guidance_scale` affects how aligned the text prompt and generated image are. 
 
-* 📈 a high `guidance_scale` value means the prompt and generated image are closely aligned so the output is a stricter interpretation of the prompt
-* 📉 a low `guidance_scale` value means the prompt and generated image are more loosely aligned so the output may be more creative
+* 📈 a high `guidance_scale` value means the prompt and generated image are closely aligned, so the output is a stricter interpretation of the prompt
+* 📉 a low `guidance_scale` value means the prompt and generated image are more loosely aligned, so the output may be more varied from the prompt
 
 You can use `strength` and `guidance_scale` together for more granular control over how expressive the model is. For example, using high `strength` and `guidance_scale` values gives the model the most creative freedom.
 
@@ -250,7 +249,7 @@ image = pipeline(prompt=prompt, image=init_image, mask_image=mask_image, guidanc
 
 ### Negative prompt
 
-A negative prompt performs the opposite function of a prompt; it guides the model away from generating certain things in an image. This is useful for quickly improving image quality and preventing the model from generating things you don't want. 
+A negative prompt assumes the opposite role of a prompt; it guides the model away from generating certain things in an image. This is useful for quickly improving image quality and preventing the model from generating things you don't want. ```py import torch @@ -279,7 +278,7 @@ image ## Preserve unmasked areas -The [`StableDiffusionInpaintPipeline`] (and other inpainting pipelines) generally also changes the unmasked part of an image to create a more natural transition between the masked and unmasked region. If this behavior is undesirable, you can force the unmasked area to remain the same. However, forcing the unmasked portion of the image to remain the same might result in some weird transitions between the unmasked and masked areas. +The [`AutoPipelineForInpainting`] (and other inpainting pipelines) generally changes the unmasked parts of an image to create a more natural transition between the masked and unmasked region. If this behavior is undesirable, you can force the unmasked area to remain the same. However, forcing the unmasked portion of the image to remain the same may result in some weird transitions between the unmasked and masked areas. ```py import PIL @@ -327,9 +326,9 @@ unmasked_unchanged_image.save("force_unmasked_unchanged.png") ### Text-to-image-to-inpaint -Chaining a text-to-image and inpainting pipeline allows you to inpaint and edit the generated image without having to generate an entirely new one. +Chaining a text-to-image and inpainting pipeline allows you to inpaint the generated image, and you don't have to provide a base image to begin with. This makes it convenient to edit your favorite text-to-image outputs without having to generate an entirely new image. -Start by generating an image with the text-to-image pipeline: +Start with the text-to-image pipeline to create a castle: ```py import torch @@ -351,7 +350,7 @@ Load the mask image of the output from above: mask_image = load_image("https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/inpaint_text-chain-mask.png").convert("RGB") ``` -Let's replace the masked area with a waterfall: +And let's inpaint the masked area with a waterfall: ```py pipeline = AutoPipelineForInpainting.from_pretrained( @@ -379,9 +378,9 @@ image ### Image-to-image-to-inpaint -You can also chain an inpainting pipeline before or after an image-to-image pipeline. Depending on the position (first or last) of the inpainting pipeline in the chain, you can use it to make sure the image is ready for the image-to-image pipeline or you can use it to fix and modify the output from the image-to-image pipeline. This example uses the inpainting pipeline last. +You can also chain an inpainting pipeline before or after an image-to-image pipeline. Depending on the position (first or last) of the inpainting pipeline in the chain, you can use it to make sure the image is ready for the image-to-image pipeline or you can use it to edit its output. This example uses the inpainting pipeline last. 
-Begin by generating an image with the image-to-image pipeline:
+Begin by generating an image of a castle with the image-to-image pipeline:
 
 ```py
 import torch
@@ -399,7 +398,6 @@ init_image = load_image("https://huggingface.co/datasets/huggingface/documentati
 prompt = "concept art digital painting of an elven castle, inspired by lord of the rings, highly detailed, 8k"
 
 # pass prompt and image to pipeline
-generator = torch.Generator("cuda").manual_seed(41)
 image = pipeline(prompt, image=init_image, output_type="latent").images[0]
 ```
 
@@ -409,7 +407,7 @@ It is important to specify `output_type="latent"` in the pipeline to keep all th
 
 
 
-Let's inpaint the tree on the right side of the image with some mountains:
+Now let's inpaint the tree on the right side of the image with some mountains:
 
 ```py
 pipeline = AutoPipelineForInpainting.from_pretrained(
@@ -420,8 +418,6 @@ pipeline.enable_xformers_memory_efficient_attention()
 
 mask_image = load_image("https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/image-to-inpaint-chain-mask.png").convert("RGB")
 prompt = "digital painting of clouds above mountains, snowy, fantasy, soft"
-
-generator = torch.Generator("cuda").manual_seed(48)
 image = pipeline(prompt=prompt, image=image, mask_image=mask_image).images[0]
 ```
 
-Image-to-image and inpainting are actually very similar tasks. Image-to-image takes an existing image and generates a new one that resembles it. Inpainting also takes an existing image, but it only transforms the image region defined by the mask and the rest of the image is unchanged. You can think of inpainting as a more precise tool for making changes and image-to-image has a broader scope for making more sweeping changes. +Image-to-image and inpainting are actually very similar tasks. Image-to-image generates a new image that resembles the existing provided image. Inpainting does the same thing, but it only transforms the image area defined by the mask and the rest of the image is unchanged. You can think of inpainting as a more precise tool for making specific changes and image-to-image has a broader scope for making more sweeping changes. ## Control image generation @@ -448,13 +444,13 @@ Getting an image to look exactly the way you want is challenging because the den ### Prompt weighting -Prompt weighting provides a quantifiable way to scale the representation of concepts in a prompt. You can use it to increase or decrease the magnitude of the text embedding vector for each concept in the prompt, which subsequently affects how much each concept is generated. The [Compel](https://github.com/damian0815/compel) library offers an intuitive syntax for scaling the prompt weights and generating the embeddings. Learn how to create the embeddings in the [Prompt weighting](../using-diffusers/weighted_prompts) guide. +Prompt weighting provides a quantifiable way to scale the representation of concepts in a prompt. You can use it to increase or decrease the magnitude of the text embedding vector for each concept in the prompt, which subsequently determines how much of each concept is generated. The [Compel](https://github.com/damian0815/compel) library offers an intuitive syntax for scaling the prompt weights and generating the embeddings. Learn how to create the embeddings in the [Prompt weighting](../using-diffusers/weighted_prompts) guide. -Once you've generated the embeddings, pass them to the `prompt_embeds` (and `negative_prompt_embeds` if you're using a negative prompt) parameter in the [`AutoPipelineForInpainting`]. The embeddings replaces the `prompt` parameter: +Once you've generated the embeddings, pass them to the `prompt_embeds` (and `negative_prompt_embeds` if you're using a negative prompt) parameter in the [`AutoPipelineForInpainting`]. The embeddings replace the `prompt` parameter: ```py -from diffusers import AutoPipelineForInpainting import torch +from diffusers import AutoPipelineForInpainting pipeline = AutoPipelineForInpainting.from_pretrained( "runwayml/stable-diffusion-inpainting", torch_dtype=torch.float16, @@ -471,18 +467,20 @@ image = pipeline(prompt_emebds=prompt_embeds, # generated from Compel ### ControlNet -ControlNet models provide an even more flexible and accurate way to control how an image is generated, and it is used with other diffusion models like Stable Diffusion. The ControlNet accepts an additional conditioning image input to guide the diffusion model to preserve the features in it. +ControlNet models are used with other diffusion models like Stable Diffusion, and they provide an even more flexible and accurate way to control how an image is generated. A ControlNet accepts an additional conditioning image input that guides the diffusion model to preserve the features in it. 
For example, let's condition an image with a ControlNet pretrained on inpaint images:
 
 ```py
-from diffusers import ControlNetModel, StableDiffusionControlNetInpaintPipeline
-from diffusers.utils import load_image
 import torch
 import numpy as np
+from diffusers import ControlNetModel, StableDiffusionControlNetInpaintPipeline
+from diffusers.utils import load_image
 
+# load ControlNet
 controlnet = ControlNetModel.from_pretrained("lllyasviel/control_v11p_sd15_inpaint", torch_dtype=torch.float16, variant="fp16")
 
+# pass ControlNet to the pipeline
 pipeline = StableDiffusionControlNetInpaintPipeline.from_pretrained(
     "runwayml/stable-diffusion-inpainting", controlnet=controlnet, torch_dtype=torch.float16, variant="fp16"
 ).to("cuda")
@@ -503,6 +501,7 @@ def make_inpaint_condition(init_image, mask_image):
     init_image = np.expand_dims(init_image, 0).transpose(0, 3, 1, 2)
     init_image = torch.from_numpy(init_image)
     return init_image
+
 control_image = make_inpaint_condition(init_image, mask_image)
 ```
 
@@ -549,19 +548,19 @@ image
 
 ## Optimize
 
-It can be difficult and slow to run diffusion models if you're resource constrained, but it dosen't have to be with a few optimization tricks. One of the biggest (and easiest) optimizations you can enable is switching to a more memory-efficient attention. If you're using PyTorch 2.0, [scaled-dot product attention](../optimization/torch2.0#scaled-dot-product-attention) is automatically enabled and you don't need to do anything else. For non-PyTorch 2.0 users, you can install and use [xFormers](../optimization/xformers)'s implementation of memory-efficient attention. Both options will reduce memory usage and speed-up inference.
+It can be difficult and slow to run diffusion models if you're resource constrained, but it doesn't have to be with a few optimization tricks. One of the biggest (and easiest) optimizations you can enable is switching to memory-efficient attention. If you're using PyTorch 2.0, [scaled-dot product attention](../optimization/torch2.0#scaled-dot-product-attention) is automatically enabled and you don't need to do anything else. For non-PyTorch 2.0 users, you can install and use [xFormers](../optimization/xformers)'s implementation of memory-efficient attention. Both options reduce memory usage and accelerate inference.
 
 You can also offload the model to the CPU to save even more memory:
 
 ```diff
-+ pipeline.enable_model_cpu_offload()
 + pipeline.enable_xformers_memory_efficient_attention()
++ pipeline.enable_model_cpu_offload()
 ```
 
-Speed-up your inference code even more with [`torch_compile`](../optimization/torch2.0#torch.compile). You should wrap `torch.compile` around the most intensive component in the pipeline which is typically the UNet:
+To speed up your inference code even more, use [`torch_compile`](../optimization/torch2.0#torch.compile). You should wrap `torch.compile` around the most intensive component in the pipeline, which is typically the UNet:
 
 ```py
 pipe.unet = torch.compile(pipe.unet, mode="reduce-overhead", fullgraph=True)
 ```
 
-Learn more by reading the [Reduce memory usage](../optimization/memory) and [Torch 2.0](../optimization/torch2.0) guides.
+Learn more in the [Reduce memory usage](../optimization/memory) and [Torch 2.0](../optimization/torch2.0) guides.
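Putting these tips together, a minimal sketch of an optimized inpainting setup might look like the following (the checkpoint, images, and prompt are the ones used elsewhere in this guide; depending on your setup, you may not need every optimization at once):

```py
import torch
from diffusers import AutoPipelineForInpainting
from diffusers.utils import load_image

pipeline = AutoPipelineForInpainting.from_pretrained(
    "runwayml/stable-diffusion-inpainting", torch_dtype=torch.float16, variant="fp16"
)
# memory-efficient attention (skip on PyTorch 2.0, which already uses scaled-dot product attention)
pipeline.enable_xformers_memory_efficient_attention()
# offload idle model components to the CPU to save GPU memory
pipeline.enable_model_cpu_offload()
# compile the UNet, typically the most compute-intensive component
pipeline.unet = torch.compile(pipeline.unet, mode="reduce-overhead", fullgraph=True)

init_image = load_image("https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/inpaint.png").convert("RGB")
mask_image = load_image("https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/inpaint_mask.png").convert("RGB")
prompt = "concept art digital painting of an elven castle, inspired by lord of the rings, highly detailed, 8k"
image = pipeline(prompt=prompt, image=init_image, mask_image=mask_image).images[0]
```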
\ No newline at end of file

From ccb1fe736378a782d6cff28e14f4bac22ada1f44 Mon Sep 17 00:00:00 2001
From: Steven Liu
Date: Mon, 2 Oct 2023 09:22:51 -0700
Subject: [PATCH 5/7] feedback

---
 docs/source/en/using-diffusers/inpaint.md | 87 +++++++++++++----------
 1 file changed, 50 insertions(+), 37 deletions(-)

diff --git a/docs/source/en/using-diffusers/inpaint.md b/docs/source/en/using-diffusers/inpaint.md
index 0e0f7c1432fc..4d99fca26eb6 100644
--- a/docs/source/en/using-diffusers/inpaint.md
+++ b/docs/source/en/using-diffusers/inpaint.md
@@ -16,7 +16,7 @@ specific language governing permissions and limitations under the License.
 
 Inpainting replaces or edits specific areas of an image. This makes it a useful tool for image restoration like removing defects and artifacts, or even replacing an image area with something entirely new. Inpainting relies on a mask to determine which regions of an image to fill in; the area to inpaint is represented by white pixels and the area to keep is represented by black pixels. The white pixels are filled in by the prompt.
 
-With 🤗 Diffusers, here is how you can quickly start inpainting:
+With 🤗 Diffusers, here is how you can do inpainting:
 
 1. Load an inpainting checkpoint with the [`AutoPipelineForInpainting`] class. This'll automatically detect the appropriate pipeline class to load based on the checkpoint:
 
@@ -66,11 +66,11 @@ image = pipeline(prompt=prompt, negative_prompt=negative_prompt, image=init_imag
 
 ## Popular models
 
-[Stable Diffusion v1.5](https://huggingface.co/runwayml/stable-diffusion-v1-5), [Stable Diffusion XL (SDXL)](https://huggingface.co/stabilityai/stable-diffusion-xl-base-1.0), and [Kandinsky 2.2](https://huggingface.co/kandinsky-community/kandinsky-2-2-decoder-inpaint) are among the most popular models for inpainting. SDXL typically produces higher resolution images than Stable Diffusion v1.5, and Kandinsky 2.2 is also capable of generating high-quality images thanks to an image prior model.
+[Stable Diffusion Inpainting](https://huggingface.co/runwayml/stable-diffusion-inpainting), [Stable Diffusion XL (SDXL) Inpainting](https://huggingface.co/diffusers/stable-diffusion-xl-1.0-inpainting-0.1), and [Kandinsky 2.2](https://huggingface.co/kandinsky-community/kandinsky-2-2-decoder-inpaint) are among the most popular models for inpainting. SDXL typically produces higher resolution images than Stable Diffusion v1.5, and Kandinsky 2.2 is also capable of generating high-quality images.
 
-### Stable Diffusion v1.5
+### Stable Diffusion Inpainting
 
-Stable Diffusion v1.5 is a latent diffusion model finetuned on 512x512 images. It is a good starting point for inpainting because it is relatively fast and generates good quality images. To use this model for inpainting, you'll need to pass a prompt, base and mask image to the pipeline:
+Stable Diffusion Inpainting is a latent diffusion model finetuned for inpainting on 512x512 images. It is a good starting point because it is relatively fast and generates good quality images. To use this model for inpainting, you'll need to pass a prompt, base and mask image to the pipeline:
 
 ```py
 import torch
@@ -92,7 +92,7 @@ prompt = "concept art digital painting of an elven castle, inspired by lord of t
 image = pipeline(prompt=prompt, image=init_image, mask_image=mask_image, generator=generator).images[0]
 ```
 
-### Stable Diffusion XL (SDXL)
+### Stable Diffusion XL (SDXL) Inpainting
 
 SDXL is a larger and more powerful version of Stable Diffusion v1.5. This model can follow a two-stage model process (though each model can also be used alone); the base model generates an image, and a refiner model takes that image and further enhances its details and quality. Take a look at the [SDXL](sdxl) guide for a more comprehensive overview of how to use SDXL and configure its parameters.
@@ -102,7 +102,7 @@ from diffusers import AutoPipelineForInpainting
 from diffusers.utils import load_image
 
 pipeline = AutoPipelineForInpainting.from_pretrained(
-    "stabilityai/stable-diffusion-xl-base-1.0", torch_dtype=torch.float16, variant="fp16"
+    "diffusers/stable-diffusion-xl-1.0-inpainting-0.1", torch_dtype=torch.float16, variant="fp16"
 ).to("cuda")
 pipeline.enable_model_cpu_offload()
 pipeline.enable_xformers_memory_efficient_attention()
@@ -116,9 +116,9 @@ prompt = "concept art digital painting of an elven castle, inspired by lord of t
 image = pipeline(prompt=prompt, image=init_image, mask_image=mask_image, generator=generator).images[0]
 ```
 
-### Kandinsky 2.2
+### Kandinsky 2.2 Inpainting
 
-The Kandinsky model family is similar to SDXL in the sense that it uses two models; the image prior model generates image embeddings, and the diffusion model uses these embeddings to generate images. You can load the image prior and diffusion model separately, but the easiest way to use Kandinsky 2.2 is to load it into the [`AutoPipelineForInpainting`] class which uses the [`KandinskyV22InpaintCombinedPipeline`] under the hood.
+The Kandinsky model family is similar to SDXL because it uses two models as well; the image prior model creates image embeddings, and the diffusion model generates images from them. You can load the image prior and diffusion model separately, but the easiest way to use Kandinsky 2.2 is to load it into the [`AutoPipelineForInpainting`] class which uses the [`KandinskyV22InpaintCombinedPipeline`] under the hood.
 
 ```py
 import torch
@@ -147,15 +147,15 @@
-    <figcaption class="mt-2 text-center text-sm text-gray-500">Stable Diffusion v1.5</figcaption>
+    <figcaption class="mt-2 text-center text-sm text-gray-500">Stable Diffusion Inpainting</figcaption>
-    <figcaption class="mt-2 text-center text-sm text-gray-500">Stable Diffusion XL</figcaption>
+    <figcaption class="mt-2 text-center text-sm text-gray-500">Stable Diffusion XL Inpainting</figcaption>
-    <figcaption class="mt-2 text-center text-sm text-gray-500">Kandinsky 2.2</figcaption>
+    <figcaption class="mt-2 text-center text-sm text-gray-500">Kandinsky 2.2 Inpainting</figcaption>
@@ -211,7 +211,7 @@ image = pipeline(prompt=prompt, image=init_image, mask_image=mask_image, strengt
 * 📈 a high `guidance_scale` value means the prompt and generated image are closely aligned, so the output is a stricter interpretation of the prompt
 * 📉 a low `guidance_scale` value means the prompt and generated image are more loosely aligned, so the output may be more varied from the prompt
 
-You can use `strength` and `guidance_scale` together for more granular control over how expressive the model is. For example, using high `strength` and `guidance_scale` values gives the model the most creative freedom.
+You can use `strength` and `guidance_scale` together for more control over how expressive the model is. For example, a combination of high `strength` and `guidance_scale` values gives the model the most creative freedom.
 
 ```py
 import torch
@@ -273,12 +273,15 @@ image
 ```
+    <figcaption class="mt-2 text-center text-sm text-gray-500">negative_prompt = "bad architecture, unstable, poor details, blurry"</figcaption>
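For reference, a minimal sketch of how the negative prompt from the caption above slots into the inpainting call (the pipeline, base, and mask images are assumed to be set up as earlier in this guide):

```py
prompt = "concept art digital painting of an elven castle, inspired by lord of the rings, highly detailed, 8k"
negative_prompt = "bad architecture, unstable, poor details, blurry"

# the negative prompt steers generation away from the listed concepts
image = pipeline(
    prompt=prompt,
    negative_prompt=negative_prompt,
    image=init_image,
    mask_image=mask_image,
).images[0]
```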
## Preserve unmasked areas
 
-The [`AutoPipelineForInpainting`] (and other inpainting pipelines) generally changes the unmasked parts of an image to create a more natural transition between the masked and unmasked region. If this behavior is undesirable, you can force the unmasked area to remain the same. However, forcing the unmasked portion of the image to remain the same may result in some weird transitions between the unmasked and masked areas.
+The [`AutoPipelineForInpainting`] (and other inpainting pipelines) generally changes the unmasked parts of an image to create a more natural transition between the masked and unmasked region. If this behavior is undesirable, you can force the unmasked area to remain the same. However, forcing the unmasked portion of the image to remain the same may result in some unusual transitions between the unmasked and masked areas.
 
 ```py
 import PIL
@@ -322,7 +325,7 @@ unmasked_unchanged_image.save("force_unmasked_unchanged.png")
 
 ## Chained inpainting pipelines
 
-[`AutoPipelineForInpainting`] can be chained with other 🤗 Diffusers pipelines to edit their outputs.
+[`AutoPipelineForInpainting`] can be chained with other 🤗 Diffusers pipelines to edit their outputs. This is often useful for improving the output quality of your other diffusion pipelines, and if you're using multiple pipelines, it can be more memory-efficient to chain them together to keep the outputs in latent space and reuse the same pipeline components.
 
 ### Text-to-image-to-inpaint
 
@@ -376,29 +379,44 @@
 
-### Image-to-image-to-inpaint
+### Inpaint-to-image-to-image
 
-You can also chain an inpainting pipeline before or after an image-to-image pipeline. Depending on the position (first or last) of the inpainting pipeline in the chain, you can use it to make sure the image is ready for the image-to-image pipeline or you can use it to edit its output. This example uses the inpainting pipeline last.
+You can also chain an inpainting pipeline before another pipeline like image-to-image or an upscaler to improve the quality of the output.
-Begin by generating an image of a castle with the image-to-image pipeline: +Begin by inpainting an image: ```py import torch -from diffusers import AutoPipelineForImage2Image, AutoPipelineForInpainting +from diffusers import AutoPipelineForInpainting, AutoPipelineForImage2Image from diffusers.utils import load_image -pipeline = AutoPipelineForImage2Image.from_pretrained( - "runwayml/stable-diffusion-v1-5", torch_dtype=torch.float16, variant="fp16" +pipeline = AutoPipelineForInpainting.from_pretrained( + "runwayml/stable-diffusion-inpainting", torch_dtype=torch.float16, variant="fp16" ).to("cuda") pipeline.enable_model_cpu_offload() pipeline.enable_xformers_memory_efficient_attention() -# prepare image +# load base and mask image init_image = load_image("https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/inpaint.png").convert("RGB") +mask_image = load_image("https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/inpaint_mask.png").convert("RGB") + prompt = "concept art digital painting of an elven castle, inspired by lord of the rings, highly detailed, 8k" +image = pipeline(prompt=prompt, image=init_image, mask_image=mask_image).images[0] + +# resize image to 1024x1024 for SDXL +image = image.resize((1024, 1024)) +``` + +Now let's pass the image to another inpainting pipeline with SDXL's refiner model to enhance the image details and quality: + +```py +pipeline = AutoPipelineForInpainting.from_pretrained( + "stabilityai/stable-diffusion-xl-refiner-1.0", torch_dtype=torch.float16, variant="fp16" +).to("cuda") +pipeline.enable_model_cpu_offload() +pipeline.enable_xformers_memory_efficient_attention() -# pass prompt and image to pipeline -image = pipeline(prompt, image=init_image, output_type="latent").images[0] +image = pipeline(prompt=prompt, image=image, mask_image=mask_image, output_type="latent").images[0] ``` @@ -407,18 +425,13 @@ It is important to specify `output_type="latent"` in the pipeline to keep all th -Now let's inpaint the tree on the right side of the image with some mountains: +Finally, you can pass this image to an image-to-image pipeline to put the finishing touches on it. It is more efficient to use the [`~AutoPipelineForImage2Image.from_pipe`] method to reuse the existing pipeline components, and avoid unnecessarily loading all the pipeline components into memory again. ```py -pipeline = AutoPipelineForInpainting.from_pretrained( - "runwayml/stable-diffusion-inpainting", torch_dtype=torch.float16, variant="fp16" -).to("cuda") -pipeline.enable_model_cpu_offload() +pipeline = AutoPipelineForImage2Image.from_pipe(pipeline) pipeline.enable_xformers_memory_efficient_attention() -mask_image = load_image("https://huggingface.co/datasets/huggingface/documentation-images/blob/main/diffusers/image-to-inpaint-chain-mask.png").convert("RGB") -prompt = "digital painting of clouds above mountains, snowy, fantasy, soft" -image = pipeline(prompt=prompt, image=image, mask_image=mask_image).images[0] +image = pipeline(prompt=prompt, image=image).images[0] ```
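To see why [`~AutoPipelineForImage2Image.from_pipe`] is the memory-friendly choice here, a small sketch (model ID as above) that checks the new pipeline shares its components with the old one instead of reloading them:

```py
import torch
from diffusers import AutoPipelineForInpainting, AutoPipelineForImage2Image

inpaint = AutoPipelineForInpainting.from_pretrained(
    "runwayml/stable-diffusion-inpainting", torch_dtype=torch.float16, variant="fp16"
)
img2img = AutoPipelineForImage2Image.from_pipe(inpaint)

# both pipelines hold references to the same component objects,
# so the second pipeline consumes no additional memory
print(img2img.unet is inpaint.unet)  # True
print(img2img.vae is inpaint.vae)  # True
```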
@@ -427,12 +440,12 @@ image = pipeline(prompt=prompt, image=image, mask_image=mask_image).images[0]
     <figcaption class="mt-2 text-center text-sm text-gray-500">initial image</figcaption>
-    <figcaption class="mt-2 text-center text-sm text-gray-500">image-to-image</figcaption>
+    <figcaption class="mt-2 text-center text-sm text-gray-500">inpaint</figcaption>
-    <figcaption class="mt-2 text-center text-sm text-gray-500">inpaint</figcaption>
+    <figcaption class="mt-2 text-center text-sm text-gray-500">image-to-image</figcaption>
@@ -513,7 +526,7 @@ image = pipeline(prompt=prompt, image=init_image, mask_image=mask_image, control image ``` -You can take this a step further and chain it with an image-to-image pipeline to apply a new style: +You can take this a step further and chain it with an image-to-image pipeline to apply a new [style](https://huggingface.co/nitrosocke/elden-ring-diffusion): ```py from diffusers import AutoPipelineForImage2Image From d8bb7b280c144a62476fd40611edf2070b0479ea Mon Sep 17 00:00:00 2001 From: Steven Liu Date: Mon, 2 Oct 2023 10:20:02 -0700 Subject: [PATCH 6/7] make fix-copies --- src/diffusers/schedulers/scheduling_unipc_multistep.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/src/diffusers/schedulers/scheduling_unipc_multistep.py b/src/diffusers/schedulers/scheduling_unipc_multistep.py index 741b03b6d3a2..2a5468a00a1a 100644 --- a/src/diffusers/schedulers/scheduling_unipc_multistep.py +++ b/src/diffusers/schedulers/scheduling_unipc_multistep.py @@ -282,13 +282,13 @@ def _threshold_sample(self, sample: torch.FloatTensor) -> torch.FloatTensor: https://arxiv.org/abs/2205.11487 """ dtype = sample.dtype - batch_size, channels, *remaining_dims = sample.shape + batch_size, channels, height, width = sample.shape if dtype not in (torch.float32, torch.float64): sample = sample.float() # upcast for quantile calculation, and clamp not implemented for cpu half # Flatten sample for doing quantile calculation along each image - sample = sample.reshape(batch_size, channels * np.prod(remaining_dims)) + sample = sample.reshape(batch_size, channels * height * width) abs_sample = sample.abs() # "a certain percentile absolute pixel value" @@ -299,7 +299,7 @@ def _threshold_sample(self, sample: torch.FloatTensor) -> torch.FloatTensor: s = s.unsqueeze(1) # (batch_size, 1) because clamp will broadcast along dim=0 sample = torch.clamp(sample, -s, s) / s # "we threshold xt0 to the range [-s, s] and then divide by s" - sample = sample.reshape(batch_size, channels, *remaining_dims) + sample = sample.reshape(batch_size, channels, height, width) sample = sample.to(dtype) return sample From 764585abe428766a472de8c9ffe6efca0cade2eb Mon Sep 17 00:00:00 2001 From: Steven Liu Date: Wed, 4 Oct 2023 10:30:26 -0700 Subject: [PATCH 7/7] rebase --- src/diffusers/schedulers/scheduling_unipc_multistep.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/src/diffusers/schedulers/scheduling_unipc_multistep.py b/src/diffusers/schedulers/scheduling_unipc_multistep.py index 2a5468a00a1a..741b03b6d3a2 100644 --- a/src/diffusers/schedulers/scheduling_unipc_multistep.py +++ b/src/diffusers/schedulers/scheduling_unipc_multistep.py @@ -282,13 +282,13 @@ def _threshold_sample(self, sample: torch.FloatTensor) -> torch.FloatTensor: https://arxiv.org/abs/2205.11487 """ dtype = sample.dtype - batch_size, channels, height, width = sample.shape + batch_size, channels, *remaining_dims = sample.shape if dtype not in (torch.float32, torch.float64): sample = sample.float() # upcast for quantile calculation, and clamp not implemented for cpu half # Flatten sample for doing quantile calculation along each image - sample = sample.reshape(batch_size, channels * height * width) + sample = sample.reshape(batch_size, channels * np.prod(remaining_dims)) abs_sample = sample.abs() # "a certain percentile absolute pixel value" @@ -299,7 +299,7 @@ def _threshold_sample(self, sample: torch.FloatTensor) -> torch.FloatTensor: s = s.unsqueeze(1) # (batch_size, 1) because clamp will broadcast along dim=0 sample = 
torch.clamp(sample, -s, s) / s # "we threshold xt0 to the range [-s, s] and then divide by s" - sample = sample.reshape(batch_size, channels, height, width) + sample = sample.reshape(batch_size, channels, *remaining_dims) sample = sample.to(dtype) return sample
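For context, the dynamic thresholding these last two patches toggle can be sketched as a standalone function; the `*remaining_dims` form restored by the rebase handles samples with any number of trailing dimensions, not just `height, width` (the ratio and max value below mirror the scheduler's defaults):

```py
import math

import torch


def threshold_sample(sample: torch.Tensor, ratio: float = 0.995, max_value: float = 1.0) -> torch.Tensor:
    dtype = sample.dtype
    batch_size, channels, *remaining_dims = sample.shape
    # upcast for the quantile calculation; clamp is not implemented for cpu half
    flat = sample.float().reshape(batch_size, channels * math.prod(remaining_dims))
    # "a certain percentile absolute pixel value" per sample
    s = torch.quantile(flat.abs(), ratio, dim=1)
    s = torch.clamp(s, min=1.0, max=max_value).unsqueeze(1)
    # threshold to the range [-s, s] and then divide by s
    flat = torch.clamp(flat, -s, s) / s
    return flat.reshape(batch_size, channels, *remaining_dims).to(dtype)
```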