In [None]:
!pip install -qU diffusers accelerate transformers huggingface_hub

In [None]:
# Log in to the Hugging Face Hub from within the notebook.
import huggingface_hub

huggingface_hub.notebook_login()

# Inpainting

Inpainting replaces or edits specific areas of an image. This is useful to remove defects and artifacts, or even replace an image area with something entirely new.

Inpainting relies on a mask to determine which regions of an image to fill in; the area to inpaint is represented by white pixels and the area to keep is represented by black pixels. The white pixels are filled in by the prompt.

1. Load an inpainting checkpoint with the `AutoPipelineForInpainting` class.

In [None]:
import torch
from diffusers import AutoPipelineForInpainting
from diffusers.utils import load_image, make_image_grid
from diffusers.utils.import_utils import is_xformers_available

# Load the Kandinsky 2.2 inpainting checkpoint with float16 weights.
pipeline = AutoPipelineForInpainting.from_pretrained(
    'kandinsky-community/kandinsky-2-2-decoder-inpaint',
    torch_dtype=torch.float16,
)
pipeline.enable_model_cpu_offload()
# xFormers is not installed by this notebook's pip cell; enabling it
# unconditionally raises ModuleNotFoundError, so only opt in when it is importable.
if is_xformers_available():
    pipeline.enable_xformers_memory_efficient_attention()

2. Load the base and mask images:

In [None]:
# Both example images live in the same documentation dataset folder.
DOCS_IMAGES_URL = "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers"

init_image = load_image(f"{DOCS_IMAGES_URL}/inpaint.png")
mask_image = load_image(f"{DOCS_IMAGES_URL}/inpaint_mask.png")

3. Create a prompt to inpaint the image with and pass it to the pipeline with the base and mask images:

In [None]:
prompt = "a black cat with glowing eyes, cute, adorable, disney, pixar, highly detailed, 8k"
negative_prompt = "bad anatomy, deformed, ugly, disfigured"

# Run the pipeline on the base image and mask, then keep the first generated image.
result = pipeline(
    prompt=prompt,
    negative_prompt=negative_prompt,
    image=init_image,
    mask_image=mask_image,
)
image = result.images[0]

# Show base image, mask and result side by side.
make_image_grid([init_image, mask_image, image], rows=1, cols=3)

## Create a mask image

The `blur` method provides an option for how to blend the original image and inpaint area. The amount of blur is determined by the `blur_factor` parameter.
* Increasing `blur_factor` increases the amount of blur applied to the mask edges, softening the transition between the original image and inpaint area.
* A low or zero `blur_factor` preserves the sharper edges of the mask.

In [None]:
import torch
from diffusers import AutoPipelineForInpainting
# `make_image_grid` is used by the mask-comparison cell that follows this one.
from diffusers.utils import load_image, make_image_grid
from PIL import Image

# Load Stable Diffusion v1.5 with float16 weights and move it to the GPU.
pipeline = AutoPipelineForInpainting.from_pretrained(
    'stable-diffusion-v1-5/stable-diffusion-v1-5',
    torch_dtype=torch.float16,
).to('cuda')

In [None]:
seashore_mask_url = "https://huggingface.co/datasets/YiYiXu/testing-images/resolve/main/seashore_mask.png"
mask = load_image(seashore_mask_url)

# Blur the mask with the pipeline's mask processor.
blurred_mask = pipeline.mask_processor.blur(mask, blur_factor=33)

# Original mask next to the blurred one.
make_image_grid([mask, blurred_mask], rows=1, cols=2)

## Popular models

### Stable Diffusion Inpainting

In [None]:
import torch
from diffusers import AutoPipelineForInpainting
from diffusers.utils import load_image, make_image_grid
from diffusers.utils.import_utils import is_xformers_available

# Load the fp16 variant of the Stable Diffusion inpainting checkpoint.
pipeline = AutoPipelineForInpainting.from_pretrained(
    'runwayml/stable-diffusion-inpainting',
    torch_dtype=torch.float16,
    variant='fp16',
)
pipeline.enable_model_cpu_offload()
# xFormers is not installed by this notebook's pip cell; enabling it
# unconditionally raises ModuleNotFoundError, so only opt in when it is importable.
if is_xformers_available():
    pipeline.enable_xformers_memory_efficient_attention()

In [None]:
DOCS_IMAGES_URL = "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers"
init_image = load_image(f"{DOCS_IMAGES_URL}/inpaint.png")
mask_image = load_image(f"{DOCS_IMAGES_URL}/inpaint_mask.png")

prompt = "concept art digital painting of an elven castle, inspired by lord of the rings, highly detailed, 8k"

# Fixed seed so the example output is reproducible.
generator = torch.Generator(device='cuda')
generator.manual_seed(111)

result = pipeline(
    prompt,
    image=init_image,
    mask_image=mask_image,
    generator=generator,
)
image = result.images[0]
make_image_grid([init_image, mask_image, image], rows=1, cols=3)

### Stable Diffusion XL (SDXL) Inpainting

SDXL can follow a two-stage model process (though each model can also be used alone); the base model generates an image, and a refiner model takes that image and further enhances its details and quality.

In [None]:
import torch
from diffusers import AutoPipelineForInpainting
from diffusers.utils import load_image, make_image_grid
from diffusers.utils.import_utils import is_xformers_available

# Load the fp16 variant of the SDXL inpainting checkpoint.
pipeline = AutoPipelineForInpainting.from_pretrained(
    'diffusers/stable-diffusion-xl-1.0-inpaint-0.1',
    torch_dtype=torch.float16,
    variant='fp16',
)
pipeline.enable_model_cpu_offload()
# xFormers is not installed by this notebook's pip cell; enabling it
# unconditionally raises ModuleNotFoundError, so only opt in when it is importable.
if is_xformers_available():
    pipeline.enable_xformers_memory_efficient_attention()

In [None]:
DOCS_IMAGES_URL = "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers"
init_image = load_image(f"{DOCS_IMAGES_URL}/inpaint.png")
mask_image = load_image(f"{DOCS_IMAGES_URL}/inpaint_mask.png")

prompt = "concept art digital painting of an elven castle, inspired by lord of the rings, highly detailed, 8k"

# Fixed seed so the example output is reproducible.
generator = torch.Generator(device='cuda')
generator.manual_seed(111)

result = pipeline(
    prompt,
    image=init_image,
    mask_image=mask_image,
    generator=generator,
)
image = result.images[0]
make_image_grid([init_image, mask_image, image], rows=1, cols=3)

### Kandinsky 2.2 Inpainting

The Kandinsky model family is similar to SDXL because it uses two models as well; the image prior model creates image embeddings, and the diffusion model generates images from them.

In [None]:
import torch
from diffusers import AutoPipelineForInpainting
from diffusers.utils import load_image, make_image_grid
from diffusers.utils.import_utils import is_xformers_available

# Load the Kandinsky 2.2 inpainting checkpoint with float16 weights.
pipeline = AutoPipelineForInpainting.from_pretrained(
    'kandinsky-community/kandinsky-2-2-decoder-inpaint',
    torch_dtype=torch.float16,
)
pipeline.enable_model_cpu_offload()
# xFormers is not installed by this notebook's pip cell; enabling it
# unconditionally raises ModuleNotFoundError, so only opt in when it is importable.
if is_xformers_available():
    pipeline.enable_xformers_memory_efficient_attention()

In [None]:
DOCS_IMAGES_URL = "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers"
init_image = load_image(f"{DOCS_IMAGES_URL}/inpaint.png")
mask_image = load_image(f"{DOCS_IMAGES_URL}/inpaint_mask.png")

prompt = "concept art digital painting of an elven castle, inspired by lord of the rings, highly detailed, 8k"

# Fixed seed so the example output is reproducible.
generator = torch.Generator(device='cuda')
generator.manual_seed(111)

result = pipeline(
    prompt,
    image=init_image,
    mask_image=mask_image,
    generator=generator,
)
image = result.images[0]
make_image_grid([init_image, mask_image, image], rows=1, cols=3)

## Non-inpaint specific checkpoints

Compare the results of the regular checkpoints and the inpainting checkpoints.

##### `stable-diffusion-v1-5`

In [None]:
import torch
from diffusers import AutoPipelineForInpainting
from diffusers.utils import load_image, make_image_grid
from diffusers.utils.import_utils import is_xformers_available

# Load the regular (non-inpaint) Stable Diffusion v1.5 checkpoint, fp16 variant.
pipeline = AutoPipelineForInpainting.from_pretrained(
    'stable-diffusion-v1-5/stable-diffusion-v1-5',
    torch_dtype=torch.float16,
    variant='fp16',
)
pipeline.enable_model_cpu_offload()
# xFormers is not installed by this notebook's pip cell; enabling it
# unconditionally raises ModuleNotFoundError, so only opt in when it is importable.
if is_xformers_available():
    pipeline.enable_xformers_memory_efficient_attention()

In [None]:
DOCS_IMAGES_URL = "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers"
init_image = load_image(f"{DOCS_IMAGES_URL}/inpaint.png")
mask_image = load_image(f"{DOCS_IMAGES_URL}/inpaint_mask.png")

prompt = "concept art digital painting of an elven castle, inspired by lord of the rings, highly detailed, 8k"

# Fixed seed so both checkpoints in this comparison start from the same noise.
generator = torch.Generator(device='cuda')
generator.manual_seed(111)

result = pipeline(
    prompt,
    image=init_image,
    mask_image=mask_image,
    generator=generator,
)
image = result.images[0]
# Base image next to the inpainted result.
make_image_grid([init_image, image], rows=1, cols=2)

##### `stable-diffusion-inpainting`

In [None]:
import torch
from diffusers import AutoPipelineForInpainting
from diffusers.utils import load_image, make_image_grid
from diffusers.utils.import_utils import is_xformers_available

# Load the inpaint-specific Stable Diffusion checkpoint, fp16 variant.
pipeline = AutoPipelineForInpainting.from_pretrained(
    'runwayml/stable-diffusion-inpainting',
    torch_dtype=torch.float16,
    variant='fp16',
)
pipeline.enable_model_cpu_offload()
# xFormers is not installed by this notebook's pip cell; enabling it
# unconditionally raises ModuleNotFoundError, so only opt in when it is importable.
if is_xformers_available():
    pipeline.enable_xformers_memory_efficient_attention()

In [None]:
DOCS_IMAGES_URL = "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers"
init_image = load_image(f"{DOCS_IMAGES_URL}/inpaint.png")
mask_image = load_image(f"{DOCS_IMAGES_URL}/inpaint_mask.png")

prompt = "concept art digital painting of an elven castle, inspired by lord of the rings, highly detailed, 8k"

# Fixed seed so both checkpoints in this comparison start from the same noise.
generator = torch.Generator(device='cuda')
generator.manual_seed(111)

result = pipeline(
    prompt,
    image=init_image,
    mask_image=mask_image,
    generator=generator,
)
image = result.images[0]
# Base image next to the inpainted result.
make_image_grid([init_image, image], rows=1, cols=2)

#### Erase objects

##### `stable-diffusion-v1-5`

In [None]:
import torch
from diffusers import AutoPipelineForInpainting
from diffusers.utils import load_image, make_image_grid
from diffusers.utils.import_utils import is_xformers_available

# Load the regular (non-inpaint) Stable Diffusion v1.5 checkpoint, fp16 variant.
pipeline = AutoPipelineForInpainting.from_pretrained(
    'stable-diffusion-v1-5/stable-diffusion-v1-5',
    torch_dtype=torch.float16,
    variant='fp16',
)
pipeline.enable_model_cpu_offload()
# xFormers is not installed by this notebook's pip cell; enabling it
# unconditionally raises ModuleNotFoundError, so only opt in when it is importable.
if is_xformers_available():
    pipeline.enable_xformers_memory_efficient_attention()

In [None]:
DOCS_IMAGES_URL = "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers"
init_image = load_image(f"{DOCS_IMAGES_URL}/inpaint.png")
mask_image = load_image(f"{DOCS_IMAGES_URL}/inpaint_mask.png")

# A minimal prompt: fill the masked region with plain road.
prompt = "road"

generator = torch.Generator(device='cuda')
generator.manual_seed(111)

result = pipeline(
    prompt,
    image=init_image,
    mask_image=mask_image,
    generator=generator,
)
image = result.images[0]
make_image_grid([init_image, image], rows=1, cols=2)

##### `stable-diffusion-inpainting`

In [None]:
import torch
from diffusers import AutoPipelineForInpainting
from diffusers.utils import load_image, make_image_grid
from diffusers.utils.import_utils import is_xformers_available

# Load the inpaint-specific Stable Diffusion checkpoint, fp16 variant.
pipeline = AutoPipelineForInpainting.from_pretrained(
    'runwayml/stable-diffusion-inpainting',
    torch_dtype=torch.float16,
    variant='fp16',
)
pipeline.enable_model_cpu_offload()
# xFormers is not installed by this notebook's pip cell; enabling it
# unconditionally raises ModuleNotFoundError, so only opt in when it is importable.
if is_xformers_available():
    pipeline.enable_xformers_memory_efficient_attention()

In [None]:
DOCS_IMAGES_URL = "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers"
init_image = load_image(f"{DOCS_IMAGES_URL}/inpaint.png")
mask_image = load_image(f"{DOCS_IMAGES_URL}/inpaint_mask.png")

# A minimal prompt: fill the masked region with plain road.
prompt = "road"

generator = torch.Generator(device='cuda')
generator.manual_seed(111)

result = pipeline(
    prompt,
    image=init_image,
    mask_image=mask_image,
    generator=generator,
)
image = result.images[0]
make_image_grid([init_image, image], rows=1, cols=2)

The inpaint specific checkpoints are intentionally trained to generate higher quality inpainted images, and that includes creating a more natural transition between the masked and unmasked areas. These checkpoints are more likely to change the unmasked area.

If preserving the unmasked area is important for our task, we should use the `apply_overlay` method to force the unmasked area of an image to remain the same at the expense of some more unnatural transitions between the masked and unmasked areas.

In [None]:
import PIL
import numpy as np
import torch
from diffusers import AutoPipelineForInpainting
from diffusers.utils import load_image, make_image_grid

device = 'cuda'

# Load the inpainting checkpoint with float16 weights, then move it to `device`.
pipeline = AutoPipelineForInpainting.from_pretrained(
    'runwayml/stable-diffusion-inpainting', torch_dtype=torch.float16
)
pipeline = pipeline.to(device)

In [None]:
EXAMPLES_URL = "https://raw.githubusercontent.com/CompVis/latent-diffusion/main/data/inpainting_examples"
img_url = f"{EXAMPLES_URL}/overture-creations-5sI6fQgYIuo.png"
mask_url = f"{EXAMPLES_URL}/overture-creations-5sI6fQgYIuo_mask.png"

# Bring the image and its mask to the same 512x512 size before inpainting.
target_size = (512, 512)
init_image = load_image(img_url).resize(target_size)
mask_image = load_image(mask_url).resize(target_size)

prompt = "face of a yellow cat, high resolution, sitting on a park bench"
result = pipeline(prompt=prompt, image=init_image, mask_image=mask_image)
repainted_image = result.images[0]

In [None]:
# Overlay the repainted result onto the original image using the mask.
unmasked_unchanged_image = pipeline.image_processor.apply_overlay(mask_image, init_image, repainted_image)

# 2x2 grid: original, mask, raw inpaint result, overlay result.
comparison = [init_image, mask_image, repainted_image, unmasked_unchanged_image]
make_image_grid(comparison, rows=2, cols=2)

## Configure pipeline parameters

### Strength

`strength` is a measure of how much noise is added to the base image, which influences how similar the output is to the base image.
* a high `strength` value means more noise is added to an image and the denoising process takes longer, but we will get higher quality images that are more different from the base image.
* a low `strength` value means less noise is added to an image and the denoising process is faster, but the image quality may not be as great and the generated image resembles the base image more.

In [None]:
import torch
from diffusers import AutoPipelineForInpainting
from diffusers.utils import load_image, make_image_grid
from diffusers.utils.import_utils import is_xformers_available

# Load the fp16 variant of the Stable Diffusion inpainting checkpoint.
pipeline = AutoPipelineForInpainting.from_pretrained(
    'runwayml/stable-diffusion-inpainting',
    torch_dtype=torch.float16,
    variant='fp16',
)
pipeline.enable_model_cpu_offload()
# xFormers is not installed by this notebook's pip cell; enabling it
# unconditionally raises ModuleNotFoundError, so only opt in when it is importable.
if is_xformers_available():
    pipeline.enable_xformers_memory_efficient_attention()

In [None]:
DOCS_IMAGES_URL = "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers"
init_image = load_image(f"{DOCS_IMAGES_URL}/inpaint.png")
mask_image = load_image(f"{DOCS_IMAGES_URL}/inpaint_mask.png")

prompt = "concept art digital painting of an elven castle, inspired by lord of the rings, highly detailed, 8k"

# First strength setting in this section: 0.6.
result = pipeline(prompt, image=init_image, mask_image=mask_image, strength=0.6)
image = result.images[0]
make_image_grid([init_image, mask_image, image], rows=1, cols=3)

In [None]:
# Same inputs as the previous cell, with strength lowered to 0.1.
result = pipeline(prompt, image=init_image, mask_image=mask_image, strength=0.1)
image = result.images[0]
make_image_grid([init_image, mask_image, image], rows=1, cols=3)

In [None]:
# Same inputs again, with strength raised to 0.9.
result = pipeline(prompt, image=init_image, mask_image=mask_image, strength=0.9)
image = result.images[0]
make_image_grid([init_image, mask_image, image], rows=1, cols=3)

### Guidance scale

`guidance_scale` affects how aligned the text prompt and generated image are.
* a high `guidance_scale` value means the prompt and generated image are closely aligned, so the output is a stricter interpretation of the prompt.
* a low `guidance_scale` value means the prompt and generated image are more loosely aligned, so the output may be more varied from the prompt.

In [None]:
import torch
from diffusers import AutoPipelineForInpainting
from diffusers.utils import load_image, make_image_grid
from diffusers.utils.import_utils import is_xformers_available

# Load the fp16 variant of the Stable Diffusion inpainting checkpoint.
pipeline = AutoPipelineForInpainting.from_pretrained(
    'runwayml/stable-diffusion-inpainting',
    torch_dtype=torch.float16,
    variant='fp16',
)
pipeline.enable_model_cpu_offload()
# xFormers is not installed by this notebook's pip cell; enabling it
# unconditionally raises ModuleNotFoundError, so only opt in when it is importable.
if is_xformers_available():
    pipeline.enable_xformers_memory_efficient_attention()

In [None]:
DOCS_IMAGES_URL = "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers"
init_image = load_image(f"{DOCS_IMAGES_URL}/inpaint.png")
mask_image = load_image(f"{DOCS_IMAGES_URL}/inpaint_mask.png")

prompt = "concept art digital painting of an elven castle, inspired by lord of the rings, highly detailed, 8k"

# Only guidance_scale is overridden here; strength keeps the pipeline default.
result = pipeline(prompt, image=init_image, mask_image=mask_image, guidance_scale=2.5)
image = result.images[0]
make_image_grid([init_image, mask_image, image], rows=1, cols=3)

In [None]:
# Combination 1 of 4: low strength (0.1) with low guidance_scale (2.5).
result = pipeline(
    prompt,
    image=init_image,
    mask_image=mask_image,
    strength=0.1,
    guidance_scale=2.5,
)
image = result.images[0]
make_image_grid([init_image, mask_image, image], rows=1, cols=3)

In [None]:
# Combination 2 of 4: low strength (0.1) with high guidance_scale (6.5).
result = pipeline(
    prompt,
    image=init_image,
    mask_image=mask_image,
    strength=0.1,
    guidance_scale=6.5,
)
image = result.images[0]
make_image_grid([init_image, mask_image, image], rows=1, cols=3)

In [None]:
# Combination 3 of 4: high strength (0.7) with low guidance_scale (1.5).
result = pipeline(
    prompt,
    image=init_image,
    mask_image=mask_image,
    strength=0.7,
    guidance_scale=1.5,
)
image = result.images[0]
make_image_grid([init_image, mask_image, image], rows=1, cols=3)

In [None]:
# Combination 4 of 4: high strength (0.7) with high guidance_scale (6.5).
result = pipeline(
    prompt,
    image=init_image,
    mask_image=mask_image,
    strength=0.7,
    guidance_scale=6.5,
)
image = result.images[0]
make_image_grid([init_image, mask_image, image], rows=1, cols=3)

### Negative prompt

A negative prompt guides the model away from generating certain things in an image.

In [None]:
import torch
from diffusers import AutoPipelineForInpainting
from diffusers.utils import load_image, make_image_grid
from diffusers.utils.import_utils import is_xformers_available

# Load the fp16 variant of the Stable Diffusion inpainting checkpoint.
pipeline = AutoPipelineForInpainting.from_pretrained(
    'runwayml/stable-diffusion-inpainting',
    torch_dtype=torch.float16,
    variant='fp16',
)
pipeline.enable_model_cpu_offload()
# xFormers is not installed by this notebook's pip cell; enabling it
# unconditionally raises ModuleNotFoundError, so only opt in when it is importable.
if is_xformers_available():
    pipeline.enable_xformers_memory_efficient_attention()

In [None]:
DOCS_IMAGES_URL = "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers"
init_image = load_image(f"{DOCS_IMAGES_URL}/inpaint.png")
mask_image = load_image(f"{DOCS_IMAGES_URL}/inpaint_mask.png")

prompt = "concept art digital painting of an elven castle, inspired by lord of the rings, highly detailed, 8k"
# Things the generation should be steered away from.
negative_prompt = "bad architecture, unstable, poor details, blurry"

result = pipeline(
    prompt,
    negative_prompt=negative_prompt,
    image=init_image,
    mask_image=mask_image,
)
image = result.images[0]
make_image_grid([init_image, mask_image, image], rows=1, cols=3)

### Padding mask crop

This method is used to increase the inpainting image quality. When enabled, the `padding_mask_crop` option crops the masked area with some user-specified padding and it will also crop the same area from the original image. Both the image and mask are upscaled to a higher resolution for inpainting, and then overlaid on the original image.

In [None]:
import torch
from diffusers import AutoPipelineForInpainting
from diffusers.utils import load_image, make_image_grid
from PIL import Image

# Seeded CUDA generator, consumed by the inference cell below.
generator = torch.Generator(device='cuda')
generator.manual_seed(111)

# Load Stable Diffusion v1.5 with float16 weights, then move it to the GPU.
pipeline = AutoPipelineForInpainting.from_pretrained(
    'stable-diffusion-v1-5/stable-diffusion-v1-5', torch_dtype=torch.float16
)
pipeline = pipeline.to('cuda')

In [None]:
TESTING_IMAGES_URL = "https://huggingface.co/datasets/YiYiXu/testing-images/resolve/main"
base = load_image(f"{TESTING_IMAGES_URL}/seashore.png")
mask = load_image(f"{TESTING_IMAGES_URL}/seashore_mask.png")

# padding_mask_crop=32 enables the crop-and-overlay mode described above.
result = pipeline(
    'boat',
    image=base,
    mask_image=mask,
    strength=0.75,
    generator=generator,
    padding_mask_crop=32,
)
image = result.images[0]
make_image_grid([base, mask, image], rows=1, cols=3)

## Chained inpainting pipelines

### Text-to-image-to-inpaint

In [None]:
import torch
from diffusers import AutoPipelineForText2Image, AutoPipelineForInpainting
from diffusers.utils import load_image, make_image_grid
from diffusers.utils.import_utils import is_xformers_available

# First stage of the chain: a text-to-image pipeline (fp16 variant, safetensors weights).
pipeline = AutoPipelineForText2Image.from_pretrained(
    'stable-diffusion-v1-5/stable-diffusion-v1-5',
    torch_dtype=torch.float16,
    variant='fp16',
    use_safetensors=True,
)
pipeline.enable_model_cpu_offload()
# xFormers is not installed by this notebook's pip cell; enabling it
# unconditionally raises ModuleNotFoundError, so only opt in when it is importable.
if is_xformers_available():
    pipeline.enable_xformers_memory_efficient_attention()

In [None]:
# Generate the base image that the inpainting stage will edit.
text2image_prompt = "concept art digital painting of an elven castle, inspired by lord of the rings, highly detailed, 8k"
text2image = pipeline(text2image_prompt).images[0]

In [None]:
from diffusers.utils.import_utils import is_xformers_available

mask_image = load_image("https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/inpaint_text-chain-mask.png")

# Second stage of the chain: replace the text-to-image pipeline with an inpainting one.
pipeline = AutoPipelineForInpainting.from_pretrained(
    'kandinsky-community/kandinsky-2-2-decoder-inpaint',
    torch_dtype=torch.float16,
)
pipeline.enable_model_cpu_offload()
# xFormers is not installed by this notebook's pip cell; enabling it
# unconditionally raises ModuleNotFoundError, so only opt in when it is importable.
if is_xformers_available():
    pipeline.enable_xformers_memory_efficient_attention()

In [None]:
prompt = 'digital painting of a fantasy waterfall, cloudy'

# Inpaint the masked region of the generated image with the new prompt.
result = pipeline(prompt, image=text2image, mask_image=mask_image)
image = result.images[0]
make_image_grid([text2image, mask_image, image], rows=1, cols=3)

### Inpaint-to-image-to-image

In [None]:
import torch
from diffusers import AutoPipelineForImage2Image, AutoPipelineForInpainting
from diffusers.utils import load_image, make_image_grid
from diffusers.utils.import_utils import is_xformers_available

# First stage of the chain: the Stable Diffusion inpainting checkpoint, fp16 variant.
pipeline = AutoPipelineForInpainting.from_pretrained(
    'runwayml/stable-diffusion-inpainting',
    torch_dtype=torch.float16,
    variant='fp16',
)
pipeline.enable_model_cpu_offload()
# xFormers is not installed by this notebook's pip cell; enabling it
# unconditionally raises ModuleNotFoundError, so only opt in when it is importable.
if is_xformers_available():
    pipeline.enable_xformers_memory_efficient_attention()

In [None]:
DOCS_IMAGES_URL = "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers"
init_image = load_image(f"{DOCS_IMAGES_URL}/inpaint.png")
mask_image = load_image(f"{DOCS_IMAGES_URL}/inpaint_mask.png")

prompt = "concept art digital painting of an elven castle, inspired by lord of the rings, highly detailed, 8k"

inpainted = pipeline(prompt, image=init_image, mask_image=mask_image).images[0]
# resize image to 1024x1024 for SDXL
image_inpainting = inpainted.resize((1024, 1024))

In [None]:
from diffusers.utils.import_utils import is_xformers_available

# pass to another inpainting with SDXL's refiner to enhance image details
pipeline = AutoPipelineForInpainting.from_pretrained(
    'stabilityai/stable-diffusion-xl-refiner-1.0',
    torch_dtype=torch.float16,
    variant='fp16',
)
pipeline.enable_model_cpu_offload()
# xFormers is not installed by this notebook's pip cell; enabling it
# unconditionally raises ModuleNotFoundError, so only opt in when it is importable.
if is_xformers_available():
    pipeline.enable_xformers_memory_efficient_attention()

In [None]:
# Request latent output; the result is handed to the image-to-image stage below.
result = pipeline(
    prompt,
    image=image_inpainting,
    mask_image=mask_image,
    output_type='latent',
)
image = result.images[0]

Finally, we pass this image to an image-to-image pipeline to put the finishing touches on it. It is more efficient to use the `from_pipe()` method to reuse the existing pipeline components.

In [None]:
from diffusers.utils.import_utils import is_xformers_available

# Build the image-to-image pipeline from the components of the already-loaded
# pipeline. `from_pretrained` expects a repo id or path, not a pipeline object;
# `from_pipe` is the method that accepts an existing pipeline.
pipeline = AutoPipelineForImage2Image.from_pipe(pipeline)
# xFormers is not installed by this notebook's pip cell; enabling it
# unconditionally raises ModuleNotFoundError, so only opt in when it is importable.
if is_xformers_available():
    pipeline.enable_xformers_memory_efficient_attention()

In [None]:
# Final pass: run the image-to-image pipeline on the output of the previous stage.
result = pipeline(prompt, image=image)
image = result.images[0]

# Original, mask, first inpaint result and final image in one row.
stages = [init_image, mask_image, image_inpainting, image]
make_image_grid(stages, rows=1, cols=4)