In [None]:
!pip install -qU diffusers transformers accelerate

# Stable Diffusion XL Turbo

SDXL Turbo is an adversarial time-distilled SDXL model capable of running inference in as little as 1 step.

## Model checkpoints

In [None]:
from diffusers import AutoPipelineForText2Image
import torch

# Load the 'stabilityai/sdxl-turbo' checkpoint, requesting the 'fp16' weight
# variant and float16 tensors.
pipeline = AutoPipelineForText2Image.from_pretrained(
    'stabilityai/sdxl-turbo', torch_dtype=torch.float16, variant='fp16'
)
# Move the loaded pipeline to the CUDA device.
pipeline = pipeline.to('cuda')

In [None]:
from diffusers import StableDiffusionXLPipeline, EulerAncestralDiscreteScheduler
import torch

# Alternative loading path: build the pipeline from a single .safetensors file
# (float16 tensors) instead of a multi-folder checkpoint.
pipeline = StableDiffusionXLPipeline.from_single_file(
    "https://huggingface.co/stabilityai/sdxl-turbo/blob/main/sd_xl_turbo_1.0_fp16.safetensors",
    torch_dtype=torch.float16, variant="fp16"
)
pipeline = pipeline.to('cuda')

# Replace the scheduler with an Euler-ancestral scheduler created from the
# current scheduler's config, overriding `timestep_spacing` to 'trailing'.
pipeline.scheduler = EulerAncestralDiscreteScheduler.from_config(
    pipeline.scheduler.config, timestep_spacing='trailing'
)

## Text-to-image

SDXL Turbo gives the best results when generating 512x512 images.

Make sure to set `guidance_scale` to 0.0 as the model was trained without it.

In [None]:
from diffusers import AutoPipelineForText2Image
import torch

# Text-to-image pipeline: 'fp16' weight variant, float16 tensors.
pipeline_text2image = AutoPipelineForText2Image.from_pretrained(
    'stabilityai/sdxl-turbo', torch_dtype=torch.float16, variant='fp16'
)
# Move the pipeline to the CUDA device.
pipeline_text2image = pipeline_text2image.to('cuda')

In [2]:
# Load the text-to-image pipeline ('fp16' weight variant, float16 tensors) and
# move it to the GPU. Without `.to('cuda')` this cell would rebind
# `pipeline_text2image` to a half-precision pipeline left on the CPU, and the
# generation cell below would run against that instead of the GPU model.
pipeline_text2image = AutoPipelineForText2Image.from_pretrained(
    'stabilityai/sdxl-turbo',
    torch_dtype=torch.float16,
    variant='fp16',
).to('cuda')

Loading pipeline components...:   0%|          | 0/7 [00:00<?, ?it/s]

In [None]:
from diffusers.utils import make_image_grid

prompt = "a cinematic shot of a baby racoon wearing an intricate italian priest robe"

# Generate one image for each step count from 1 to 4 so the results can be
# compared side by side. Guidance is disabled (guidance_scale=0.0).
images = []
for step in range(1, 5):
    image = pipeline_text2image(
        prompt,
        guidance_scale=0.0,
        num_inference_steps=step,
    ).images[0]  # the pipeline output exposes generated images as `.images`
    images.append(image)

# Show the four results in a single row.
make_image_grid(images, rows=1, cols=4)

## Image-to-image

For image-to-image generation, make sure that `num_inference_steps * strength` is greater than or equal to 1.

The image-to-image pipeline will run for `int(num_inference_steps * strength)` steps.

In [None]:
from diffusers import AutoPipelineForImage2Image
from diffusers.utils import load_image, make_image_grid

# Build the image-to-image pipeline from the already-loaded `pipeline` via
# `from_pipe`, so loading a checkpoint again does not consume additional memory.
pipeline_image2image = AutoPipelineForImage2Image.from_pipe(pipeline)
pipeline_image2image = pipeline_image2image.to('cuda')

In [None]:
# Fetch the starting image and resize it to 512x512.
init_image = load_image(
    "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/cat.png"
).resize((512, 512))

prompt = "cat wizard, gandalf, lord of the rings, detailed, fantasy, cute, adorable, Pixar, Disney, 8k"

# num_inference_steps * strength = 2 * 0.5 = 1, which satisfies the
# "at least 1" requirement stated above. Guidance is disabled.
image = pipeline_image2image(
    prompt,
    image=init_image,
    guidance_scale=0.0,
    num_inference_steps=2,
    strength=0.5,
).images[0]

# Display the input and the generated image side by side.
make_image_grid([init_image, image], rows=1, cols=2)

## Speed up SDXL Turbo even more

* Compile the UNet if we are using PyTorch version 2.0 or higher

In [None]:
# Compile the UNet of the text-to-image pipeline defined earlier in this
# notebook. The original cell referenced `pipe`, a name that is never defined
# here, so it raised NameError on a fresh kernel.
pipeline_text2image.unet = torch.compile(
    pipeline_text2image.unet,
    mode='reduce-overhead',
    fullgraph=True
)

* When using the default VAE, keep it in `float32` to avoid costly `dtype` conversions before and after each generation. We only need to do this once, before our first generation:

In [None]:
# Upcast the VAE of the text-to-image pipeline defined earlier in this notebook.
# The original cell called this on `pipe`, a name that is never defined here.
pipeline_text2image.upcast_vae()