In [None]:
# Install/upgrade the notebook's dependencies. `%pip` (rather than `!pip`)
# installs into the environment of the kernel that is running this notebook.
%pip install -qU diffusers transformers accelerate opencv-python controlnet-aux

# T2I-Adapter

**T2I-Adapter** is a lightweight adapter for controlling and providing more accurate structure guidance for text-to-image models. It works by learning an alignment between the internal knowledge of the text-to-image model and an external control signal, such as edge detection or depth estimation.

In the T2I-Adapter, the condition is passed to four feature extraction blocks and three downsample blocks. This makes it fast and easy to train different adapters for different conditions which can be plugged into the text-to-image model.

T2I-Adapter is similar to ControlNet except it is smaller and faster because it only runs once during the diffusion process. The downside is that performance may be slightly worse than ControlNet.

## Text-to-image

Text-to-image models rely on a text prompt to generate an image, but text alone may not be enough to provide accurate structural guidance.

T2I-Adapter allows us to provide an additional control image to guide the generation process.

##### Stable Diffusion 1.5

In [None]:
import cv2
import numpy as np
from PIL import Image
from diffusers.utils import load_image, make_image_grid

# Download the reference picture; it is kept around so it can be shown next to
# the generated result later.
original_image = load_image(
    "https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main/sd_controlnet/hf-logo.png"
)

# Lower and upper thresholds handed to the Canny edge detector.
low_threshold, high_threshold = 100, 200

# Run Canny on the picture's pixel array and wrap the resulting edge map in a
# PIL image, which is what gets passed to the pipeline as the control image.
image = Image.fromarray(
    cv2.Canny(np.array(original_image), low_threshold, high_threshold)
)

In [None]:
from diffusers import StableDiffusionAdapterPipeline, T2IAdapter
import torch

# Canny T2I-Adapter checkpoint, loaded with float16 weights.
adapter = T2IAdapter.from_pretrained(
    "TencentARC/t2iadapter_canny_sd15v2", torch_dtype=torch.float16
)

# Stable Diffusion 1.5 pipeline with the adapter attached, also in float16.
pipeline = StableDiffusionAdapterPipeline.from_pretrained(
    "stable-diffusion-v1-5/stable-diffusion-v1-5",
    torch_dtype=torch.float16,
    adapter=adapter,
)
# Move the whole pipeline onto the CUDA device.
pipeline = pipeline.to("cuda")

In [None]:
prompt = "cinematic photo of a plush and soft midcentury style rug on a wooden floor, 35mm photograph, film, professional, 4k, highly detailed"

# Fixed-seed CUDA generator, passed to the pipeline call below.
generator = torch.Generator(device="cuda").manual_seed(111)

# NOTE: `image` holds the Canny edge map going in and is rebound to the
# generated picture coming out, so re-running this cell on its own would feed
# the previous output back in as the control image.
image = pipeline(prompt=prompt, image=image, generator=generator).images[0]

# Show the reference picture and the generated picture side by side.
make_image_grid([original_image, image], rows=1, cols=2)

##### Stable Diffusion XL

In [None]:
from controlnet_aux.canny import CannyDetector
from diffusers.utils import load_image, make_image_grid

# Edge detector provided by controlnet_aux.
canny_detector = CannyDetector()

# Download the reference picture and immediately turn it into its Canny edge
# map, using the detection/output resolution settings given below.
image = canny_detector(
    load_image(
        "https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main/sd_controlnet/hf-logo.png"
    ),
    detect_resolution=384,
    image_resolution=1024,
)

In [None]:
from diffusers import StableDiffusionXLAdapterPipeline, T2IAdapter, EulerAncestralDiscreteScheduler, AutoencoderKL
import torch

# The VAE and the Canny adapter are each loaded with float16 weights.
vae = AutoencoderKL.from_pretrained(
    "madebyollin/sdxl-vae-fp16-fix", torch_dtype=torch.float16
)
adapter = T2IAdapter.from_pretrained(
    "TencentARC/t2i-adapter-canny-sdxl-1.0", torch_dtype=torch.float16
)

# Scheduler is built from the "scheduler" subfolder of the SDXL base repo.
scheduler = EulerAncestralDiscreteScheduler.from_pretrained(
    "stabilityai/stable-diffusion-xl-base-1.0", subfolder="scheduler"
)

# Assemble the SDXL adapter pipeline from the components above, requesting the
# "fp16" weight variant of the base model.
pipeline = StableDiffusionXLAdapterPipeline.from_pretrained(
    "stabilityai/stable-diffusion-xl-base-1.0",
    scheduler=scheduler,
    adapter=adapter,
    vae=vae,
    variant="fp16",
    torch_dtype=torch.float16,
)
# Move the whole pipeline onto the CUDA device.
pipeline = pipeline.to("cuda")

In [None]:
prompt = "cinematic photo of a plush and soft midcentury style rug on a wooden floor, 35mm photograph, film, professional, 4k, highly detailed"

# Fixed-seed CUDA generator, passed to the pipeline call below.
generator = torch.Generator(device="cuda").manual_seed(111)

# `image` is the Canny edge map on the way in and is rebound to the generated
# picture on the way out.
image = pipeline(prompt=prompt, image=image, generator=generator).images[0]

# `original_image` is the reference picture loaded in the Stable Diffusion 1.5
# section; display it next to the SDXL result.
make_image_grid([original_image, image], rows=1, cols=2)

## MultiAdapter

T2I-Adapters are also composable, allowing us to use more than one adapter to impose multiple control conditions on an image.

In [None]:
from diffusers.utils import load_image, make_image_grid

# Two control images: a keypose sample and a depth sample.
pose_image = load_image("https://huggingface.co/datasets/diffusers/docs-images/resolve/main/t2i-adapter/keypose_sample_input.png")
depth_image = load_image("https://huggingface.co/datasets/diffusers/docs-images/resolve/main/t2i-adapter/depth_sample_input.png")

# Conditions are collected in a list, pose first and depth second.
cond = [pose_image, depth_image]

# The prompt is supplied as a one-element list.
prompt = ["Santa Claus walking into an office room with a beautiful city view"]

# Preview both control images side by side.
make_image_grid(cond, rows=1, cols=2)

In [None]:
# Load the pose and depth adapters and combine them into one MultiAdapter.
# The adapters are listed keypose first, depth second — the same order as the
# `cond` list of control images.
from diffusers import StableDiffusionAdapterPipeline, MultiAdapter, T2IAdapter
import torch

adapters = MultiAdapter(
    [
        T2IAdapter.from_pretrained('TencentARC/t2iadapter_keypose_sd14v1'),
        T2IAdapter.from_pretrained('TencentARC/t2iadapter_depth_sd14v1')
    ]
)
# Cast the combined adapters to float16, the same dtype the pipeline is loaded
# with.
adapters = adapters.to(torch.float16)

In [None]:
# Stable Diffusion 1.4 pipeline in float16; the MultiAdapter (wrapping all of
# the individual adapters) is passed through the single `adapter` argument.
pipeline = StableDiffusionAdapterPipeline.from_pretrained(
    "CompVis/stable-diffusion-v1-4",
    adapter=adapters,
    torch_dtype=torch.float16,
)
# Move the whole pipeline onto the CUDA device.
pipeline = pipeline.to("cuda")

In [None]:
# Generate with both control images; one conditioning scale is given per
# entry in `cond`.
image = pipeline(
    prompt=prompt,
    image=cond,
    adapter_conditioning_scale=[0.7, 0.7],
).images[0]

# Display the generated picture as the cell output.
image