In [None]:
!pip install -qU diffusers transformers accelerate peft

# Trajectory Consistency Distillation-LoRA

**Trajectory Consistency Distillation (TCD)** enables a model to generate higher quality and more detailed images with fewer steps. Owing to the effective error mitigation during the distillation process, TCD demonstrates superior performance even under conditions of large inference steps.

The major advantages of TCD are:
* Better than Teacher: TCD demonstrates superior generative quality at both small and large inference steps and exceeds the performance of `DPM-Solver++(2S)` with SDXL. There is no additional discriminator or LPIPS supervision included during TCD training.
* Flexible Inference Steps: The inference steps for TCD sampling can be freely adjusted without adversely affecting the image quality.
* Freely change detail level: During inference, the level of detail in the image can be adjusted with a single hyperparameter, *gamma*.

For large models like SDXL, TCD is trained with LoRA to reduce memory usage.

## General tasks

We will use `StableDiffusionXLPipeline` and `TCDScheduler`.

Notes for TCD-LoRA inference:
* Keep the `num_inference_steps` between 4 and 50
* Set `eta` (to control stochasticity at each step) between 0 and 1. We should use a higher `eta` when increasing the number of inference steps, but the downside is that a larger `eta` in `TCDScheduler` leads to blurrier images. A value of 0.3 is recommended to produce good results.

##### text-to-image

In [None]:
from diffusers import StableDiffusionXLPipeline, TCDScheduler
import torch

device = 'cuda'
base_model_id = 'stabilityai/stable-diffusion-xl-base-1.0'
tcd_lora_id = 'h1t/TCD-SDXL-LoRA'

# Load the fp16 variant of the SDXL base model, then move it to the target device.
pipe = StableDiffusionXLPipeline.from_pretrained(
    base_model_id, torch_dtype=torch.float16, variant='fp16'
)
pipe = pipe.to(device)

# Switch to TCDScheduler, built from the config of the pipeline's current scheduler.
pipe.scheduler = TCDScheduler.from_config(pipe.scheduler.config)

# Load the TCD-LoRA weights and fuse them into the pipeline.
pipe.load_lora_weights(tcd_lora_id)
pipe.fuse_lora()

In [None]:
prompt = "Painting of the orange cat Otto von Garfield, Count of Bismarck-Schönhausen, Duke of Lauenburg, Minister-President of Prussia. Depicted wearing a Prussian Pickelhaube and eating his favorite meal - lasagna."
generator = torch.Generator(device).manual_seed(111)

# Few-step sampling: 4 steps, guidance disabled (scale 0), eta 0.3.
sampling_kwargs = dict(
    num_inference_steps=4,
    guidance_scale=0,
    eta=0.3,
    generator=generator,
)
result = pipe(prompt, **sampling_kwargs)
image = result.images[0]
image

In [None]:
# Re-seed so the result does not depend on how many times `generator` was already
# consumed by earlier cells; this also makes the cell idempotent when re-run.
generator = torch.Generator(device).manual_seed(111)

# Same prompt and settings, but with 30 sampling steps.
image = pipe(
    prompt,
    num_inference_steps=30,
    guidance_scale=0,
    eta=0.3,
    generator=generator,
).images[0]
image

##### inpainting

In [None]:
from diffusers import AutoPipelineForInpainting, TCDScheduler
from diffusers.utils import load_image, make_image_grid
import torch

device = 'cuda'
base_model_id = 'diffusers/stable-diffusion-xl-1.0-inpainting-0.1'
tcd_lora_id = 'h1t/TCD-SDXL-LoRA'

# Load the fp16 variant of the SDXL inpainting checkpoint.
# The dtype must be referenced as `torch.float16`; a bare `float16` is undefined.
pipe = AutoPipelineForInpainting.from_pretrained(
    base_model_id,
    torch_dtype=torch.float16,
    variant='fp16'
).to(device)

# Switch to TCDScheduler, then load and fuse the TCD-LoRA weights.
pipe.scheduler = TCDScheduler.from_config(pipe.scheduler.config)
pipe.load_lora_weights(tcd_lora_id)
pipe.fuse_lora()

In [None]:
img_url = "https://raw.githubusercontent.com/CompVis/latent-diffusion/main/data/inpainting_examples/overture-creations-5sI6fQgYIuo.png"
mask_url = "https://raw.githubusercontent.com/CompVis/latent-diffusion/main/data/inpainting_examples/overture-creations-5sI6fQgYIuo_mask.png"

# Source image and mask, both resized to 1024x1024.
init_image = load_image(img_url).resize((1024, 1024))
mask_image = load_image(mask_url).resize((1024, 1024))

prompt = 'a tiger sitting on a park bench'
generator = torch.Generator(device).manual_seed(111)

image = pipe(
    prompt,
    image=init_image,
    mask_image=mask_image,
    num_inference_steps=8,
    guidance_scale=0,
    eta=0.3,
    strength=0.99,  # make sure to use strength below 1.0
    generator=generator,
).images[0]

# Show input, mask and result side by side.
make_image_grid([init_image, mask_image, image], rows=1, cols=3)

In [None]:
# Re-seed so the result does not depend on how many times `generator` was already
# consumed by earlier cells; this also makes the cell idempotent when re-run.
generator = torch.Generator(device).manual_seed(111)

# Same inpainting task, but with 30 sampling steps.
image = pipe(
    prompt,
    image=init_image,
    mask_image=mask_image,
    num_inference_steps=30,
    guidance_scale=0,
    eta=0.3,
    strength=0.99,  # make sure to use strength below 1.0
    generator=generator,
).images[0]

# Show input, mask and result side by side.
make_image_grid([init_image, mask_image, image], rows=1, cols=3)

## Community models

In [None]:
from diffusers import StableDiffusionXLPipeline, TCDScheduler
import torch

device = 'cuda'
# A fine-tuned SDXL checkpoint is used as the base model here.
base_model_id = 'cagliostrolab/animagine-xl-3.0'
tcd_lora_id = 'h1t/TCD-SDXL-LoRA'

# Load the fp16 variant of the checkpoint, then move it to the target device.
pipe = StableDiffusionXLPipeline.from_pretrained(
    base_model_id, torch_dtype=torch.float16, variant='fp16'
)
pipe = pipe.to(device)

# Switch to TCDScheduler, then load and fuse the TCD-LoRA weights.
pipe.scheduler = TCDScheduler.from_config(pipe.scheduler.config)
pipe.load_lora_weights(tcd_lora_id)
pipe.fuse_lora()

In [None]:
prompt = "A man, clad in a meticulously tailored military uniform, stands with unwavering resolve. The uniform boasts intricate details, and his eyes gleam with determination. Strands of vibrant, windswept hair peek out from beneath the brim of his cap."
generator = torch.Generator(device).manual_seed(111)

# The keyword must be spelled `num_inference_steps`; a misspelled name does not
# set the number of sampling steps.
image = pipe(
    prompt,
    num_inference_steps=8,
    guidance_scale=0,
    eta=0.3,
    generator=generator,
).images[0]
image

We can also combine other LoRAs with TCD-LoRA.

In [None]:
from diffusers import StableDiffusionXLPipeline, TCDScheduler
import torch

device = 'cuda'
base_model_id = "stabilityai/stable-diffusion-xl-base-1.0"
tcd_lora_id = "h1t/TCD-SDXL-LoRA"
styled_lora_id = "TheLastBen/Papercut_SDXL"

# Load the fp16 variant of the SDXL base model, then move it to the target device.
pipe = StableDiffusionXLPipeline.from_pretrained(
    base_model_id, torch_dtype=torch.float16, variant="fp16"
)
pipe = pipe.to(device)

# Switch to the TCD scheduler.
pipe.scheduler = TCDScheduler.from_config(pipe.scheduler.config)

# Register both LoRAs under explicit adapter names: TCD-LoRA first, then the style LoRA.
pipe.load_lora_weights(tcd_lora_id, adapter_name="tcd")
pipe.load_lora_weights(styled_lora_id, adapter_name="style")

# Activate both adapters, each with weight 1.0.
active_adapters = ["tcd", "style"]
pipe.set_adapters(active_adapters, adapter_weights=[1.0, 1.0])

In [None]:
prompt = "papercut of a winter mountain, snow"
generator = torch.Generator(device).manual_seed(111)

# Few-step sampling: 4 steps, guidance disabled (scale 0), eta 0.3.
sampling_kwargs = dict(
    num_inference_steps=4,
    guidance_scale=0,
    eta=0.3,
    generator=generator,
)
result = pipe(prompt, **sampling_kwargs)
image = result.images[0]
image

## Adapters

### ControlNet

#### Depth ControlNet

In [None]:
import torch
import numpy as np
from PIL import Image
from transformers import DPTImageProcessor, DPTForDepthEstimation
from diffusers import ControlNetModel, StableDiffusionXLControlNetPipeline, TCDScheduler
from diffusers.utils import load_image, make_image_grid

device = 'cuda'

base_model_id = "stabilityai/stable-diffusion-xl-base-1.0"
controlnet_id = "diffusers/controlnet-depth-sdxl-1.0"
tcd_lora_id = "h1t/TCD-SDXL-LoRA"

# ControlNet (depth conditioning), fp16 variant
controlnet = ControlNetModel.from_pretrained(
    controlnet_id,
    torch_dtype=torch.float16,
    variant='fp16'
)

# SDXL pipeline with the ControlNet attached
pipe = StableDiffusionXLControlNetPipeline.from_pretrained(
    base_model_id,
    controlnet=controlnet,
    torch_dtype=torch.float16,
    variant='fp16'
)
# Model CPU offload manages device placement itself, so the pipeline is not moved
# to the GPU with `.to(device)` beforehand (doing so defeats the memory saving).
pipe.enable_model_cpu_offload()

# TCD: switch the scheduler, then load and fuse the TCD-LoRA weights
pipe.scheduler = TCDScheduler.from_config(pipe.scheduler.config)
pipe.load_lora_weights(tcd_lora_id)
pipe.fuse_lora()

In [None]:
# Depth-map extractor: image processor and depth-estimation model from the same checkpoint.
depth_model_id = 'Intel/dpt-hybrid-midas'
feature_extractor = DPTImageProcessor.from_pretrained(depth_model_id)
depth_estimator = DPTForDepthEstimation.from_pretrained(depth_model_id)
depth_estimator = depth_estimator.to(device)


def get_depth_map(image):
    """Estimate a depth map for ``image`` and return it as a 3-channel PIL image.

    Relies on the notebook-level ``feature_extractor``, ``depth_estimator`` and
    ``device`` objects.

    Args:
        image: Input image handed to ``feature_extractor``
            (presumably a PIL image as returned by ``load_image`` -- confirm with caller).

    Returns:
        A ``PIL.Image.Image`` of size 1024x1024 whose three channels all hold the
        same depth values, min-max normalized and scaled to ``uint8``.
    """
    # The image processor takes its input through the `images` keyword.
    pixel_values = feature_extractor(
        images=image,
        return_tensors='pt'
    ).pixel_values.to(device)

    # Inference only: no gradients, mixed precision on `device`.
    with torch.no_grad(), torch.autocast(device):
        depth_map = depth_estimator(pixel_values).predicted_depth

    # Add a channel axis and resize to the 1024x1024 resolution used for conditioning.
    depth_map = torch.nn.functional.interpolate(
        depth_map.unsqueeze(1),
        size=(1024, 1024),
        mode='bicubic',
        align_corners=False
    )

    # Per-sample min-max normalization to [0, 1].
    depth_min = torch.amin(depth_map, dim=[1, 2, 3], keepdim=True)
    depth_max = torch.amax(depth_map, dim=[1, 2, 3], keepdim=True)
    depth_map = (depth_map - depth_min) / (depth_max - depth_min)

    # Replicate the single depth channel three times, go channels-last, take the
    # first batch element and convert to an 8-bit image.
    stacked = torch.cat([depth_map] * 3, dim=1)
    stacked = stacked.permute(0, 2, 3, 1).cpu().numpy()[0]
    return Image.fromarray((stacked * 255).clip(0, 255).astype(np.uint8))

In [None]:
# Reference image and the depth map derived from it (used as ControlNet conditioning).
init_image_url = "https://huggingface.co/lllyasviel/sd-controlnet-depth/resolve/main/images/stormtrooper.png"
init_image = load_image(init_image_url)
depth_image = get_depth_map(init_image)

In [None]:
prompt = 'stormtrooper lecture, photorealistic'
controlnet_conditioning_scale = 0.5
generator = torch.Generator(device).manual_seed(111)

# Depth-conditioned generation: 4 steps, guidance disabled (scale 0), eta 0.3.
result = pipe(
    prompt,
    image=depth_image,
    controlnet_conditioning_scale=controlnet_conditioning_scale,
    num_inference_steps=4,
    guidance_scale=0,
    eta=0.3,
    generator=generator,
)
image = result.images[0]

# Show reference image, depth map and result side by side.
make_image_grid([init_image, depth_image, image], rows=1, cols=3)

#### Canny ControlNet

In [None]:
import torch
from diffusers import ControlNetModel, StableDiffusionXLControlNetPipeline, TCDScheduler
from diffusers.utils import load_image, make_image_grid

device = 'cuda'

base_model_id = "stabilityai/stable-diffusion-xl-base-1.0"
controlnet_id = "diffusers/controlnet-canny-sdxl-1.0"
tcd_lora_id = "h1t/TCD-SDXL-LoRA"

# ControlNet (canny-edge conditioning), fp16 variant
controlnet = ControlNetModel.from_pretrained(
    controlnet_id,
    torch_dtype=torch.float16,
    variant='fp16'
)

# SDXL pipeline with the ControlNet attached
pipe = StableDiffusionXLControlNetPipeline.from_pretrained(
    base_model_id,
    controlnet=controlnet,
    torch_dtype=torch.float16,
    variant='fp16'
)
# Model CPU offload manages device placement itself, so the pipeline is not moved
# to the GPU with `.to(device)` beforehand (doing so defeats the memory saving).
pipe.enable_model_cpu_offload()

# TCD: switch the scheduler, then load and fuse the TCD-LoRA weights
pipe.scheduler = TCDScheduler.from_config(pipe.scheduler.config)
pipe.load_lora_weights(tcd_lora_id)
pipe.fuse_lora()

In [None]:
# Pre-computed canny edge map used as the ControlNet conditioning image.
canny_image_url = "https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main/sd_controlnet/bird_canny.png"
canny_image = load_image(canny_image_url)

prompt = "ultrarealistic shot of a furry blue bird"
controlnet_conditioning_scale = 0.5
generator = torch.Generator(device).manual_seed(111)

# Edge-conditioned generation: 4 steps, guidance disabled (scale 0), eta 0.3.
result = pipe(
    prompt,
    image=canny_image,
    controlnet_conditioning_scale=controlnet_conditioning_scale,
    num_inference_steps=4,
    guidance_scale=0,
    eta=0.3,
    generator=generator,
)
image = result.images[0]

# Show edge map and result side by side.
make_image_grid([canny_image, image], rows=1, cols=2)

### IP-Adapter

In [None]:
pip install -qU git+https://github.com/tencent-ailab/IP-Adapter.git

In [3]:
import torch
from diffusers import StableDiffusionXLPipeline, TCDScheduler
from diffusers.utils import load_image, make_image_grid
from ip_adapter import IPAdapterXL

device = "cuda"
base_model_path = "stabilityai/stable-diffusion-xl-base-1.0"
# NOTE(review): the two paths below look like local files/directories; confirm the
# IP-Adapter SDXL weights have been downloaded to `sdxl_models/` before running.
image_encoder_path = "sdxl_models/image_encoder"
ip_ckpt = "sdxl_models/ip-adapter_sdxl.bin"
tcd_lora_id = "h1t/TCD-SDXL-LoRA"

# Load the fp16 variant of the SDXL base model; the pipeline is not moved to a
# device here, `device` is handed to the IP-Adapter wrapper instead.
pipe = StableDiffusionXLPipeline.from_pretrained(
    base_model_path, torch_dtype=torch.float16, variant="fp16"
)

# Switch to TCDScheduler, then load and fuse the TCD-LoRA weights.
pipe.scheduler = TCDScheduler.from_config(pipe.scheduler.config)
pipe.load_lora_weights(tcd_lora_id)
pipe.fuse_lora()

# Wrap the pipeline with IP-Adapter.
ip_model = IPAdapterXL(pipe, image_encoder_path, ip_ckpt, device)

In [None]:
# Reference image that conditions the generation, resized to 512x512.
ref_image = load_image("https://raw.githubusercontent.com/tencent-ailab/IP-Adapter/main/assets/images/woman.png").resize((512, 512))
prompt = "best quality, high quality, wearing sunglasses"

# `generate` returns a list of images; keep the first (and only) sample.
image = ip_model.generate(
    pil_image=ref_image,
    prompt=prompt,
    scale=0.5,
    num_samples=1,
    num_inference_steps=4,
    guidance_scale=0,
    eta=0.3,
    seed=0,
)[0]

grid_image = make_image_grid([ref_image, image], rows=1, cols=2)
# End the cell with the grid so the notebook actually displays it.
grid_image

### AnimateDiff

In [None]:
import torch
from diffusers import MotionAdapter, AnimateDiffPipeline, DDIMScheduler, TCDScheduler
from diffusers.utils import export_to_gif

# Motion adapter for Stable Diffusion 1.5
adapter = MotionAdapter.from_pretrained(
    'guoyww/animatediff-motion-adapter-v1-5'
)

pipe = AnimateDiffPipeline.from_pretrained(
    'frankjoshua/toonyou_beta6',
    motion_adapter=adapter,
).to('cuda')

# TCD-LoRA: the base checkpoint and motion adapter are SD 1.5 models, so the
# SD 1.5 TCD-LoRA is loaded here (the SDXL LoRA targets a different UNet).
pipe.scheduler = TCDScheduler.from_config(pipe.scheduler.config)
pipe.load_lora_weights('h1t/TCD-SD15-LoRA', adapter_name='tcd')

# Motion LoRA
pipe.load_lora_weights(
    'guoyww/animatediff-motion-lora-zoom-in',
    weight_name='diffusion_pytorch_model.safetensors',
    adapter_name='motion-lora',
)

# Activate both adapters: TCD at weight 1.0, the motion LoRA at 1.2.
pipe.set_adapters(['tcd', 'motion-lora'], adapter_weights=[1.0, 1.2])

In [None]:
prompt = "best quality, masterpiece, 1girl, looking at viewer, blurry background, upper body, contemporary, dress"
# `torch.manual_seed` seeds the default generator and returns it; that generator is passed to the pipeline.
generator = torch.manual_seed(111)

# 24 frames, 5 sampling steps, guidance disabled (scale 0), eta 0.3.
animation_kwargs = dict(
    num_inference_steps=5,
    guidance_scale=0,
    cross_attention_kwargs={'scale': 1},
    num_frames=24,
    eta=0.3,
    generator=generator,
)
output = pipe(prompt, **animation_kwargs)
frames = output.frames[0]
export_to_gif(frames, 'animation.gif')