In [None]:
import diffusers
import torch
from diffusers import DiffusionPipeline, UNet2DConditionModel

import src.hooked_model.scheduler
from src.hooked_model.hooked_model import HookedDiffusionModel
from src.hooked_model.utils import get_timesteps

In [None]:
# Checkpoint identifier handed to the `from_pretrained` loaders below.
model_name = "sd-legacy/stable-diffusion-v1-5"

## How to use the hooked model interface with a model from diffusers

In [None]:
# Load the complete diffusers pipeline for `model_name` with float16 weights
# (safetensors files), then move it onto the GPU as a separate step.
pipe = DiffusionPipeline.from_pretrained(
    model_name, use_safetensors=True, torch_dtype=torch.float16
)
pipe = pipe.to("cuda")

In [None]:
# Load a standalone copy of the denoiser from the checkpoint's "unet" subfolder
# (float16, safetensors) and place it on the GPU.
model = UNet2DConditionModel.from_pretrained(
    model_name, subfolder="unet", use_safetensors=True, torch_dtype=torch.float16
)
model = model.to("cuda")

In [None]:
# Instantiate the project's own DDIMScheduler (src.hooked_model.scheduler, not the
# diffusers class of the same name) from the config of the pipeline's current scheduler.
scheduler = src.hooked_model.scheduler.DDIMScheduler.from_config(pipe.scheduler.config)


In [None]:
# Text prompt reused by the generation calls below.
prompt = "A photo of an astronaut riding a horse on mars"

To build a `HookedDiffusionModel` you need to provide:
- `model` - the denoiser, either UNet- or Transformer-based; it is assumed to predict noise
- `scheduler` - it has to expose certain fields and implement the `scale_model_input()` and `step()` methods
- `encode_prompt` - a function that encodes the prompt into embeddings
- `get_timesteps` - a function that returns the discrete timesteps for the diffusion process, given the number of inference steps
- `vae` - a VAE model for latent-space encoding/decoding, if a latent-space model is used

In [None]:
# Gather the pieces the wrapper expects, then pass them all as keyword arguments.
hooked_components = {
    "model": model,  # standalone denoiser loaded above
    "scheduler": scheduler,  # project DDIM scheduler
    "encode_prompt": pipe.encode_prompt,  # prompt encoder borrowed from the pipeline
    "get_timesteps": get_timesteps,  # helper from src.hooked_model.utils
    "vae": pipe.vae,  # VAE borrowed from the pipeline
}
hooked_model = HookedDiffusionModel(**hooked_components)

In [None]:
# Generate with the hooked model, drawing randomness from a CUDA generator
# with a fixed seed (1).
seeded_generator = torch.Generator(device="cuda").manual_seed(1)
out = hooked_model(
    prompt=prompt,
    guidance_scale=7.5,
    num_inference_steps=50,
    generator=seeded_generator,
)

In [None]:
# Reference run: give the stock pipeline the diffusers DDIMScheduler (built from the
# pipeline's current scheduler config), then generate with the same prompt, seed,
# step count and guidance scale as the hooked-model call.
reference_scheduler = diffusers.DDIMScheduler.from_config(pipe.scheduler.config)
pipe.scheduler = reference_scheduler

reference_generator = torch.Generator(device="cuda").manual_seed(1)
reference_output = pipe(
    prompt=prompt,
    guidance_scale=7.5,
    num_inference_steps=50,
    generator=reference_generator,
)
# Keep only the first field of the pipeline output (the generated images).
out2 = reference_output[0]

In [None]:
# out and out2 should contain exactly the same images: both runs use the same
# prompt, seed, number of inference steps and guidance scale.

In [None]:
# Show the first item returned by the hooked model.
out[0]


In [None]:
# Show the first item returned by the reference diffusers pipeline, for comparison.
out2[0]

## How to gather activations at specific positions in the model

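All you need to do is pass the list of positions you want to cache as `positions_to_cache`; `run_with_cache` then returns the cached activations alongside the usual output.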

In [None]:
# Generate again with a fixed seed (1), this time asking the hooked model to
# record activations at the listed position.
cache_positions = ["up_blocks.1.attentions.1"]
cache_generator = torch.Generator(device="cuda").manual_seed(1)
cached_run = hooked_model.run_with_cache(
    prompt=prompt,
    positions_to_cache=cache_positions,
    generator=cache_generator,
    num_inference_steps=50,
    guidance_scale=7.5,
    num_images_per_prompt=1,
    device="cuda",
)
# The call returns a pair: the generation output and the cache dictionary.
out, cache_dict = cached_run

In [None]:
# Show the first item of the output produced by the cached run.
out[0]

In [None]:
# Shape of what was cached for the requested position (stored under the "output" key).
cache_dict["output"]["up_blocks.1.attentions.1"].shape