In [1]:
from masactrl.diffuser_utils import MasaCtrlPipeline
from masactrl.masactrl_utils import AttentionStore
from masactrl.masactrl import MutualSelfAttentionControl
from diffusers import ControlNetModel, StableDiffusionControlNetPipeline, DDIMScheduler
import torch
from PIL import Image
from torchvision.transforms import ToTensor
from torchvision.utils import save_image
from typing import Optional
from masactrl.masactrl_utils import AttentionBase
from masactrl.masactrl_utils import regiter_attention_editor_diffusers, register_attention_editor_controlnet

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# seed
import torch
import torch.nn as nn
import numpy as np
import random

# Seed every random number generator used in this notebook with the same value.
SEED = 123
random.seed(SEED)
np.random.seed(SEED)
torch.manual_seed(SEED)
torch.cuda.manual_seed(SEED)

# cuDNN is switched off entirely and additionally flagged as deterministic.
torch.backends.cudnn.deterministic = True
torch.backends.cudnn.enabled = False

In [3]:
class MasaCtrlControlNetPipeline(StableDiffusionControlNetPipeline):
    """ControlNet pipeline that can install an attention editor before generating.

    The class adds two things on top of ``StableDiffusionControlNetPipeline``:
    ``enable_attention_control`` to register an editor on the pipeline, and an
    ``editor`` argument on ``__call__`` that does the same right before the
    parent pipeline runs.
    """

    def enable_attention_control(self, editor):
        """Reset ``editor`` and register it on this pipeline.

        Args:
            editor: attention editor object; it must provide ``reset()``.

        Returns:
            This pipeline, so calls can be chained.
        """
        editor.reset()
        regiter_attention_editor_diffusers(self, editor)
        return self

    @torch.no_grad()
    def __call__(
        self,
        prompt=None,
        image=None,
        editor: Optional[AttentionBase] = None,
        **kwargs
    ):
        """Optionally install ``editor``, then run the parent pipeline.

        Args:
            prompt: prompt (or list of prompts) forwarded to the parent call.
            image: control image forwarded unchanged to the parent call.
            editor: optional attention editor; when given it is reset and
                registered via ``enable_attention_control`` before generation.
            **kwargs: remaining arguments for the parent ``__call__``.
                ``width`` and ``height`` default to 512 when not supplied.
                A ``batch_size`` entry is accepted for backward compatibility
                but discarded.

        Returns:
            Whatever the parent pipeline's ``__call__`` returns.
        """
        # The control image is deliberately NOT pre-processed here. The parent
        # __call__ runs prepare_image itself, sizing the control batch from the
        # prompts and duplicating it for classifier-free guidance. The previous
        # extra pass with a fixed batch size of 1 plus its own CFG duplication
        # produced a 2-image batch, which only matched a 2-prompt call.
        kwargs.pop("batch_size", None)  # not a parent-pipeline argument

        # Keep the previous default output / control-image resolution.
        kwargs.setdefault("width", 512)
        kwargs.setdefault("height", 512)

        if image is not None:
            kwargs["image"] = image

        # Inject the attention editor, if one was supplied.
        if editor is not None:
            self.enable_attention_control(editor)

        # Run generation.
        return super().__call__(
            prompt=prompt,
            **kwargs
        )

In [4]:
# Index 0 is the source prompt (attention reference), index 1 the target
# prompt (edited version).
prompts = ["photo of a boy standing", "photo of a boy dancing"]

# A single (1, 4, 64, 64) noise sample is drawn on the GPU and expanded along
# the batch axis, so every prompt starts from the same noise.
start_code = torch.randn([1, 4, 64, 64], device="cuda").expand(
    len(prompts), -1, -1, -1
)


In [5]:
# DDIM scheduler with an explicit beta schedule.
scheduler = DDIMScheduler(
    beta_start=0.00085,
    beta_end=0.012,
    beta_schedule="scaled_linear",
    clip_sample=False,
    set_alpha_to_one=False
)

# OpenPose ControlNet weights, moved to the GPU.
controlnet = ControlNetModel.from_pretrained(
    "lllyasviel/sd-controlnet-openpose"
).to("cuda")

# NOTE: `cross_attention_kwargs={"scale": 0.5}` used to be passed here, but
# from_pretrained reported it as an unexpected keyword and ignored it, so it
# had no effect. It is removed; if such a setting is wanted it has to be
# supplied somewhere that actually consumes it.
pipe_edit = MasaCtrlControlNetPipeline.from_pretrained(
    "stable-diffusion-v1-5/stable-diffusion-v1-5",
    scheduler=scheduler,
    controlnet=controlnet,
    safety_checker=None,
).to("cuda")

Cannot initialize model with low cpu memory usage because `accelerate` was not found in the environment. Defaulting to `low_cpu_mem_usage=False`. It is strongly recommended to install `accelerate` for faster and less memory-intense model loading. You can do so with: 
```
pip install accelerate
```
.
Keyword arguments {'cross_attention_kwargs': {'scale': 0.5}} are not expected by MasaCtrlControlNetPipeline and will be ignored.
Cannot initialize model with low cpu memory usage because `accelerate` was not found in the environment. Defaulting to `low_cpu_mem_usage=False`. It is strongly recommended to install `accelerate` for faster and less memory-intense model loading. You can do so with: 
```
pip install accelerate
```
.
You have disabled the safety checker for <class '__main__.MasaCtrlControlNetPipeline'> by passing `safety_checker=None`. Ensure that you abide to the conditions of the Stable Diffusion license and do not expose unfiltered results in services or applications open to the

In [6]:
# Control image for the baseline run: loaded from the poses folder, forced to
# RGB and resized to 512x512.
control_image_pil = Image.open(
    "/mnt/hdd/hbchoe/workspace/MasaCtrl/dataset/poses/flexing_03.png"
)
control_image_pil = control_image_pil.convert("RGB")
control_image_pil = control_image_pil.resize((512, 512))

In [7]:
# Baseline run: an AttentionStore editor is registered first on the pipeline
# and then on its ControlNet, using the same editor object for both.
# NOTE(review): the two helpers act on the same editor, so the registration
# order may matter — keep it as is unless the helpers are checked.
editor_store = AttentionStore()
regiter_attention_editor_diffusers(pipe_edit, editor_store)
register_attention_editor_controlnet(pipe_edit.controlnet, editor_store)


# Generate both prompts in one batch from the shared start latents, with the
# base control image as the ControlNet condition.
image_ori = pipe_edit(
    prompt=prompts,
    image=control_image_pil,  # use image= not control_image=
    latents=start_code,
    guidance_scale=7.5,
    num_inference_steps=50,
)

[✅ ControlNet] attention editor registered to 14 layers.


100%|██████████| 50/50 [00:33<00:00,  1.51it/s]


In [8]:
import os

# exist_ok=True makes this cell safe to re-run and avoids the separate
# "check, then create" step.
os.makedirs("elmo_control11", exist_ok=True)

# Baseline outputs from the run above; they were generated before any
# MutualSelfAttentionControl editor is registered.
image_ori.images[0].save("elmo_control11/base_standing.png")   # first prompt: "standing"
image_ori.images[1].save("elmo_control11/base_dancing.png")    # second prompt: "dancing" (no hijack)


In [9]:
import glob
import os

STEP = 4
LAYER = 10
# NOTE(review): STEP and LAYER are not used below — the editor is built with
# (0, 0), for which it reports being active at all 50 denoising steps and at
# U-Net layers 0-15. Pass STEP / LAYER to MutualSelfAttentionControl if that
# restriction was intended; doing so changes the generated images.

# Folder holding the control images; every PNG in it is processed in sorted
# order, one generation run per image.
folder_path = "/mnt/hdd/hbchoe/workspace/MasaCtrl/dataset/poses"
output_folder = "elmo_control11"
# Create the output folder here as well, so this cell does not depend on an
# earlier cell having created it.
os.makedirs(output_folder, exist_ok=True)
# One possibility: overriding editor_store may be the right approach instead.
control_image_files = sorted(glob.glob(f"{folder_path}/*.png"))
for file in control_image_files:
    control_image_2 = Image.open(file).convert("RGB").resize((512, 512))

    # Attention hijacking: source = prompts[0], target = prompts[1].
    # A fresh editor is created and registered on both the pipeline and its
    # ControlNet for every control image.
    editor_hijack = MutualSelfAttentionControl(0, 0)
    regiter_attention_editor_diffusers(pipe_edit, editor_hijack)
    register_attention_editor_controlnet(pipe_edit.controlnet, editor_hijack)


    image_masactrl = pipe_edit(
        prompt=prompts,
        image=control_image_2,
        latents=start_code,
        guidance_scale=7.5,
        num_inference_steps=50,
    )
    # Save the edited image under the control image's base name.
    file_name = os.path.splitext(os.path.basename(file))[0]
    # image_masactrl.images[0].save(f"{output_folder}/edited_elmo_{file_name}_0.png")
    image_masactrl.images[1].save(f"{output_folder}/edited_elmo_{file_name}.png")  # target prompt ("dancing") with attention hijack


MasaCtrl at denoising steps:  [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49]
MasaCtrl at U-Net layers:  [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15]
[✅ ControlNet] attention editor registered to 14 layers.


  0%|          | 0/50 [00:00<?, ?it/s]

100%|██████████| 50/50 [00:36<00:00,  1.39it/s]


MasaCtrl at denoising steps:  [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49]
MasaCtrl at U-Net layers:  [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15]
[✅ ControlNet] attention editor registered to 14 layers.


100%|██████████| 50/50 [00:36<00:00,  1.38it/s]


MasaCtrl at denoising steps:  [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49]
MasaCtrl at U-Net layers:  [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15]
[✅ ControlNet] attention editor registered to 14 layers.


100%|██████████| 50/50 [00:36<00:00,  1.38it/s]


MasaCtrl at denoising steps:  [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49]
MasaCtrl at U-Net layers:  [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15]
[✅ ControlNet] attention editor registered to 14 layers.


100%|██████████| 50/50 [00:36<00:00,  1.38it/s]


MasaCtrl at denoising steps:  [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49]
MasaCtrl at U-Net layers:  [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15]
[✅ ControlNet] attention editor registered to 14 layers.


100%|██████████| 50/50 [00:36<00:00,  1.38it/s]


MasaCtrl at denoising steps:  [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49]
MasaCtrl at U-Net layers:  [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15]
[✅ ControlNet] attention editor registered to 14 layers.


100%|██████████| 50/50 [00:36<00:00,  1.38it/s]


MasaCtrl at denoising steps:  [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49]
MasaCtrl at U-Net layers:  [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15]
[✅ ControlNet] attention editor registered to 14 layers.


100%|██████████| 50/50 [00:36<00:00,  1.38it/s]


MasaCtrl at denoising steps:  [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49]
MasaCtrl at U-Net layers:  [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15]
[✅ ControlNet] attention editor registered to 14 layers.


100%|██████████| 50/50 [00:36<00:00,  1.37it/s]


MasaCtrl at denoising steps:  [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49]
MasaCtrl at U-Net layers:  [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15]
[✅ ControlNet] attention editor registered to 14 layers.


100%|██████████| 50/50 [00:36<00:00,  1.37it/s]


MasaCtrl at denoising steps:  [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49]
MasaCtrl at U-Net layers:  [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15]
[✅ ControlNet] attention editor registered to 14 layers.


100%|██████████| 50/50 [00:36<00:00,  1.37it/s]


MasaCtrl at denoising steps:  [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49]
MasaCtrl at U-Net layers:  [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15]
[✅ ControlNet] attention editor registered to 14 layers.


100%|██████████| 50/50 [00:36<00:00,  1.37it/s]


MasaCtrl at denoising steps:  [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49]
MasaCtrl at U-Net layers:  [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15]
[✅ ControlNet] attention editor registered to 14 layers.


 48%|████▊     | 24/50 [00:18<00:20,  1.28it/s]


KeyboardInterrupt: 