from stable_diffusion_xl_reference import StableDiffusionXLReferencePipeline
from pipeline_stable_diffusion_xl_instantid import StableDiffusionXLInstantIDPipeline, draw_kps
# Reproduction script: build an InstantID SDXL pipeline that reuses the
# components of an already-loaded SDXL reference pipeline.
controlnet_path = 'path/to/instant/id'

# Load IdentityNet (the ControlNet used by InstantID).
# NOTE(review): `pipe` is moved to CUDA below but this model is not; it
# presumably needs the same device before inference - confirm.
identityNet = ControlNetModel.from_pretrained(
    controlnet_path,
    torch_dtype=torch.float16,
)

# Load the SDXL reference pipeline whose sub-models are shared below.
pipe = StableDiffusionXLReferencePipeline.from_pretrained(
    "../path/to/model",
    torch_dtype=torch.float16,
    variant="fp16",
).to('cuda')
pipe.scheduler = UniPCMultistepScheduler.from_config(pipe.scheduler.config)

# Every component is passed by keyword. The constructor's first positional
# parameter is `vae`, so passing `pipe` positionally registered the whole
# pipeline as the VAE; `self.vae.config` was then the pipeline's FrozenDict,
# which has no `block_out_channels` (the reported AttributeError).
pipe_instant = StableDiffusionXLInstantIDPipeline(
    vae=pipe.vae,
    text_encoder=pipe.text_encoder,
    text_encoder_2=pipe.text_encoder_2,
    tokenizer=pipe.tokenizer,
    tokenizer_2=pipe.tokenizer_2,
    unet=pipe.unet,
    scheduler=pipe.scheduler,
    feature_extractor=pipe.feature_extractor,
    controlnet=[identityNet],
)
{
"name": "AttributeError",
"message": "'FrozenDict' object has no attribute 'block_out_channels'",
"stack": "\u001b[1;31m---------------------------------------------------------------------------\u001b[0m\n\u001b[1;31mAttributeError\u001b[0m Traceback (most recent call last)\nCell \u001b[1;32mIn[3], line 28\u001b[0m\n\u001b[0;32m 20\u001b[0m pipe \u001b[38;5;241m=\u001b[39m StableDiffusionXLReferencePipeline\u001b[38;5;241m.\u001b[39mfrom_pretrained(\n\u001b[0;32m 21\u001b[0m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124m../models/StableDiffusion/RealvisXLv40_lightning\u001b[39m\u001b[38;5;124m\"\u001b[39m,\n\u001b[0;32m 22\u001b[0m torch_dtype\u001b[38;5;241m=\u001b[39mtorch\u001b[38;5;241m.\u001b[39mfloat16,\n\u001b[0;32m 23\u001b[0m \u001b[38;5;66;03m#use_safetensors=True,\u001b[39;00m\n\u001b[0;32m 24\u001b[0m variant\u001b[38;5;241m=\u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mfp16\u001b[39m\u001b[38;5;124m\"\u001b[39m)\u001b[38;5;241m.\u001b[39mto(\u001b[38;5;124m'\u001b[39m\u001b[38;5;124mcuda\u001b[39m\u001b[38;5;124m'\u001b[39m)\n\u001b[0;32m 26\u001b[0m pipe\u001b[38;5;241m.\u001b[39mscheduler \u001b[38;5;241m=\u001b[39m UniPCMultistepScheduler\u001b[38;5;241m.\u001b[39mfrom_config(pipe\u001b[38;5;241m.\u001b[39mscheduler\u001b[38;5;241m.\u001b[39mconfig)\n\u001b[1;32m---> 28\u001b[0m pipe_instant \u001b[38;5;241m=\u001b[39m \u001b[43mStableDiffusionXLInstantIDPipeline\u001b[49m\u001b[43m(\u001b[49m\n\u001b[0;32m 29\u001b[0m \u001b[43m \u001b[49m\u001b[43mpipe\u001b[49m\u001b[43m,\u001b[49m\n\u001b[0;32m 30\u001b[0m \u001b[43m \u001b[49m\u001b[38;5;66;43;03m#vae = pipe.vae, \u001b[39;49;00m\n\u001b[0;32m 31\u001b[0m \u001b[43m \u001b[49m\u001b[43mtext_encoder\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43m \u001b[49m\u001b[43mpipe\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mtext_encoder\u001b[49m\u001b[43m,\u001b[49m\n\u001b[0;32m 32\u001b[0m \u001b[43m \u001b[49m\u001b[43mtext_encoder_2\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43m 
\u001b[49m\u001b[43mpipe\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mtext_encoder_2\u001b[49m\u001b[43m,\u001b[49m\n\u001b[0;32m 33\u001b[0m \u001b[43m \u001b[49m\u001b[43mtokenizer\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43m \u001b[49m\u001b[43mpipe\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mtokenizer\u001b[49m\u001b[43m,\u001b[49m\n\u001b[0;32m 34\u001b[0m \u001b[43m \u001b[49m\u001b[43mtokenizer_2\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43m \u001b[49m\u001b[43mpipe\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mtokenizer_2\u001b[49m\u001b[43m,\u001b[49m\n\u001b[0;32m 35\u001b[0m \u001b[43m \u001b[49m\u001b[43munet\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43m \u001b[49m\u001b[43mpipe\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43munet\u001b[49m\u001b[43m,\u001b[49m\n\u001b[0;32m 36\u001b[0m \u001b[43m \u001b[49m\u001b[43mscheduler\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43m \u001b[49m\u001b[43mpipe\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mscheduler\u001b[49m\u001b[43m,\u001b[49m\n\u001b[0;32m 37\u001b[0m \u001b[43m \u001b[49m\u001b[38;5;66;43;03m#safety_checker = pipe.safety_checker,\u001b[39;49;00m\n\u001b[0;32m 38\u001b[0m \u001b[43m \u001b[49m\u001b[43mfeature_extractor\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43m \u001b[49m\u001b[43mpipe\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mfeature_extractor\u001b[49m\u001b[43m,\u001b[49m\n\u001b[0;32m 39\u001b[0m \u001b[43m \u001b[49m\u001b[43mcontrolnet\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43m \u001b[49m\u001b[43m[\u001b[49m\u001b[43midentityNet\u001b[49m\u001b[43m]\u001b[49m\u001b[43m,\u001b[49m\n\u001b[0;32m 40\u001b[0m \u001b[43m \u001b[49m\u001b[38;5;66;43;03m#torch_dtype=torch.float16\u001b[39;49;00m\n\u001b[0;32m 41\u001b[0m \u001b[43m)\u001b[49m\n\u001b[0;32m 44\u001b[0m 
\u001b[38;5;124;03m\"\"\"\u001b[39;00m\n\u001b[0;32m 45\u001b[0m \u001b[38;5;124;03mresult_img = pipe_instant(ref_image=input_image,\u001b[39;00m\n\u001b[0;32m 46\u001b[0m \u001b[38;5;124;03m prompt=\"1girl\",\u001b[39;00m\n\u001b[1;32m (...)\u001b[0m\n\u001b[0;32m 55\u001b[0m \u001b[38;5;124;03mresult_img.show()\u001b[39;00m\n\u001b[0;32m 56\u001b[0m \u001b[38;5;124;03m\"\"\"\u001b[39;00m\n\nFile \u001b[1;32me:\\conda\\envs\\rayban\\lib\\site-packages\\diffusers\\pipelines\\controlnet\\pipeline_controlnet_sd_xl.py:211\u001b[0m, in \u001b[0;36mStableDiffusionXLControlNetPipeline.__init__\u001b[1;34m(self, vae, text_encoder, text_encoder_2, tokenizer, tokenizer_2, unet, controlnet, scheduler, force_zeros_for_empty_prompt, add_watermarker, feature_extractor, image_encoder)\u001b[0m\n\u001b[0;32m 197\u001b[0m controlnet \u001b[38;5;241m=\u001b[39m MultiControlNetModel(controlnet)\n\u001b[0;32m 199\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mregister_modules(\n\u001b[0;32m 200\u001b[0m vae\u001b[38;5;241m=\u001b[39mvae,\n\u001b[0;32m 201\u001b[0m text_encoder\u001b[38;5;241m=\u001b[39mtext_encoder,\n\u001b[1;32m (...)\u001b[0m\n\u001b[0;32m 209\u001b[0m image_encoder\u001b[38;5;241m=\u001b[39mimage_encoder,\n\u001b[0;32m 210\u001b[0m )\n\u001b[1;32m--> 211\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mvae_scale_factor \u001b[38;5;241m=\u001b[39m \u001b[38;5;241m2\u001b[39m \u001b[38;5;241m*\u001b[39m\u001b[38;5;241m*\u001b[39m (\u001b[38;5;28mlen\u001b[39m(\u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mvae\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mconfig\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mblock_out_channels\u001b[49m) \u001b[38;5;241m-\u001b[39m \u001b[38;5;241m1\u001b[39m)\n\u001b[0;32m 212\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mimage_processor \u001b[38;5;241m=\u001b[39m 
VaeImageProcessor(vae_scale_factor\u001b[38;5;241m=\u001b[39m\u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mvae_scale_factor, do_convert_rgb\u001b[38;5;241m=\u001b[39m\u001b[38;5;28;01mTrue\u001b[39;00m)\n\u001b[0;32m 213\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mcontrol_image_processor \u001b[38;5;241m=\u001b[39m VaeImageProcessor(\n\u001b[0;32m 214\u001b[0m vae_scale_factor\u001b[38;5;241m=\u001b[39m\u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mvae_scale_factor, do_convert_rgb\u001b[38;5;241m=\u001b[39m\u001b[38;5;28;01mTrue\u001b[39;00m, do_normalize\u001b[38;5;241m=\u001b[39m\u001b[38;5;28;01mFalse\u001b[39;00m\n\u001b[0;32m 215\u001b[0m )\n\n\u001b[1;31mAttributeError\u001b[0m: 'FrozenDict' object has no attribute 'block_out_channels'"
}
Describe the bug
I cannot use both Stable Diffusion XL Reference and InstantID in the same pipeline. Constructing the combined pipeline fails with:
AttributeError: 'FrozenDict' object has no attribute 'block_out_channels'
Reproduction
Logs
System Info
diffusers version: 0.25.0
Who can help?
@yiyixuxu @sayakpaul @DN6 @stevhliu