From 9ee30e566e4c637a64b0fdc60589ce309aeb684c Mon Sep 17 00:00:00 2001
From: Aryan
Date: Sun, 4 Aug 2024 00:30:42 +0200
Subject: [PATCH 1/2] allow sparsectrl to be loaded with single file

---
 src/diffusers/loaders/single_file_model.py    |  3 +++
 src/diffusers/loaders/single_file_utils.py    | 14 ++++++++++++--
 src/diffusers/models/controlnet_sparsectrl.py |  3 ++-
 3 files changed, 17 insertions(+), 3 deletions(-)

diff --git a/src/diffusers/loaders/single_file_model.py b/src/diffusers/loaders/single_file_model.py
index 92438620abd8..243e61eeed31 100644
--- a/src/diffusers/loaders/single_file_model.py
+++ b/src/diffusers/loaders/single_file_model.py
@@ -74,6 +74,9 @@
     "MotionAdapter": {
         "checkpoint_mapping_fn": convert_animatediff_checkpoint_to_diffusers,
     },
+    "SparseControlNetModel": {
+        "checkpoint_mapping_fn": convert_animatediff_checkpoint_to_diffusers,
+    },
 }

diff --git a/src/diffusers/loaders/single_file_utils.py b/src/diffusers/loaders/single_file_utils.py
index 483125f24825..69fcfde98b45 100644
--- a/src/diffusers/loaders/single_file_utils.py
+++ b/src/diffusers/loaders/single_file_utils.py
@@ -74,9 +74,11 @@
     "stable_cascade_stage_b": "down_blocks.1.0.channelwise.0.weight",
     "stable_cascade_stage_c": "clip_txt_mapper.weight",
     "sd3": "model.diffusion_model.joint_blocks.0.context_block.adaLN_modulation.1.bias",
-    "animatediff": "down_blocks.0.motion_modules.0.temporal_transformer.transformer_blocks.0.attention_blocks.1.pos_encoder.pe",
+    "animatediff": "down_blocks.0.motion_modules.0.temporal_transformer.transformer_blocks.0.attention_blocks.0.pos_encoder.pe",
     "animatediff_v2": "mid_block.motion_modules.0.temporal_transformer.norm.bias",
     "animatediff_sdxl_beta": "up_blocks.2.motion_modules.0.temporal_transformer.norm.weight",
+    "animatediff_scribble": "controlnet_cond_embedding.conv_in.weight",
+    "animatediff_rgb": "controlnet_cond_embedding.weight",
 }

 DIFFUSERS_DEFAULT_PIPELINE_PATHS = {
@@ -110,6 +112,8 @@
     "animatediff_v2": {"pretrained_model_name_or_path": "guoyww/animatediff-motion-adapter-v1-5-2"},
     "animatediff_v3": {"pretrained_model_name_or_path": "guoyww/animatediff-motion-adapter-v1-5-3"},
     "animatediff_sdxl_beta": {"pretrained_model_name_or_path": "guoyww/animatediff-motion-adapter-sdxl-beta"},
+    "animatediff_scribble": {"pretrained_model_name_or_path": "guoyww/animatediff-sparsectrl-scribble"},
+    "animatediff_rgb": {"pretrained_model_name_or_path": "guoyww/animatediff-sparsectrl-rgb"},
 }

 # Use to configure model sample size when original config is provided
@@ -491,7 +495,13 @@ def infer_diffusers_model_type(checkpoint):
         model_type = "sd3"

     elif CHECKPOINT_KEY_NAMES["animatediff"] in checkpoint:
-        if CHECKPOINT_KEY_NAMES["animatediff_v2"] in checkpoint:
+        if CHECKPOINT_KEY_NAMES["animatediff_scribble"] in checkpoint:
+            model_type = "animatediff_scribble"
+
+        elif CHECKPOINT_KEY_NAMES["animatediff_rgb"] in checkpoint:
+            model_type = "animatediff_rgb"
+
+        elif CHECKPOINT_KEY_NAMES["animatediff_v2"] in checkpoint:
             model_type = "animatediff_v2"

         elif checkpoint[CHECKPOINT_KEY_NAMES["animatediff_sdxl_beta"]].shape[-1] == 320:
diff --git a/src/diffusers/models/controlnet_sparsectrl.py b/src/diffusers/models/controlnet_sparsectrl.py
index cb577e33c670..7db793847232 100644
--- a/src/diffusers/models/controlnet_sparsectrl.py
+++ b/src/diffusers/models/controlnet_sparsectrl.py
@@ -20,6 +20,7 @@
 from torch.nn import functional as F

 from ..configuration_utils import ConfigMixin, register_to_config
+from ..loaders import FromOriginalModelMixin
 from ..utils import BaseOutput, logging
 from .attention_processor import (
     ADDED_KV_ATTENTION_PROCESSORS,
@@ -92,7 +93,7 @@ def forward(self, conditioning: torch.Tensor) -> torch.Tensor:
         return embedding


-class SparseControlNetModel(ModelMixin, ConfigMixin):
+class SparseControlNetModel(ModelMixin, ConfigMixin, FromOriginalModelMixin):
     """
     A SparseControlNet model as described in [SparseCtrl: Adding Sparse Controls to Text-to-Video Diffusion
     Models](https://arxiv.org/abs/2311.16933).

From 0a0ce1194eb5336d1bee0b0bf74e16b5e0242c69 Mon Sep 17 00:00:00 2001
From: Dhruv Nair
Date: Mon, 5 Aug 2024 17:23:58 +0000
Subject: [PATCH 2/2] update

---
 src/diffusers/models/controlnet_sparsectrl.py   | 2 ++
 src/diffusers/models/unets/unet_motion_model.py | 4 ++++
 2 files changed, 6 insertions(+)

diff --git a/src/diffusers/models/controlnet_sparsectrl.py b/src/diffusers/models/controlnet_sparsectrl.py
index 7db793847232..e91551c70953 100644
--- a/src/diffusers/models/controlnet_sparsectrl.py
+++ b/src/diffusers/models/controlnet_sparsectrl.py
@@ -315,6 +315,7 @@ def __init__(
                     temporal_num_attention_heads=motion_num_attention_heads[i],
                     temporal_max_seq_length=motion_max_seq_length,
                     temporal_transformer_layers_per_block=temporal_transformer_layers_per_block[i],
+                    temporal_double_self_attention=False,
                 )
             elif down_block_type == "DownBlockMotion":
                 down_block = DownBlockMotion(
@@ -332,6 +333,7 @@ def __init__(
                     temporal_num_attention_heads=motion_num_attention_heads[i],
                     temporal_max_seq_length=motion_max_seq_length,
                     temporal_transformer_layers_per_block=temporal_transformer_layers_per_block[i],
+                    temporal_double_self_attention=False,
                 )
             else:
                 raise ValueError(
diff --git a/src/diffusers/models/unets/unet_motion_model.py b/src/diffusers/models/unets/unet_motion_model.py
index e96867bc3ed0..c1035b3ea38d 100644
--- a/src/diffusers/models/unets/unet_motion_model.py
+++ b/src/diffusers/models/unets/unet_motion_model.py
@@ -233,6 +233,7 @@ def __init__(
         temporal_cross_attention_dim: Optional[int] = None,
         temporal_max_seq_length: int = 32,
         temporal_transformer_layers_per_block: Union[int, Tuple[int]] = 1,
+        temporal_double_self_attention: bool = True,
     ):
         super().__init__()
         resnets = []
@@ -282,6 +283,7 @@ def __init__(
                         positional_embeddings="sinusoidal",
                         num_positional_embeddings=temporal_max_seq_length,
                         attention_head_dim=out_channels // temporal_num_attention_heads[i],
+                        double_self_attention=temporal_double_self_attention,
                     )
                 )
@@ -384,6 +386,7 @@ def __init__(
         temporal_num_attention_heads: int = 8,
         temporal_max_seq_length: int = 32,
         temporal_transformer_layers_per_block: Union[int, Tuple[int]] = 1,
+        temporal_double_self_attention: bool = True,
     ):
         super().__init__()
         resnets = []
@@ -465,6 +468,7 @@ def __init__(
                     positional_embeddings="sinusoidal",
                     num_positional_embeddings=temporal_max_seq_length,
                     attention_head_dim=out_channels // temporal_num_attention_heads,
+                    double_self_attention=temporal_double_self_attention,
                 )
             )
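
Usage sketch (not part of the patch): with both commits applied, SparseControlNetModel inherits from_single_file from FromOriginalModelMixin, and infer_diffusers_model_type matches the controlnet_cond_embedding keys registered above to pick the scribble or RGB config automatically. A minimal example follows; the checkpoint URL is an assumption for illustration, and any local SparseCtrl .ckpt/.safetensors file should load the same way.

    import torch

    from diffusers import SparseControlNetModel

    # Assumed location of the original SparseCtrl scribble weights; substitute
    # your own path or URL to a single-file checkpoint.
    ckpt_path = "https://huggingface.co/guoyww/animatediff/blob/main/v3_sd15_sparsectrl_scribble.ckpt"

    # The checkpoint type is inferred from its state-dict keys (see
    # infer_diffusers_model_type above), so no explicit config is needed; the
    # mapped convert_animatediff_checkpoint_to_diffusers handles key renaming.
    controlnet = SparseControlNetModel.from_single_file(ckpt_path, torch_dtype=torch.float16)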