diff --git a/tests/pipelines/controlnet/test_controlnet_img2img.py b/tests/pipelines/controlnet/test_controlnet_img2img.py
index 3113836f5d0a..5a7f70eb488a 100644
--- a/tests/pipelines/controlnet/test_controlnet_img2img.py
+++ b/tests/pipelines/controlnet/test_controlnet_img2img.py
@@ -72,7 +72,7 @@ class ControlNetImg2ImgPipelineFastTests(
     def get_dummy_components(self):
         torch.manual_seed(0)
         unet = UNet2DConditionModel(
-            block_out_channels=(32, 64),
+            block_out_channels=(4, 8),
             layers_per_block=2,
             sample_size=32,
             in_channels=4,
@@ -80,15 +80,17 @@ def get_dummy_components(self):
             down_block_types=("DownBlock2D", "CrossAttnDownBlock2D"),
             up_block_types=("CrossAttnUpBlock2D", "UpBlock2D"),
             cross_attention_dim=32,
+            norm_num_groups=1,
         )
         torch.manual_seed(0)
         controlnet = ControlNetModel(
-            block_out_channels=(32, 64),
+            block_out_channels=(4, 8),
             layers_per_block=2,
             in_channels=4,
             down_block_types=("DownBlock2D", "CrossAttnDownBlock2D"),
             cross_attention_dim=32,
             conditioning_embedding_out_channels=(16, 32),
+            norm_num_groups=1,
         )
         torch.manual_seed(0)
         scheduler = DDIMScheduler(
@@ -100,12 +102,13 @@ def get_dummy_components(self):
         )
         torch.manual_seed(0)
         vae = AutoencoderKL(
-            block_out_channels=[32, 64],
+            block_out_channels=[4, 8],
             in_channels=3,
             out_channels=3,
             down_block_types=["DownEncoderBlock2D", "DownEncoderBlock2D"],
             up_block_types=["UpDecoderBlock2D", "UpDecoderBlock2D"],
             latent_channels=4,
+            norm_num_groups=2,
         )
         torch.manual_seed(0)
         text_encoder_config = CLIPTextConfig(
@@ -186,7 +189,7 @@ class StableDiffusionMultiControlNetPipelineFastTests(
     def get_dummy_components(self):
         torch.manual_seed(0)
         unet = UNet2DConditionModel(
-            block_out_channels=(32, 64),
+            block_out_channels=(4, 8),
             layers_per_block=2,
             sample_size=32,
             in_channels=4,
@@ -194,6 +197,7 @@ def get_dummy_components(self):
             down_block_types=("DownBlock2D", "CrossAttnDownBlock2D"),
             up_block_types=("CrossAttnUpBlock2D", "UpBlock2D"),
             cross_attention_dim=32,
+            norm_num_groups=1,
         )

         torch.manual_seed(0)
@@ -203,23 +207,25 @@ def init_weights(m):
                 m.bias.data.fill_(1.0)

         controlnet1 = ControlNetModel(
-            block_out_channels=(32, 64),
+            block_out_channels=(4, 8),
             layers_per_block=2,
             in_channels=4,
             down_block_types=("DownBlock2D", "CrossAttnDownBlock2D"),
             cross_attention_dim=32,
             conditioning_embedding_out_channels=(16, 32),
+            norm_num_groups=1,
         )
         controlnet1.controlnet_down_blocks.apply(init_weights)

         torch.manual_seed(0)
         controlnet2 = ControlNetModel(
-            block_out_channels=(32, 64),
+            block_out_channels=(4, 8),
             layers_per_block=2,
             in_channels=4,
             down_block_types=("DownBlock2D", "CrossAttnDownBlock2D"),
             cross_attention_dim=32,
             conditioning_embedding_out_channels=(16, 32),
+            norm_num_groups=1,
         )
         controlnet2.controlnet_down_blocks.apply(init_weights)

@@ -233,12 +239,13 @@ def init_weights(m):
         )
         torch.manual_seed(0)
         vae = AutoencoderKL(
-            block_out_channels=[32, 64],
+            block_out_channels=[4, 8],
             in_channels=3,
             out_channels=3,
             down_block_types=["DownEncoderBlock2D", "DownEncoderBlock2D"],
             up_block_types=["UpDecoderBlock2D", "UpDecoderBlock2D"],
             latent_channels=4,
+            norm_num_groups=2,
         )
         torch.manual_seed(0)
         text_encoder_config = CLIPTextConfig(
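
Note on the paired changes above (an observation, not part of the patch): the added norm_num_groups arguments follow from shrinking block_out_channels, because PyTorch's GroupNorm requires the normalized channel count to be divisible by the group count. With the old 32/64-channel blocks the diffusers default of 32 groups divides evenly; with 4- and 8-channel blocks only 1 group (UNet/ControlNet) or 2 groups (VAE) still do. A minimal standalone sketch of that constraint, independent of the test file:

import torch.nn as nn

# GroupNorm validates divisibility at construction time.
nn.GroupNorm(num_groups=1, num_channels=4)   # fine: matches the UNet/ControlNet override
nn.GroupNorm(num_groups=2, num_channels=8)   # fine: matches the VAE override
try:
    nn.GroupNorm(num_groups=32, num_channels=4)  # the previous default no longer divides 4
except ValueError as err:
    print(err)  # num_channels must be divisible by num_groups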