diff --git a/src/diffusers/models/resnet.py b/src/diffusers/models/resnet.py
index 868e2e5fae2c..c47ade034893 100644
--- a/src/diffusers/models/resnet.py
+++ b/src/diffusers/models/resnet.py
@@ -985,7 +985,7 @@ class TemporalConvLayer(nn.Module):
         dropout (`float`, *optional*, defaults to `0.0`): The dropout probability to use.
     """

-    def __init__(self, in_dim: int, out_dim: Optional[int] = None, dropout: float = 0.0):
+    def __init__(self, in_dim: int, out_dim: Optional[int] = None, dropout: float = 0.0, norm_num_groups: int = 32):
         super().__init__()
         out_dim = out_dim or in_dim
         self.in_dim = in_dim
@@ -993,22 +993,22 @@ def __init__(self, in_dim: int, out_dim: Optional[int] = None, dropout: float =

         # conv layers
         self.conv1 = nn.Sequential(
-            nn.GroupNorm(32, in_dim), nn.SiLU(), nn.Conv3d(in_dim, out_dim, (3, 1, 1), padding=(1, 0, 0))
+            nn.GroupNorm(norm_num_groups, in_dim), nn.SiLU(), nn.Conv3d(in_dim, out_dim, (3, 1, 1), padding=(1, 0, 0))
         )
         self.conv2 = nn.Sequential(
-            nn.GroupNorm(32, out_dim),
+            nn.GroupNorm(norm_num_groups, out_dim),
             nn.SiLU(),
             nn.Dropout(dropout),
             nn.Conv3d(out_dim, in_dim, (3, 1, 1), padding=(1, 0, 0)),
         )
         self.conv3 = nn.Sequential(
-            nn.GroupNorm(32, out_dim),
+            nn.GroupNorm(norm_num_groups, out_dim),
             nn.SiLU(),
             nn.Dropout(dropout),
             nn.Conv3d(out_dim, in_dim, (3, 1, 1), padding=(1, 0, 0)),
         )
         self.conv4 = nn.Sequential(
-            nn.GroupNorm(32, out_dim),
+            nn.GroupNorm(norm_num_groups, out_dim),
             nn.SiLU(),
             nn.Dropout(dropout),
             nn.Conv3d(out_dim, in_dim, (3, 1, 1), padding=(1, 0, 0)),
diff --git a/src/diffusers/models/unet_3d_blocks.py b/src/diffusers/models/unet_3d_blocks.py
index e8e42cf5615f..97a1f1037c44 100644
--- a/src/diffusers/models/unet_3d_blocks.py
+++ b/src/diffusers/models/unet_3d_blocks.py
@@ -269,6 +269,7 @@ def __init__(
                 in_channels,
                 in_channels,
                 dropout=0.1,
+                norm_num_groups=resnet_groups,
             )
         ]
         attentions = []
@@ -316,6 +317,7 @@ def __init__(
                     in_channels,
                     in_channels,
                     dropout=0.1,
+                    norm_num_groups=resnet_groups,
                 )
             )

@@ -406,6 +408,7 @@ def __init__(
                     out_channels,
                     out_channels,
                     dropout=0.1,
+                    norm_num_groups=resnet_groups,
                 )
             )
             attentions.append(
@@ -529,6 +532,7 @@ def __init__(
                     out_channels,
                     out_channels,
                     dropout=0.1,
+                    norm_num_groups=resnet_groups,
                 )
             )

@@ -622,6 +626,7 @@ def __init__(
                     out_channels,
                     out_channels,
                     dropout=0.1,
+                    norm_num_groups=resnet_groups,
                 )
             )
             attentions.append(
@@ -764,6 +769,7 @@ def __init__(
                     out_channels,
                     out_channels,
                     dropout=0.1,
+                    norm_num_groups=resnet_groups,
                 )
             )

diff --git a/src/diffusers/models/unet_3d_condition.py b/src/diffusers/models/unet_3d_condition.py
index 7356fb577584..f7ba9388a831 100644
--- a/src/diffusers/models/unet_3d_condition.py
+++ b/src/diffusers/models/unet_3d_condition.py
@@ -173,6 +173,7 @@ def __init__(
             attention_head_dim=attention_head_dim,
             in_channels=block_out_channels[0],
             num_layers=1,
+            norm_num_groups=norm_num_groups,
         )

         # class embedding
diff --git a/tests/pipelines/text_to_video_synthesis/test_text_to_video.py b/tests/pipelines/text_to_video_synthesis/test_text_to_video.py
index 933583ce4b70..e9f435239c92 100644
--- a/tests/pipelines/text_to_video_synthesis/test_text_to_video.py
+++ b/tests/pipelines/text_to_video_synthesis/test_text_to_video.py
@@ -62,8 +62,8 @@ class TextToVideoSDPipelineFastTests(PipelineTesterMixin, unittest.TestCase):
     def get_dummy_components(self):
         torch.manual_seed(0)
         unet = UNet3DConditionModel(
-            block_out_channels=(32, 32),
-            layers_per_block=2,
+            block_out_channels=(4, 8),
+            layers_per_block=1,
             sample_size=32,
             in_channels=4,
             out_channels=4,
@@ -71,6 +71,7 @@ def get_dummy_components(self):
             up_block_types=("UpBlock3D", "CrossAttnUpBlock3D"),
             cross_attention_dim=4,
             attention_head_dim=4,
+            norm_num_groups=2,
         )
         scheduler = DDIMScheduler(
             beta_start=0.00085,
@@ -81,13 +82,14 @@ def get_dummy_components(self):
         )
         torch.manual_seed(0)
         vae = AutoencoderKL(
-            block_out_channels=(32,),
+            block_out_channels=(8,),
             in_channels=3,
             out_channels=3,
             down_block_types=["DownEncoderBlock2D"],
             up_block_types=["UpDecoderBlock2D"],
             latent_channels=4,
             sample_size=32,
+            norm_num_groups=2,
         )
         torch.manual_seed(0)
         text_encoder_config = CLIPTextConfig(
@@ -142,10 +144,11 @@ def test_text_to_video_default_case(self):
         image_slice = frames[0][-3:, -3:, -1]

         assert frames[0].shape == (32, 32, 3)
-        expected_slice = np.array([91.0, 152.0, 66.0, 192.0, 94.0, 126.0, 101.0, 123.0, 152.0])
+        expected_slice = np.array([192.0, 44.0, 157.0, 140.0, 108.0, 104.0, 123.0, 144.0, 129.0])

         assert np.abs(image_slice.flatten() - expected_slice).max() < 1e-2

+    @unittest.skipIf(torch_device != "cuda", reason="Feature isn't heavily used. Test in CUDA environment only.")
     def test_attention_slicing_forward_pass(self):
         self._test_attention_slicing_forward_pass(test_mean_pixel_difference=False, expected_max_diff=3e-3)

diff --git a/tests/pipelines/text_to_video_synthesis/test_video_to_video.py b/tests/pipelines/text_to_video_synthesis/test_video_to_video.py
index b5fe3451774b..1785eb967f16 100644
--- a/tests/pipelines/text_to_video_synthesis/test_video_to_video.py
+++ b/tests/pipelines/text_to_video_synthesis/test_video_to_video.py
@@ -70,15 +70,16 @@ class VideoToVideoSDPipelineFastTests(PipelineTesterMixin, unittest.TestCase):
     def get_dummy_components(self):
         torch.manual_seed(0)
         unet = UNet3DConditionModel(
-            block_out_channels=(32, 64, 64, 64),
-            layers_per_block=2,
+            block_out_channels=(4, 8),
+            layers_per_block=1,
             sample_size=32,
             in_channels=4,
             out_channels=4,
-            down_block_types=("CrossAttnDownBlock3D", "CrossAttnDownBlock3D", "CrossAttnDownBlock3D", "DownBlock3D"),
-            up_block_types=("UpBlock3D", "CrossAttnUpBlock3D", "CrossAttnUpBlock3D", "CrossAttnUpBlock3D"),
+            down_block_types=("CrossAttnDownBlock3D", "DownBlock3D"),
+            up_block_types=("UpBlock3D", "CrossAttnUpBlock3D"),
             cross_attention_dim=32,
             attention_head_dim=4,
+            norm_num_groups=2,
         )
         scheduler = DDIMScheduler(
             beta_start=0.00085,
@@ -89,13 +90,18 @@ def get_dummy_components(self):
         )
         torch.manual_seed(0)
         vae = AutoencoderKL(
-            block_out_channels=[32, 64],
+            block_out_channels=[
+                8,
+            ],
             in_channels=3,
             out_channels=3,
-            down_block_types=["DownEncoderBlock2D", "DownEncoderBlock2D"],
-            up_block_types=["UpDecoderBlock2D", "UpDecoderBlock2D"],
+            down_block_types=[
+                "DownEncoderBlock2D",
+            ],
+            up_block_types=["UpDecoderBlock2D"],
             latent_channels=4,
-            sample_size=128,
+            sample_size=32,
+            norm_num_groups=2,
         )
         torch.manual_seed(0)
         text_encoder_config = CLIPTextConfig(
@@ -154,7 +160,7 @@ def test_text_to_video_default_case(self):
         image_slice = frames[0][-3:, -3:, -1]

         assert frames[0].shape == (32, 32, 3)
-        expected_slice = np.array([106, 117, 113, 174, 137, 112, 148, 151, 131])
+        expected_slice = np.array([162.0, 136.0, 132.0, 140.0, 139.0, 137.0, 169.0, 134.0, 132.0])

         assert np.abs(image_slice.flatten() - expected_slice).max() < 1e-2
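Not part of the patch itself, but a minimal usage sketch of the `norm_num_groups` argument that the diff threads through `TemporalConvLayer`, the 3D UNet blocks, and `UNet3DConditionModel`. The channel counts below are illustrative (they only need to be divisible by the chosen number of groups), and the flattened `(batch * frames, channels, height, width)` input convention assumed for `TemporalConvLayer.forward` comes from the surrounding model code rather than from this diff.

```python
# Hypothetical sketch, assuming the diff above has been applied.
import torch

from diffusers import UNet3DConditionModel
from diffusers.models.resnet import TemporalConvLayer

# A small TemporalConvLayer: 8 channels with 2 GroupNorm groups instead of the
# previously hard-coded 32 groups (which forced channel counts >= 32).
layer = TemporalConvLayer(in_dim=8, out_dim=8, dropout=0.0, norm_num_groups=2)

# Assumed input convention: (batch * frames, channels, height, width).
sample = torch.randn(2, 8, 16, 16)
out = layer(sample, num_frames=1)
print(out.shape)  # expected: torch.Size([2, 8, 16, 16])

# A tiny UNet3DConditionModel mirroring the fast-test config from the diff;
# with norm_num_groups=2 the 4- and 8-channel blocks remain valid for GroupNorm.
unet = UNet3DConditionModel(
    block_out_channels=(4, 8),
    layers_per_block=1,
    sample_size=32,
    in_channels=4,
    out_channels=4,
    down_block_types=("CrossAttnDownBlock3D", "DownBlock3D"),
    up_block_types=("UpBlock3D", "CrossAttnUpBlock3D"),
    cross_attention_dim=4,
    attention_head_dim=4,
    norm_num_groups=2,
)
print(sum(p.numel() for p in unet.parameters()))  # far smaller than the old 32-channel dummy
```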