From 958224c13e50cc95fa493d46515bf1dc6d372d70 Mon Sep 17 00:00:00 2001
From: Aryan
Date: Sat, 17 Aug 2024 00:59:15 +0200
Subject: [PATCH 1/3] fix xformers tests

---
 src/diffusers/models/attention_processor.py |  5 +++
 .../test_animatediff_controlnet.py          |  8 +++++
 .../test_animatediff_sparsectrl.py          |  8 +++++
 tests/pipelines/cogvideox/test_cogvideox.py | 36 ++++++++++++++++++-
 tests/pipelines/test_pipelines_common.py    | 10 +++++-
 5 files changed, 65 insertions(+), 2 deletions(-)

diff --git a/src/diffusers/models/attention_processor.py b/src/diffusers/models/attention_processor.py
index e2ab1606b345..7733e3d2a366 100644
--- a/src/diffusers/models/attention_processor.py
+++ b/src/diffusers/models/attention_processor.py
@@ -2011,6 +2011,11 @@ def __call__(
         key = attn.head_to_batch_dim(key).contiguous()
         value = attn.head_to_batch_dim(value).contiguous()

+        if attn.norm_q is not None:
+            query = attn.norm_q(query)
+        if attn.norm_k is not None:
+            key = attn.norm_k(key)
+
         hidden_states = xformers.ops.memory_efficient_attention(
             query, key, value, attn_bias=attention_mask, op=self.attention_op, scale=attn.scale
         )
diff --git a/tests/pipelines/animatediff/test_animatediff_controlnet.py b/tests/pipelines/animatediff/test_animatediff_controlnet.py
index 72315bd0c965..3035fc1e3c61 100644
--- a/tests/pipelines/animatediff/test_animatediff_controlnet.py
+++ b/tests/pipelines/animatediff/test_animatediff_controlnet.py
@@ -20,6 +20,7 @@
 )
 from diffusers.models.attention import FreeNoiseTransformerBlock
 from diffusers.utils import logging
+from diffusers.utils.import_utils import is_xformers_available
 from diffusers.utils.testing_utils import torch_device

 from ..pipeline_params import TEXT_TO_IMAGE_BATCH_PARAMS, TEXT_TO_IMAGE_PARAMS
@@ -329,6 +330,13 @@ def test_prompt_embeds(self):
         inputs["prompt_embeds"] = torch.randn((1, 4, pipe.text_encoder.config.hidden_size), device=torch_device)
         pipe(**inputs)

+    @unittest.skipIf(
+        torch_device != "cuda" or not is_xformers_available(),
+        reason="XFormers attention is only available with CUDA and `xformers` installed",
+    )
+    def test_xformers_attention_forwardGenerator_pass(self):
+        super()._test_xformers_attention_forwardGenerator_pass(test_mean_pixel_difference=False)
+
     def test_free_init(self):
         components = self.get_dummy_components()
         pipe: AnimateDiffControlNetPipeline = self.pipeline_class(**components)
diff --git a/tests/pipelines/animatediff/test_animatediff_sparsectrl.py b/tests/pipelines/animatediff/test_animatediff_sparsectrl.py
index 5d8a7228118d..e4cc06e1e797 100644
--- a/tests/pipelines/animatediff/test_animatediff_sparsectrl.py
+++ b/tests/pipelines/animatediff/test_animatediff_sparsectrl.py
@@ -19,6 +19,7 @@
     UNetMotionModel,
 )
 from diffusers.utils import logging
+from diffusers.utils.import_utils import is_xformers_available
 from diffusers.utils.testing_utils import torch_device

 from ..pipeline_params import TEXT_TO_IMAGE_BATCH_PARAMS, TEXT_TO_IMAGE_PARAMS
@@ -393,6 +394,13 @@ def test_prompt_embeds(self):
         inputs["prompt_embeds"] = torch.randn((1, 4, pipe.text_encoder.config.hidden_size), device=torch_device)
         pipe(**inputs)

+    @unittest.skipIf(
+        torch_device != "cuda" or not is_xformers_available(),
+        reason="XFormers attention is only available with CUDA and `xformers` installed",
+    )
+    def test_xformers_attention_forwardGenerator_pass(self):
+        super()._test_xformers_attention_forwardGenerator_pass(test_mean_pixel_difference=False)
+
     def test_free_init(self):
         components = self.get_dummy_components()
         pipe: AnimateDiffSparseControlNetPipeline = self.pipeline_class(**components)
diff --git a/tests/pipelines/cogvideox/test_cogvideox.py b/tests/pipelines/cogvideox/test_cogvideox.py
index 3ae500eb9567..2196dcf86d40 100644
--- a/tests/pipelines/cogvideox/test_cogvideox.py
+++ b/tests/pipelines/cogvideox/test_cogvideox.py
@@ -30,7 +30,7 @@
 )

 from ..pipeline_params import TEXT_TO_IMAGE_BATCH_PARAMS, TEXT_TO_IMAGE_IMAGE_PARAMS, TEXT_TO_IMAGE_PARAMS
-from ..test_pipelines_common import PipelineTesterMixin, to_np
+from ..test_pipelines_common import PipelineTesterMixin, assert_mean_pixel_difference, to_np


 enable_full_determinism()
@@ -275,6 +275,40 @@ def test_vae_tiling(self, expected_diff_max: float = 0.2):
             "VAE tiling should not affect the inference results",
         )

+    def test_xformers_attention_forwardGenerator_pass(
+        self, test_max_difference=True, test_mean_pixel_difference=True, expected_max_diff=1e-4
+    ):
+        if not self.test_xformers_attention:
+            return
+
+        components = self.get_dummy_components()
+        pipe = self.pipeline_class(**components)
+        for component in pipe.components.values():
+            if hasattr(component, "set_default_attn_processor"):
+                component.set_default_attn_processor()
+        pipe.to(torch_device)
+        pipe.set_progress_bar_config(disable=None)
+
+        inputs = self.get_dummy_inputs(torch_device)
+        output_without_offload = pipe(**inputs)[0].permute(0, 1, 3, 4, 2)
+        output_without_offload = (
+            output_without_offload.cpu() if torch.is_tensor(output_without_offload) else output_without_offload
+        )
+
+        pipe.enable_xformers_memory_efficient_attention()
+        inputs = self.get_dummy_inputs(torch_device)
+        output_with_offload = pipe(**inputs)[0].permute(0, 1, 3, 4, 2)  # [B, F, H, W, C]
+        output_with_offload = (
+            output_with_offload.cpu() if torch.is_tensor(output_with_offload) else output_with_offload
+        )
+
+        if test_max_difference:
+            max_diff = np.abs(to_np(output_with_offload) - to_np(output_without_offload)).max()
+            self.assertLess(max_diff, expected_max_diff, "XFormers attention should not affect the inference results")
+
+        if test_mean_pixel_difference:
+            assert_mean_pixel_difference(to_np(output_with_offload[0][0]), to_np(output_without_offload[0][0]))
+

 @slow
 @require_torch_gpu
diff --git a/tests/pipelines/test_pipelines_common.py b/tests/pipelines/test_pipelines_common.py
index c3384e6b4664..cb6574802d00 100644
--- a/tests/pipelines/test_pipelines_common.py
+++ b/tests/pipelines/test_pipelines_common.py
@@ -1687,7 +1687,15 @@ def _test_xformers_attention_forwardGenerator_pass(
         self.assertLess(max_diff, expected_max_diff, "XFormers attention should not affect the inference results")

         if test_mean_pixel_difference:
-            assert_mean_pixel_difference(output_with_offload[0], output_without_offload[0])
+            if torch.is_tensor(output_without_offload):
+                if output_without_offload.ndim == 5:
+                    # Educated guess that the original format here is [B, F, C, H, W] and we
+                    # permute to [B, F, H, W, C] to make input compatible with mean pixel difference
+                    output_without_offload = output_without_offload.permute(0, 1, 3, 4, 2)[0]
+                    output_with_offload = output_with_offload.permute(0, 1, 3, 4, 2)[0]
+                output_without_offload = to_np(output_without_offload)
+                output_with_offload = to_np(output_with_offload)
+            assert_mean_pixel_difference(to_np(output_with_offload[0]), to_np(output_without_offload[0]))

     def test_progress_bar(self):
         components = self.get_dummy_components()

From a360039ac1a33a179455354dadb3ef5d952c6965 Mon Sep 17 00:00:00 2001
From: Aryan
Date: Sat, 17 Aug 2024 01:03:10 +0200
Subject: [PATCH 2/3] remove unnecessary modifications to cogvideox tests

---
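Note: the override removed below duplicated work the shared PipelineTesterMixin already does. The mixin's `test_xformers_attention` flag (consulted at the top of `_test_xformers_attention_forwardGenerator_pass`, as visible in the removed hunk) lets a subclass opt out without any override. A minimal sketch, assuming a hypothetical tester class name; the flag and helper names are the real mixin attributes shown in the diffs:

    import unittest

    from ..test_pipelines_common import PipelineTesterMixin


    class MyVideoPipelineFastTests(PipelineTesterMixin, unittest.TestCase):
        # Hypothetical tester class for illustration. Setting the mixin flag to
        # False makes _test_xformers_attention_forwardGenerator_pass return
        # early, so no per-pipeline override of the test is needed.
        test_xformers_attention = False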
 tests/pipelines/cogvideox/test_cogvideox.py | 36 +--------------------
 1 file changed, 1 insertion(+), 35 deletions(-)

diff --git a/tests/pipelines/cogvideox/test_cogvideox.py b/tests/pipelines/cogvideox/test_cogvideox.py
index 2196dcf86d40..3ae500eb9567 100644
--- a/tests/pipelines/cogvideox/test_cogvideox.py
+++ b/tests/pipelines/cogvideox/test_cogvideox.py
@@ -30,7 +30,7 @@
 )

 from ..pipeline_params import TEXT_TO_IMAGE_BATCH_PARAMS, TEXT_TO_IMAGE_IMAGE_PARAMS, TEXT_TO_IMAGE_PARAMS
-from ..test_pipelines_common import PipelineTesterMixin, assert_mean_pixel_difference, to_np
+from ..test_pipelines_common import PipelineTesterMixin, to_np


 enable_full_determinism()
@@ -275,40 +275,6 @@ def test_vae_tiling(self, expected_diff_max: float = 0.2):
             "VAE tiling should not affect the inference results",
         )

-    def test_xformers_attention_forwardGenerator_pass(
-        self, test_max_difference=True, test_mean_pixel_difference=True, expected_max_diff=1e-4
-    ):
-        if not self.test_xformers_attention:
-            return
-
-        components = self.get_dummy_components()
-        pipe = self.pipeline_class(**components)
-        for component in pipe.components.values():
-            if hasattr(component, "set_default_attn_processor"):
-                component.set_default_attn_processor()
-        pipe.to(torch_device)
-        pipe.set_progress_bar_config(disable=None)
-
-        inputs = self.get_dummy_inputs(torch_device)
-        output_without_offload = pipe(**inputs)[0].permute(0, 1, 3, 4, 2)
-        output_without_offload = (
-            output_without_offload.cpu() if torch.is_tensor(output_without_offload) else output_without_offload
-        )
-
-        pipe.enable_xformers_memory_efficient_attention()
-        inputs = self.get_dummy_inputs(torch_device)
-        output_with_offload = pipe(**inputs)[0].permute(0, 1, 3, 4, 2)  # [B, F, H, W, C]
-        output_with_offload = (
-            output_with_offload.cpu() if torch.is_tensor(output_with_offload) else output_with_offload
-        )
-
-        if test_max_difference:
-            max_diff = np.abs(to_np(output_with_offload) - to_np(output_without_offload)).max()
-            self.assertLess(max_diff, expected_max_diff, "XFormers attention should not affect the inference results")
-
-        if test_mean_pixel_difference:
-            assert_mean_pixel_difference(to_np(output_with_offload[0][0]), to_np(output_without_offload[0][0]))
-

 @slow
 @require_torch_gpu

From 836bb0244fe195a9308b889b42eb636684c3b249 Mon Sep 17 00:00:00 2001
From: Aryan
Date: Wed, 21 Aug 2024 22:48:11 +0200
Subject: [PATCH 3/3] update

---
 src/diffusers/models/attention_processor.py |  5 -----
 tests/pipelines/cogvideox/test_cogvideox.py |  4 ++++
 tests/pipelines/latte/test_latte.py         |  8 ++++++++
 tests/pipelines/test_pipelines_common.py    | 10 +---------
 4 files changed, 13 insertions(+), 14 deletions(-)

diff --git a/src/diffusers/models/attention_processor.py b/src/diffusers/models/attention_processor.py
index def6d6cd68f7..fc225567ddc1 100644
--- a/src/diffusers/models/attention_processor.py
+++ b/src/diffusers/models/attention_processor.py
@@ -1926,11 +1926,6 @@ def __call__(
         key = attn.head_to_batch_dim(key).contiguous()
         value = attn.head_to_batch_dim(value).contiguous()

-        if attn.norm_q is not None:
-            query = attn.norm_q(query)
-        if attn.norm_k is not None:
-            key = attn.norm_k(key)
-
         hidden_states = xformers.ops.memory_efficient_attention(
             query, key, value, attn_bias=attention_mask, op=self.attention_op, scale=attn.scale
         )
diff --git a/tests/pipelines/cogvideox/test_cogvideox.py b/tests/pipelines/cogvideox/test_cogvideox.py
index 3ae500eb9567..17d0d8f21d5c 100644
--- a/tests/pipelines/cogvideox/test_cogvideox.py
+++ b/tests/pipelines/cogvideox/test_cogvideox.py
@@ -275,6 +275,10 @@ def test_vae_tiling(self, expected_diff_max: float = 0.2):
             "VAE tiling should not affect the inference results",
         )

+    @unittest.skip("xformers attention processor does not exist for CogVideoX")
+    def test_xformers_attention_forwardGenerator_pass(self):
+        pass
+

 @slow
 @require_torch_gpu
diff --git a/tests/pipelines/latte/test_latte.py b/tests/pipelines/latte/test_latte.py
index 94ff7fc0faf9..9667ebff249d 100644
--- a/tests/pipelines/latte/test_latte.py
+++ b/tests/pipelines/latte/test_latte.py
@@ -28,6 +28,7 @@
     LattePipeline,
     LatteTransformer3DModel,
 )
+from diffusers.utils.import_utils import is_xformers_available
 from diffusers.utils.testing_utils import (
     enable_full_determinism,
     numpy_cosine_similarity_distance,
@@ -256,6 +257,13 @@ def test_save_load_optional_components(self):
         max_diff = np.abs(to_np(output) - to_np(output_loaded)).max()
         self.assertLess(max_diff, 1.0)

+    @unittest.skipIf(
+        torch_device != "cuda" or not is_xformers_available(),
+        reason="XFormers attention is only available with CUDA and `xformers` installed",
+    )
+    def test_xformers_attention_forwardGenerator_pass(self):
+        super()._test_xformers_attention_forwardGenerator_pass(test_mean_pixel_difference=False)
+

 @slow
 @require_torch_gpu
diff --git a/tests/pipelines/test_pipelines_common.py b/tests/pipelines/test_pipelines_common.py
index cb6574802d00..c3384e6b4664 100644
--- a/tests/pipelines/test_pipelines_common.py
+++ b/tests/pipelines/test_pipelines_common.py
@@ -1687,15 +1687,7 @@ def _test_xformers_attention_forwardGenerator_pass(
         self.assertLess(max_diff, expected_max_diff, "XFormers attention should not affect the inference results")

         if test_mean_pixel_difference:
-            if torch.is_tensor(output_without_offload):
-                if output_without_offload.ndim == 5:
-                    # Educated guess that the original format here is [B, F, C, H, W] and we
-                    # permute to [B, F, H, W, C] to make input compatible with mean pixel difference
-                    output_without_offload = output_without_offload.permute(0, 1, 3, 4, 2)[0]
-                    output_with_offload = output_with_offload.permute(0, 1, 3, 4, 2)[0]
-                output_without_offload = to_np(output_without_offload)
-                output_with_offload = to_np(output_with_offload)
-            assert_mean_pixel_difference(to_np(output_with_offload[0]), to_np(output_without_offload[0]))
+            assert_mean_pixel_difference(output_with_offload[0], output_without_offload[0])

     def test_progress_bar(self):
         components = self.get_dummy_components()
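
For context beyond the diffs above: the xformers path these tests exercise reduces to one `xformers.ops.memory_efficient_attention` call on query/key/value tensors in the `(batch * heads, seq_len, head_dim)` layout that `head_to_batch_dim` produces, as the attention_processor.py context lines show. A minimal standalone sketch of that call; the shapes, dtype, and device are illustrative assumptions, not values from the test suite:

    import torch
    import xformers.ops

    # Assumed illustrative layout: 2 samples x 8 heads folded into the batch
    # dimension, 64 tokens, head dimension 40 -> (batch * heads, seq_len, head_dim).
    # Requires a CUDA device with `xformers` installed, matching the skip
    # condition used in the tests above.
    query = torch.randn(16, 64, 40, device="cuda", dtype=torch.float16).contiguous()
    key = torch.randn(16, 64, 40, device="cuda", dtype=torch.float16).contiguous()
    value = torch.randn(16, 64, 40, device="cuda", dtype=torch.float16).contiguous()

    # attn_bias may be None or an additive mask broadcastable to
    # (batch * heads, seq_len, seq_len); scale=None uses the default
    # 1/sqrt(head_dim).
    out = xformers.ops.memory_efficient_attention(query, key, value, attn_bias=None, scale=None)
    print(out.shape)  # torch.Size([16, 64, 40])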