From 48e2de84ec7aa0963e4753fa337076022e17b119 Mon Sep 17 00:00:00 2001
From: Dhruv Nair <dhruv.nair@gmail.com>
Date: Wed, 20 Sep 2023 08:35:50 +0000
Subject: [PATCH 01/22] pipeline fetcher

---
 .../fetch_torch_cuda_pipeline_test_matrix.py  | 87 +++++++++++++++++++
 1 file changed, 87 insertions(+)
 create mode 100644 utils/fetch_torch_cuda_pipeline_test_matrix.py

diff --git a/utils/fetch_torch_cuda_pipeline_test_matrix.py b/utils/fetch_torch_cuda_pipeline_test_matrix.py
new file mode 100644
index 000000000000..97391f086d8f
--- /dev/null
+++ b/utils/fetch_torch_cuda_pipeline_test_matrix.py
@@ -0,0 +1,87 @@
+import os
+from collections import defaultdict
+
+from huggingface_hub import HfApi, ModelFilter
+
+import diffusers
+
+
+ALWAYS_TEST_PIPELINE_MODULES = [
+    "alt_diffusion",
+    "audio_diffusion",
+    "controlnet",
+    "consistency_models",
+    "dit",
+    "dance diffusion",
+    "stable_diffusion",
+    "stable_diffusion_2",
+    "stable_diffusion_xl",
+    "stable_unclip",
+    "karras_ve",
+    "deepfloyd_if",
+    "audioldm",
+    "audioldm2",
+    "musicldm",
+    "kandinsky",
+    "kandinsky_v22",
+    "shap_e",
+    "text_to_video",
+    "wuerstchen",
+    "vq_diffusion",
+]
+PIPELINE_USAGE_CUTOFF = os.getenv("PIPELINE_USAGE_CUTOFF", 10000)
+
+api = HfApi()
+filter = ModelFilter(library="diffusers")
+
+
+def filter_pipelines(usage_dict, usage_cutoff=10000):
+    output = []
+    for diffusers_object, usage in usage_dict.items():
+        if usage < usage_cutoff:
+            continue
+
+        if "Pipeline" in diffusers_object:
+            output.append(diffusers_object)
+
+    return output
+
+
+def fetch_pipeline_objects():
+    models = api.list_models(filter=filter)
+    downloads = defaultdict(int)
+
+    for model in models:
+        is_counted = False
+        for tag in model.tags:
+            if tag.startswith("diffusers:"):
+                is_counted = True
+                downloads[tag[len("diffusers:") :]] += model.downloads
+
+        if not is_counted:
+            downloads["other"] += model.downloads
+
+    # Remove 0 downloads
+    downloads = {k: v for k, v in downloads.items() if v > 0}
+    pipeline_objects = filter_pipelines(downloads, PIPELINE_USAGE_CUTOFF)
+
+    return pipeline_objects
+
+
+def main():
+    pipeline_objects = fetch_pipeline_objects()
+
+    test_modules = []
+    for pipeline_name in pipeline_objects:
+        module = getattr(diffusers, pipeline_name)
+        test_module = module.__module__.split(".")[-2]
+        test_modules.append(test_module)
+
+    test_modules.extend(ALWAYS_TEST_PIPELINE_MODULES)
+    # Get unique modules
+    test_modules = list(set(test_modules))
+    print(test_modules)
+
+
+if __name__ == "__main__":
+    main()

From e02bc30800b0659b01b1f439ea1bec680e86944b Mon Sep 17 00:00:00 2001
From: Dhruv Nair <dhruv.nair@gmail.com>
Date: Wed, 20 Sep 2023 09:46:22 +0000
Subject: [PATCH 02/22] update script

---
 utils/fetch_torch_cuda_pipeline_test_matrix.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/utils/fetch_torch_cuda_pipeline_test_matrix.py b/utils/fetch_torch_cuda_pipeline_test_matrix.py
index 97391f086d8f..e353d4a4ed05 100644
--- a/utils/fetch_torch_cuda_pipeline_test_matrix.py
+++ b/utils/fetch_torch_cuda_pipeline_test_matrix.py
@@ -29,7 +29,7 @@
     "wuerstchen",
     "vq_diffusion",
 ]
-PIPELINE_USAGE_CUTOFF = os.getenv("PIPELINE_USAGE_CUTOFF", 10000)
+PIPELINE_USAGE_CUTOFF = int(os.getenv("PIPELINE_USAGE_CUTOFF", 10000))
 
 api = HfApi()
 filter = ModelFilter(library="diffusers")

From 299c0dade26ba51d55328be98479fb5ff368bf64 Mon Sep 17 00:00:00 2001
From: Dhruv Nair <dhruv.nair@gmail.com>
Date: Wed, 20 Sep 2023 11:43:42 +0000
Subject: [PATCH 03/22] clean up

---
 utils/fetch_torch_cuda_pipeline_test_matrix.py | 10 ++++++++--
 1 file changed, 8 insertions(+), 2 deletions(-)

diff --git a/utils/fetch_torch_cuda_pipeline_test_matrix.py b/utils/fetch_torch_cuda_pipeline_test_matrix.py
index e353d4a4ed05..ff1bbcfff647 100644
--- a/utils/fetch_torch_cuda_pipeline_test_matrix.py
+++ b/utils/fetch_torch_cuda_pipeline_test_matrix.py
@@ -68,16 +68,22 @@ def fetch_pipeline_objects():
     return pipeline_objects
 
 
-def main():
+def fetch_pipeline_modules_to_test():
     pipeline_objects = fetch_pipeline_objects()
 
     test_modules = []
     for pipeline_name in pipeline_objects:
         module = getattr(diffusers, pipeline_name)
-        test_module = module.__module__.split(".")[-2]
+        test_module = module.__module__.split(".")[-2].strip()
         test_modules.append(test_module)
 
+    return test_modules
+
+
+def main():
+    test_modules = fetch_pipeline_modules_to_test()
     test_modules.extend(ALWAYS_TEST_PIPELINE_MODULES)
+
     # Get unique modules
     test_modules = list(set(test_modules))
     print(test_modules)

From 7901085aec0c0e2e63f63e798dc91d9dd1bb48d5 Mon Sep 17 00:00:00 2001
From: Dhruv Nair <dhruv.nair@gmail.com>
Date: Wed, 20 Sep 2023 11:46:29 +0000
Subject: [PATCH 04/22] clean up

---
 utils/fetch_torch_cuda_pipeline_test_matrix.py | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/utils/fetch_torch_cuda_pipeline_test_matrix.py b/utils/fetch_torch_cuda_pipeline_test_matrix.py
index ff1bbcfff647..ef8191701a46 100644
--- a/utils/fetch_torch_cuda_pipeline_test_matrix.py
+++ b/utils/fetch_torch_cuda_pipeline_test_matrix.py
@@ -1,3 +1,4 @@
+import json
 import os
 from collections import defaultdict
 
@@ -86,7 +87,7 @@ def main():
 
     # Get unique modules
     test_modules = list(set(test_modules))
-    print(test_modules)
+    print(json.dumps(test_modules))
 
 
 if __name__ == "__main__":

From 1ecf32652cc2eb68bd6c9f1f539bc7c71b7f5a5b Mon Sep 17 00:00:00 2001
From: Dhruv Nair <dhruv.nair@gmail.com>
Date: Wed, 20 Sep 2023 12:46:51 +0000
Subject: [PATCH 05/22] clean up

---
 utils/fetch_torch_cuda_pipeline_test_matrix.py | 17 ++---------------
 1 file changed, 2 insertions(+), 15 deletions(-)

diff --git a/utils/fetch_torch_cuda_pipeline_test_matrix.py b/utils/fetch_torch_cuda_pipeline_test_matrix.py
index ef8191701a46..d665ed81b756 100644
--- a/utils/fetch_torch_cuda_pipeline_test_matrix.py
+++ b/utils/fetch_torch_cuda_pipeline_test_matrix.py
@@ -6,29 +6,16 @@
 
 import diffusers
 
-
 ALWAYS_TEST_PIPELINE_MODULES = [
-    "alt_diffusion",
-    "audio_diffusion",
     "controlnet",
-    "consistency_models",
-    "dit",
-    "dance diffusion",
     "stable_diffusion",
     "stable_diffusion_2",
     "stable_diffusion_xl",
-    "stable_unclip",
-    "karras_ve",
     "deepfloyd_if",
-    "audioldm",
-    "audioldm2",
-    "musicldm",
     "kandinsky",
-    "kandinsky_v22",
-    "shap_e",
-    "text_to_video",
+    "kandinsky2_2",
+    "text_to_video_synthesis",
     "wuerstchen",
-    "vq_diffusion",
 ]
 PIPELINE_USAGE_CUTOFF = int(os.getenv("PIPELINE_USAGE_CUTOFF", 10000))
 

From 961dd86139535e3db185f28e7a7bc54bffdab107 Mon Sep 17 00:00:00 2001
From: Dhruv Nair <dhruv.nair@gmail.com>
Date: Thu, 21 Sep 2023 06:25:40 +0000
Subject: [PATCH 06/22] new pipeline runner

---
 .github/workflows/push_tests.yml              | 213 ++++++++--
 tests/pipelines/kandinsky2_2/__init__.py      |   0
 .../pipelines/kandinsky2_2/test_kandinsky.py  | 271 +++++++++++++
 .../kandinsky2_2/test_kandinsky_combined.py   | 365 ++++++++++++++++++
 .../kandinsky2_2/test_kandinsky_controlnet.py | 282 ++++++++++++++
 .../test_kandinsky_controlnet_img2img.py      | 303 +++++++++++++++
 .../kandinsky2_2/test_kandinsky_img2img.py    | 295 ++++++++++++++
 .../kandinsky2_2/test_kandinsky_inpaint.py    | 314 +++++++++++++++
 .../kandinsky2_2/test_kandinsky_prior.py      | 237 ++++++++++++
 .../test_kandinsky_prior_emb2emb.py           | 247 ++++++++++++
 .../text_to_video_synthesis/__init__.py       |   0
 .../test_text_to_video.py                     | 195 ++++++++++
 .../test_text_to_video_zero.py                |  42 ++
 .../test_video_to_video.py                    | 204 ++++++++++
 .../fetch_torch_cuda_pipeline_test_matrix.py  |  19 +-
 15 files changed, 2943 insertions(+), 44 deletions(-)
 create mode 100644 tests/pipelines/kandinsky2_2/__init__.py
 create mode 100644 tests/pipelines/kandinsky2_2/test_kandinsky.py
 create mode 100644 tests/pipelines/kandinsky2_2/test_kandinsky_combined.py
 create mode 100644 tests/pipelines/kandinsky2_2/test_kandinsky_controlnet.py
 create mode 100644 tests/pipelines/kandinsky2_2/test_kandinsky_controlnet_img2img.py
 create mode 100644 tests/pipelines/kandinsky2_2/test_kandinsky_img2img.py
 create mode 100644 tests/pipelines/kandinsky2_2/test_kandinsky_inpaint.py
 create mode 100644 tests/pipelines/kandinsky2_2/test_kandinsky_prior.py
 create mode 100644 tests/pipelines/kandinsky2_2/test_kandinsky_prior_emb2emb.py
 create mode 100644 tests/pipelines/text_to_video_synthesis/__init__.py
 create mode 100644 tests/pipelines/text_to_video_synthesis/test_text_to_video.py
 create mode 100644 tests/pipelines/text_to_video_synthesis/test_text_to_video_zero.py
 create mode 100644 tests/pipelines/text_to_video_synthesis/test_video_to_video.py

diff --git a/.github/workflows/push_tests.yml b/.github/workflows/push_tests.yml
index a13519ec5876..3f816bca7285 100644
--- a/.github/workflows/push_tests.yml
+++ b/.github/workflows/push_tests.yml
@@ -1,4 +1,4 @@
-name: Slow tests on main
+name: Slow Tests on main
 
 on:
   push:
@@ -12,53 +12,111 @@ env:
   MKL_NUM_THREADS: 8
   PYTEST_TIMEOUT: 600
   RUN_SLOW: yes
+  PIPELINE_USAGE_CUTOFF: 50000
 
 jobs:
-  run_slow_tests:
+  setup_torch_cuda_pipeline_matrix:
+    runs-on: docker-cpu
+    container:
+      image: diffusers/diffusers-pytorch-cpu
+      options: --shm-size "16gb" --ipc host -v /mnt/hf_cache:/mnt/cache/
+    outputs:
+      pipeline_test_matrix: ${{ steps.fetch_pipeline_matrix.outputs.pipeline_test_matrix }}
+    steps:
+      - name: Checkout diffusers
+        uses: actions/checkout@v3
+        with:
+          fetch-depth: 2
+      - name: Install dependencies
+        run: |
+          apt-get update && apt-get install libsndfile1-dev libgl1 -y
+          python -m pip install -e .[quality,test]
+          python -m pip install git+https://github.com/huggingface/accelerate.git
+
+      - name: Environment
+        run: |
+          python utils/print_env.py
+
+      - name: Fetch Pipeline Matrix
+        id: fetch_pipeline_matrix
+        run: |
+          matrix=$(python utils/fetch_torch_cuda_pipeline_test_matrix.py)
+          echo $matrix
+          echo "pipeline_test_matrix=$matrix" >> $GITHUB_OUTPUT
+
+      - name: Pipeline Tests Artifacts
+        if: ${{ always() }}
+        uses: actions/upload-artifact@v2
+        with:
+          name: test-pipelines.json
+          path: reports
+
+  torch_pipelines_cuda_tests:
+    needs: setup_torch_cuda_pipeline_matrix
     strategy:
       fail-fast: false
       max-parallel: 1
       matrix:
-        config:
-          - name: Slow PyTorch CUDA tests on Ubuntu
-            framework: pytorch
-            runner: docker-gpu
-            image: diffusers/diffusers-pytorch-cuda
-            report: torch_cuda
-          - name: Slow Flax TPU tests on Ubuntu
-            framework: flax
-            runner: docker-tpu
-            image: diffusers/diffusers-flax-tpu
-            report: flax_tpu
-          - name: Slow ONNXRuntime CUDA tests on Ubuntu
-            framework: onnxruntime
-            runner: docker-gpu
-            image: diffusers/diffusers-onnxruntime-cuda
-            report: onnx_cuda
-
-    name: ${{ matrix.config.name }}
-
-    runs-on: ${{ matrix.config.runner }}
-
+        module: ${{ fromJson(needs.setup_torch_cuda_pipeline_matrix.outputs.pipeline_test_matrix) }}
+    runs-on: docker-gpu
+    framework: pytorch
     container:
-      image: ${{ matrix.config.image }}
-      options: --shm-size "16gb" --ipc host -v /mnt/hf_cache:/mnt/cache/ ${{ matrix.config.runner == 'docker-tpu' && '--privileged' || '--gpus 0'}}
+      image: diffusers/diffusers-pytorch-cuda
+      options: --shm-size "16gb" --ipc host -v /mnt/hf_cache:/mnt/cache/ --gpus 0
+
+    steps:
+      - name: Checkout diffusers
+        uses: actions/checkout@v3
+        with:
+          fetch-depth: 2
+      - name: NVIDIA-SMI
+        run: |
+          nvidia-smi
+      - name: Install dependencies
+        run: |
+          apt-get update && apt-get install libsndfile1-dev libgl1 -y
+          python -m pip install -e .[quality,test]
+          python -m pip install git+https://github.com/huggingface/accelerate.git
+      - name: Environment
+        run: |
+          python utils/print_env.py
+      - name: Slow PyTorch CUDA checkpoint tests on Ubuntu
+        env:
+          HUGGING_FACE_HUB_TOKEN: ${{ secrets.HUGGING_FACE_HUB_TOKEN }}
+          # https://pytorch.org/docs/stable/notes/randomness.html#avoiding-nondeterministic-algorithms
+          CUBLAS_WORKSPACE_CONFIG: :16:8
+        run: |
+          python -m pytest -n 1 --max-worker-restart=0 --dist=loadfile \
+            -s -v -k "not Flax and not Onnx" \
+            --make-reports=tests_${{ matrix.module }}_cuda \
+            tests/pipelines/${{ matrix.module }}
+      - name: Failure short reports
+        if: ${{ failure() }}
+        run: cat reports/tests_${{ matrix.module }}_failures_short.txt
+
+      - name: Test suite reports artifacts
+        if: ${{ always() }}
+        uses: actions/upload-artifact@v2
+        with:
+          name: ${{ matrix.module }}_test_reports
+          path: reports
 
+  torch_cuda_tests:
+    runs-on: docker-gpu
+    framework: pytorch
+    report: torch_cuda
+    container:
+      image: diffusers/diffusers-onnxruntime-cuda
+      options: --shm-size "16gb" --ipc host -v /mnt/hf_cache:/mnt/cache/ --gpus 0
     defaults:
       run:
         shell: bash
-
     steps:
     - name: Checkout diffusers
       uses: actions/checkout@v3
       with:
         fetch-depth: 2
 
-    - name: NVIDIA-SMI
-      if : ${{ matrix.config.runner == 'docker-gpu' }}
-      run: |
-        nvidia-smi
-
     - name: Install dependencies
       run: |
         apt-get update && apt-get install libsndfile1-dev libgl1 -y
@@ -70,47 +128,118 @@ jobs:
         python utils/print_env.py
 
     - name: Run slow PyTorch CUDA tests
-      if: ${{ matrix.config.framework == 'pytorch' }}
       env:
         HUGGING_FACE_HUB_TOKEN: ${{ secrets.HUGGING_FACE_HUB_TOKEN }}
         # https://pytorch.org/docs/stable/notes/randomness.html#avoiding-nondeterministic-algorithms
-        CUBLAS_WORKSPACE_CONFIG: :16:8 
-
+        CUBLAS_WORKSPACE_CONFIG: :16:8
       run: |
         python -m pytest -n 1 --max-worker-restart=0 --dist=loadfile \
           -s -v -k "not Flax and not Onnx" \
-          --make-reports=tests_${{ matrix.config.report }} \
-          tests/
+          --make-reports=tests_torch_cuda \
+          tests/models tests/schedulers tests/others
+
+    - name: Failure short reports
+      if: ${{ failure() }}
+      run: cat reports/tests_${{ report }}_failures_short.txt
+
+    - name: Test suite reports artifacts
+      if: ${{ always() }}
+      uses: actions/upload-artifact@v2
+      with:
+        name: ${{ report }}_test_reports
+        path: reports
+
+  flax_tpu_tests:
+    runs-on: docker-tpu
+    framework: flax
+    report: flax_tpu
+    container:
+      image: diffusers/diffusers-flax-tpu
+      options: --shm-size "16gb" --ipc host -v /mnt/hf_cache:/mnt/cache/ --privileged
+    defaults:
+      run:
+        shell: bash
+    steps:
+    - name: Checkout diffusers
+      uses: actions/checkout@v3
+      with:
+        fetch-depth: 2
+
+    - name: Install dependencies
+      run: |
+        apt-get update && apt-get install libsndfile1-dev libgl1 -y
+        python -m pip install -e .[quality,test]
+        python -m pip install git+https://github.com/huggingface/accelerate.git
+
+    - name: Environment
+      run: |
+        python utils/print_env.py
 
     - name: Run slow Flax TPU tests
-      if: ${{ matrix.config.framework == 'flax' }}
+      if: ${{ framework == 'flax' }}
       env:
         HUGGING_FACE_HUB_TOKEN: ${{ secrets.HUGGING_FACE_HUB_TOKEN }}
       run: |
         python -m pytest -n 0 \
           -s -v -k "Flax" \
-          --make-reports=tests_${{ matrix.config.report }} \
+          --make-reports=tests_${{ report }} \
           tests/
 
+    - name: Failure short reports
+      if: ${{ failure() }}
+      run: cat reports/tests_${{ report }}_failures_short.txt
+
+    - name: Test suite reports artifacts
+      if: ${{ always() }}
+      uses: actions/upload-artifact@v2
+      with:
+        name: ${{ report }}_test_reports
+        path: reports
+
+  onnx_cuda_tests:
+    runs-on: docker-gpu
+    framework: onnxruntime
+    report: onnx_cuda
+    container:
+      image: diffusers/diffusers-onnxruntime-cuda
+      options: --shm-size "16gb" --ipc host -v /mnt/hf_cache:/mnt/cache/ --gpus 0
+    defaults:
+      run:
+        shell: bash
+    steps:
+    - name: Checkout diffusers
+      uses: actions/checkout@v3
+      with:
+        fetch-depth: 2
+
+    - name: Install dependencies
+      run: |
+        apt-get update && apt-get install libsndfile1-dev libgl1 -y
+        python -m pip install -e .[quality,test]
+        python -m pip install git+https://github.com/huggingface/accelerate.git
+
+    - name: Environment
+      run: |
+        python utils/print_env.py
+
     - name: Run slow ONNXRuntime CUDA tests
-      if: ${{ matrix.config.framework == 'onnxruntime' }}
       env:
         HUGGING_FACE_HUB_TOKEN: ${{ secrets.HUGGING_FACE_HUB_TOKEN }}
       run: |
         python -m pytest -n 1 --max-worker-restart=0 --dist=loadfile \
           -s -v -k "Onnx" \
-          --make-reports=tests_${{ matrix.config.report }} \
+          --make-reports=tests_${{ report }} \
           tests/
 
     - name: Failure short reports
       if: ${{ failure() }}
-      run: cat reports/tests_${{ matrix.config.report }}_failures_short.txt
+      run: cat reports/tests_${{ report }}_failures_short.txt
 
     - name: Test suite reports artifacts
       if: ${{ always() }}
       uses: actions/upload-artifact@v2
       with:
-        name: ${{ matrix.config.report }}_test_reports
+        name: ${{ report }}_test_reports
         path: reports
 
   run_examples_tests:
diff --git a/tests/pipelines/kandinsky2_2/__init__.py b/tests/pipelines/kandinsky2_2/__init__.py
new file mode 100644
index 000000000000..e69de29bb2d1
diff --git a/tests/pipelines/kandinsky2_2/test_kandinsky.py b/tests/pipelines/kandinsky2_2/test_kandinsky.py
new file mode 100644
index 000000000000..65dbf0a708eb
--- /dev/null
+++ b/tests/pipelines/kandinsky2_2/test_kandinsky.py
@@ -0,0 +1,271 @@
+# coding=utf-8
+# Copyright 2023 HuggingFace Inc.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import gc
+import random
+import unittest
+
+import numpy as np
+import torch
+
+from diffusers import DDIMScheduler, KandinskyV22Pipeline, KandinskyV22PriorPipeline, UNet2DConditionModel, VQModel
+from diffusers.utils.testing_utils import (
+    enable_full_determinism,
+    floats_tensor,
+    load_numpy,
+    require_torch_gpu,
+    slow,
+    torch_device,
+)
+
+from ..test_pipelines_common import PipelineTesterMixin, assert_mean_pixel_difference
+
+
+enable_full_determinism()
+
+
+class Dummies:
+    @property
+    def text_embedder_hidden_size(self):
+        return 32
+
+    @property
+    def time_input_dim(self):
+        return 32
+
+    @property
+    def block_out_channels_0(self):
+        return self.time_input_dim
+
+    @property
+    def time_embed_dim(self):
+        return self.time_input_dim * 4
+
+    @property
+    def cross_attention_dim(self):
+        return 32
+
+    @property
+    def dummy_unet(self):
+        torch.manual_seed(0)
+
+        model_kwargs = {
+            "in_channels": 4,
+            # Out channels is double in channels because predicts mean and variance
+            "out_channels": 8,
+            "addition_embed_type": "image",
+            "down_block_types": ("ResnetDownsampleBlock2D", "SimpleCrossAttnDownBlock2D"),
+            "up_block_types": ("SimpleCrossAttnUpBlock2D", "ResnetUpsampleBlock2D"),
+            "mid_block_type": "UNetMidBlock2DSimpleCrossAttn",
+            "block_out_channels": (self.block_out_channels_0, self.block_out_channels_0 * 2),
+            "layers_per_block": 1,
+            "encoder_hid_dim": self.text_embedder_hidden_size,
+            "encoder_hid_dim_type": "image_proj",
+            "cross_attention_dim": self.cross_attention_dim,
+            "attention_head_dim": 4,
+            "resnet_time_scale_shift": "scale_shift",
+            "class_embed_type": None,
+        }
+
+        model = UNet2DConditionModel(**model_kwargs)
+        return model
+
+    @property
+    def dummy_movq_kwargs(self):
+        return {
+            "block_out_channels": [32, 64],
+            "down_block_types": ["DownEncoderBlock2D", "AttnDownEncoderBlock2D"],
+            "in_channels": 3,
+            "latent_channels": 4,
+            "layers_per_block": 1,
+            "norm_num_groups": 8,
+            "norm_type": "spatial",
+            "num_vq_embeddings": 12,
+            "out_channels": 3,
+            "up_block_types": [
+                "AttnUpDecoderBlock2D",
+                "UpDecoderBlock2D",
+            ],
+            "vq_embed_dim": 4,
+        }
+
+    @property
+    def dummy_movq(self):
+        torch.manual_seed(0)
+        model = VQModel(**self.dummy_movq_kwargs)
+        return model
+
+    def get_dummy_components(self):
+        unet = self.dummy_unet
+        movq = self.dummy_movq
+
+        scheduler = DDIMScheduler(
+            num_train_timesteps=1000,
+            beta_schedule="linear",
+            beta_start=0.00085,
+            beta_end=0.012,
+            clip_sample=False,
+            set_alpha_to_one=False,
+            steps_offset=1,
+            prediction_type="epsilon",
+            thresholding=False,
+        )
+
+        components = {
+            "unet": unet,
+            "scheduler": scheduler,
+            "movq": movq,
+        }
+        return components
+
+    def get_dummy_inputs(self, device, seed=0):
+        image_embeds = floats_tensor((1, self.text_embedder_hidden_size), rng=random.Random(seed)).to(device)
+        negative_image_embeds = floats_tensor((1, self.text_embedder_hidden_size), rng=random.Random(seed + 1)).to(
+            device
+        )
+        if str(device).startswith("mps"):
+            generator = torch.manual_seed(seed)
+        else:
+            generator = torch.Generator(device=device).manual_seed(seed)
+        inputs = {
+            "image_embeds": image_embeds,
+            "negative_image_embeds": negative_image_embeds,
+            "generator": generator,
+            "height": 64,
+            "width": 64,
+            "guidance_scale": 4.0,
+            "num_inference_steps": 2,
+            "output_type": "np",
+        }
+        return inputs
+
+
+class KandinskyV22PipelineFastTests(PipelineTesterMixin, unittest.TestCase):
+    pipeline_class = KandinskyV22Pipeline
+    params = [
+        "image_embeds",
+        "negative_image_embeds",
+    ]
+    batch_params = ["image_embeds", "negative_image_embeds"]
+    required_optional_params = [
+        "generator",
+        "height",
+        "width",
+        "latents",
+        "guidance_scale",
+        "num_inference_steps",
+        "return_dict",
+        "guidance_scale",
+        "num_images_per_prompt",
+        "output_type",
+        "return_dict",
+    ]
+    test_xformers_attention = False
+
+    def get_dummy_inputs(self, device, seed=0):
+        dummies = Dummies()
+        return dummies.get_dummy_inputs(device=device, seed=seed)
+
+    def get_dummy_components(self):
+        dummies = Dummies()
+        return dummies.get_dummy_components()
+
+    def test_kandinsky(self):
+        device = "cpu"
+
+        components = self.get_dummy_components()
+
+        pipe = self.pipeline_class(**components)
+        pipe = pipe.to(device)
+
+        pipe.set_progress_bar_config(disable=None)
+
+        output = pipe(**self.get_dummy_inputs(device))
+        image = output.images
+
+        image_from_tuple = pipe(
+            **self.get_dummy_inputs(device),
+            return_dict=False,
+        )[0]
+
+        image_slice = image[0, -3:, -3:, -1]
+        image_from_tuple_slice = image_from_tuple[0, -3:, -3:, -1]
+
+        assert image.shape == (1, 64, 64, 3)
+
+        expected_slice = np.array([0.3420, 0.9505, 0.3919, 1.0000, 0.5188, 0.3109, 0.6139, 0.5624, 0.6811])
+
+        assert (
+            np.abs(image_slice.flatten() - expected_slice).max() < 1e-2
+        ), f" expected_slice {expected_slice}, but got {image_slice.flatten()}"
+
+        assert (
+            np.abs(image_from_tuple_slice.flatten() - expected_slice).max() < 1e-2
+        ), f" expected_slice {expected_slice}, but got {image_from_tuple_slice.flatten()}"
+
+    def test_float16_inference(self):
+        super().test_float16_inference(expected_max_diff=1e-1)
+
+
+@slow
+@require_torch_gpu
+class KandinskyV22PipelineIntegrationTests(unittest.TestCase):
+    def tearDown(self):
+        # clean up the VRAM after each test
+        super().tearDown()
+        gc.collect()
+        torch.cuda.empty_cache()
+
+    def test_kandinsky_text2img(self):
+        expected_image = load_numpy(
+            "https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main"
+            "/kandinskyv22/kandinskyv22_text2img_cat_fp16.npy"
+        )
+
+        pipe_prior = KandinskyV22PriorPipeline.from_pretrained(
+            "kandinsky-community/kandinsky-2-2-prior", torch_dtype=torch.float16
+        )
+        pipe_prior.to(torch_device)
+
+        pipeline = KandinskyV22Pipeline.from_pretrained(
+            "kandinsky-community/kandinsky-2-2-decoder", torch_dtype=torch.float16
+        )
+        pipeline = pipeline.to(torch_device)
+        pipeline.set_progress_bar_config(disable=None)
+
+        prompt = "red cat, 4k photo"
+
+        generator = torch.Generator(device="cuda").manual_seed(0)
+        image_emb, zero_image_emb = pipe_prior(
+            prompt,
+            generator=generator,
+            num_inference_steps=5,
+            negative_prompt="",
+        ).to_tuple()
+
+        generator = torch.Generator(device="cuda").manual_seed(0)
+        output = pipeline(
+            image_embeds=image_emb,
+            negative_image_embeds=zero_image_emb,
+            generator=generator,
+            num_inference_steps=100,
+            output_type="np",
+        )
+
+        image = output.images[0]
+
+        assert image.shape == (512, 512, 3)
+
+        assert_mean_pixel_difference(image, expected_image)
diff --git a/tests/pipelines/kandinsky2_2/test_kandinsky_combined.py b/tests/pipelines/kandinsky2_2/test_kandinsky_combined.py
new file mode 100644
index 000000000000..b90f59cc4966
--- /dev/null
+++ b/tests/pipelines/kandinsky2_2/test_kandinsky_combined.py
@@ -0,0 +1,365 @@
+# coding=utf-8
+# Copyright 2023 HuggingFace Inc.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import unittest
+
+import numpy as np
+
+from diffusers import (
+    KandinskyV22CombinedPipeline,
+    KandinskyV22Img2ImgCombinedPipeline,
+    KandinskyV22InpaintCombinedPipeline,
+)
+from diffusers.utils.testing_utils import enable_full_determinism, require_torch_gpu, torch_device
+
+from ..test_pipelines_common import PipelineTesterMixin
+from .test_kandinsky import Dummies
+from .test_kandinsky_img2img import Dummies as Img2ImgDummies
+from .test_kandinsky_inpaint import Dummies as InpaintDummies
+from .test_kandinsky_prior import Dummies as PriorDummies
+
+
+enable_full_determinism()
+
+
+class KandinskyV22PipelineCombinedFastTests(PipelineTesterMixin, unittest.TestCase):
+    pipeline_class = KandinskyV22CombinedPipeline  # pipeline under test, built from dummy prior + decoder parts
+    params = [
+        "prompt",
+    ]
+    batch_params = ["prompt", "negative_prompt"]
+    required_optional_params = [
+        "generator",
+        "height",
+        "width",
+        "latents",
+        "guidance_scale",
+        "negative_prompt",
+        "num_inference_steps",
+        "return_dict",
+        "guidance_scale",  # NOTE(review): repeated entry, already listed above
+        "num_images_per_prompt",
+        "output_type",
+        "return_dict",  # NOTE(review): repeated entry, already listed above
+    ]
+    test_xformers_attention = True
+
+    def get_dummy_components(self):  # decoder components plus prior components under a "prior_" prefix
+        dummy = Dummies()
+        prior_dummy = PriorDummies()
+        components = dummy.get_dummy_components()
+
+        components.update({f"prior_{k}": v for k, v in prior_dummy.get_dummy_components().items()})
+        return components
+
+    def get_dummy_inputs(self, device, seed=0):  # prior dummy inputs with height/width forced to 64
+        prior_dummy = PriorDummies()
+        inputs = prior_dummy.get_dummy_inputs(device=device, seed=seed)
+        inputs.update(
+            {
+                "height": 64,
+                "width": 64,
+            }
+        )
+        return inputs
+
+    def test_kandinsky(self):  # CPU run; dict and tuple outputs must both match the stored slice
+        device = "cpu"
+
+        components = self.get_dummy_components()
+
+        pipe = self.pipeline_class(**components)
+        pipe = pipe.to(device)
+
+        pipe.set_progress_bar_config(disable=None)
+
+        output = pipe(**self.get_dummy_inputs(device))
+        image = output.images
+
+        image_from_tuple = pipe(
+            **self.get_dummy_inputs(device),
+            return_dict=False,
+        )[0]
+
+        image_slice = image[0, -3:, -3:, -1]  # last 3x3 corner of the last channel, first image
+        image_from_tuple_slice = image_from_tuple[0, -3:, -3:, -1]
+
+        assert image.shape == (1, 64, 64, 3)
+
+        expected_slice = np.array([0.3013, 0.0471, 0.5176, 0.1817, 0.2566, 0.7076, 0.6712, 0.4421, 0.7503])
+
+        assert (
+            np.abs(image_slice.flatten() - expected_slice).max() < 1e-2
+        ), f" expected_slice {expected_slice}, but got {image_slice.flatten()}"
+        assert (
+            np.abs(image_from_tuple_slice.flatten() - expected_slice).max() < 1e-2
+        ), f" expected_slice {expected_slice}, but got {image_from_tuple_slice.flatten()}"
+
+    @require_torch_gpu
+    def test_offloads(self):  # plain, model-offload and sequential-offload runs must produce matching slices
+        pipes = []
+        components = self.get_dummy_components()
+        sd_pipe = self.pipeline_class(**components).to(torch_device)  # baseline: moved to the device directly
+        pipes.append(sd_pipe)
+
+        components = self.get_dummy_components()  # fresh components for each pipeline variant
+        sd_pipe = self.pipeline_class(**components)
+        sd_pipe.enable_model_cpu_offload()
+        pipes.append(sd_pipe)
+
+        components = self.get_dummy_components()
+        sd_pipe = self.pipeline_class(**components)
+        sd_pipe.enable_sequential_cpu_offload()
+        pipes.append(sd_pipe)
+
+        image_slices = []
+        for pipe in pipes:
+            inputs = self.get_dummy_inputs(torch_device)  # identical seeded inputs for every variant
+            image = pipe(**inputs).images
+
+            image_slices.append(image[0, -3:, -3:, -1].flatten())
+
+        assert np.abs(image_slices[0] - image_slices[1]).max() < 1e-3  # baseline vs model offload
+        assert np.abs(image_slices[0] - image_slices[2]).max() < 1e-3  # baseline vs sequential offload
+
+    def test_inference_batch_single_identical(self):  # parent test, run with an explicit tolerance
+        super().test_inference_batch_single_identical(expected_max_diff=1e-2)
+
+    def test_float16_inference(self):  # parent test, run with an explicit tolerance
+        super().test_float16_inference(expected_max_diff=1e-1)
+
+    def test_dict_tuple_outputs_equivalent(self):  # parent test, run with an explicit tolerance
+        super().test_dict_tuple_outputs_equivalent(expected_max_difference=5e-4)
+
+    def test_model_cpu_offload_forward_pass(self):  # parent test, run with an explicit tolerance
+        super().test_model_cpu_offload_forward_pass(expected_max_diff=5e-4)
+
+class KandinskyV22PipelineImg2ImgCombinedFastTests(PipelineTesterMixin, unittest.TestCase):
+    pipeline_class = KandinskyV22Img2ImgCombinedPipeline  # pipeline under test, built from dummy parts
+    params = ["prompt", "image"]
+    batch_params = ["prompt", "negative_prompt", "image"]
+    required_optional_params = [
+        "generator",
+        "height",
+        "width",
+        "latents",
+        "guidance_scale",
+        "negative_prompt",
+        "num_inference_steps",
+        "return_dict",
+        "guidance_scale",  # NOTE(review): repeated entry, already listed above
+        "num_images_per_prompt",
+        "output_type",
+        "return_dict",  # NOTE(review): repeated entry, already listed above
+    ]
+    test_xformers_attention = False
+
+    def get_dummy_components(self):  # img2img components plus prior components under a "prior_" prefix
+        dummy = Img2ImgDummies()
+        prior_dummy = PriorDummies()
+        components = dummy.get_dummy_components()
+
+        components.update({f"prior_{k}": v for k, v in prior_dummy.get_dummy_components().items()})
+        return components
+
+    def get_dummy_inputs(self, device, seed=0):  # prior inputs overlaid with img2img inputs, minus embeddings
+        prior_dummy = PriorDummies()
+        dummy = Img2ImgDummies()
+        inputs = prior_dummy.get_dummy_inputs(device=device, seed=seed)
+        inputs.update(dummy.get_dummy_inputs(device=device, seed=seed))  # img2img values win on key clashes
+        inputs.pop("image_embeds")  # presumably produced by the pipeline's own prior stage - confirm
+        inputs.pop("negative_image_embeds")
+        return inputs
+
+    def test_kandinsky(self):  # CPU run; dict and tuple outputs must both match the stored slice
+        device = "cpu"
+
+        components = self.get_dummy_components()
+
+        pipe = self.pipeline_class(**components)
+        pipe = pipe.to(device)
+
+        pipe.set_progress_bar_config(disable=None)
+
+        output = pipe(**self.get_dummy_inputs(device))
+        image = output.images
+
+        image_from_tuple = pipe(
+            **self.get_dummy_inputs(device),
+            return_dict=False,
+        )[0]
+
+        image_slice = image[0, -3:, -3:, -1]  # last 3x3 corner of the last channel, first image
+        image_from_tuple_slice = image_from_tuple[0, -3:, -3:, -1]
+
+        assert image.shape == (1, 64, 64, 3)
+
+        expected_slice = np.array([0.4353, 0.4710, 0.5128, 0.4806, 0.5054, 0.5348, 0.5224, 0.4603, 0.5025])
+
+        assert (
+            np.abs(image_slice.flatten() - expected_slice).max() < 1e-2
+        ), f" expected_slice {expected_slice}, but got {image_slice.flatten()}"
+        assert (
+            np.abs(image_from_tuple_slice.flatten() - expected_slice).max() < 1e-2
+        ), f" expected_slice {expected_slice}, but got {image_from_tuple_slice.flatten()}"
+
+    @require_torch_gpu
+    def test_offloads(self):  # plain, model-offload and sequential-offload runs must produce matching slices
+        pipes = []
+        components = self.get_dummy_components()
+        sd_pipe = self.pipeline_class(**components).to(torch_device)  # baseline: moved to the device directly
+        pipes.append(sd_pipe)
+
+        components = self.get_dummy_components()  # fresh components for each pipeline variant
+        sd_pipe = self.pipeline_class(**components)
+        sd_pipe.enable_model_cpu_offload()
+        pipes.append(sd_pipe)
+
+        components = self.get_dummy_components()
+        sd_pipe = self.pipeline_class(**components)
+        sd_pipe.enable_sequential_cpu_offload()
+        pipes.append(sd_pipe)
+
+        image_slices = []
+        for pipe in pipes:
+            inputs = self.get_dummy_inputs(torch_device)  # identical seeded inputs for every variant
+            image = pipe(**inputs).images
+
+            image_slices.append(image[0, -3:, -3:, -1].flatten())
+
+        assert np.abs(image_slices[0] - image_slices[1]).max() < 1e-3  # baseline vs model offload
+        assert np.abs(image_slices[0] - image_slices[2]).max() < 1e-3  # baseline vs sequential offload
+
+    def test_inference_batch_single_identical(self):  # parent test, run with an explicit tolerance
+        super().test_inference_batch_single_identical(expected_max_diff=1e-2)
+
+    def test_float16_inference(self):  # parent test, run with an explicit tolerance
+        super().test_float16_inference(expected_max_diff=1e-1)
+
+    def test_dict_tuple_outputs_equivalent(self):  # parent test, run with an explicit tolerance
+        super().test_dict_tuple_outputs_equivalent(expected_max_difference=5e-4)
+
+    def test_model_cpu_offload_forward_pass(self):  # parent test, run with an explicit tolerance
+        super().test_model_cpu_offload_forward_pass(expected_max_diff=5e-4)
+
+
+class KandinskyV22PipelineInpaintCombinedFastTests(PipelineTesterMixin, unittest.TestCase):
+    pipeline_class = KandinskyV22InpaintCombinedPipeline  # pipeline under test, built from dummy parts
+    params = ["prompt", "image", "mask_image"]
+    batch_params = ["prompt", "negative_prompt", "image", "mask_image"]
+    required_optional_params = [
+        "generator",
+        "height",
+        "width",
+        "latents",
+        "guidance_scale",
+        "negative_prompt",
+        "num_inference_steps",
+        "return_dict",
+        "guidance_scale",  # NOTE(review): repeated entry, already listed above
+        "num_images_per_prompt",
+        "output_type",
+        "return_dict",  # NOTE(review): repeated entry, already listed above
+    ]
+    test_xformers_attention = False
+
+    def get_dummy_components(self):  # inpaint components plus prior components under a "prior_" prefix
+        dummy = InpaintDummies()
+        prior_dummy = PriorDummies()
+        components = dummy.get_dummy_components()
+
+        components.update({f"prior_{k}": v for k, v in prior_dummy.get_dummy_components().items()})
+        return components
+
+    def get_dummy_inputs(self, device, seed=0):  # prior inputs overlaid with inpaint inputs, minus embeddings
+        prior_dummy = PriorDummies()
+        dummy = InpaintDummies()
+        inputs = prior_dummy.get_dummy_inputs(device=device, seed=seed)
+        inputs.update(dummy.get_dummy_inputs(device=device, seed=seed))  # inpaint values win on key clashes
+        inputs.pop("image_embeds")  # presumably produced by the pipeline's own prior stage - confirm
+        inputs.pop("negative_image_embeds")
+        return inputs
+
+    def test_kandinsky(self):  # CPU run; dict and tuple outputs must both match the stored slice
+        device = "cpu"
+
+        components = self.get_dummy_components()
+
+        pipe = self.pipeline_class(**components)
+        pipe = pipe.to(device)
+
+        pipe.set_progress_bar_config(disable=None)
+
+        output = pipe(**self.get_dummy_inputs(device))
+        image = output.images
+
+        image_from_tuple = pipe(
+            **self.get_dummy_inputs(device),
+            return_dict=False,
+        )[0]
+
+        image_slice = image[0, -3:, -3:, -1]  # last 3x3 corner of the last channel, first image
+        image_from_tuple_slice = image_from_tuple[0, -3:, -3:, -1]
+
+        assert image.shape == (1, 64, 64, 3)
+
+        expected_slice = np.array([0.5039, 0.4926, 0.4898, 0.4978, 0.4838, 0.4942, 0.4738, 0.4702, 0.4816])
+
+        assert (
+            np.abs(image_slice.flatten() - expected_slice).max() < 1e-2
+        ), f" expected_slice {expected_slice}, but got {image_slice.flatten()}"
+        assert (
+            np.abs(image_from_tuple_slice.flatten() - expected_slice).max() < 1e-2
+        ), f" expected_slice {expected_slice}, but got {image_from_tuple_slice.flatten()}"
+
+    @require_torch_gpu
+    def test_offloads(self):  # plain, model-offload and sequential-offload runs must produce matching slices
+        pipes = []
+        components = self.get_dummy_components()
+        sd_pipe = self.pipeline_class(**components).to(torch_device)  # baseline: moved to the device directly
+        pipes.append(sd_pipe)
+
+        components = self.get_dummy_components()  # fresh components for each pipeline variant
+        sd_pipe = self.pipeline_class(**components)
+        sd_pipe.enable_model_cpu_offload()
+        pipes.append(sd_pipe)
+
+        components = self.get_dummy_components()
+        sd_pipe = self.pipeline_class(**components)
+        sd_pipe.enable_sequential_cpu_offload()
+        pipes.append(sd_pipe)
+
+        image_slices = []
+        for pipe in pipes:
+            inputs = self.get_dummy_inputs(torch_device)  # identical seeded inputs for every variant
+            image = pipe(**inputs).images
+
+            image_slices.append(image[0, -3:, -3:, -1].flatten())
+
+        assert np.abs(image_slices[0] - image_slices[1]).max() < 1e-3  # baseline vs model offload
+        assert np.abs(image_slices[0] - image_slices[2]).max() < 1e-3  # baseline vs sequential offload
+
+    def test_inference_batch_single_identical(self):  # parent test, run with an explicit tolerance
+        super().test_inference_batch_single_identical(expected_max_diff=1e-2)
+
+    def test_float16_inference(self):  # parent test, run with an explicit tolerance
+        super().test_float16_inference(expected_max_diff=5e-1)
+
+    def test_dict_tuple_outputs_equivalent(self):  # parent test, run with an explicit tolerance
+        super().test_dict_tuple_outputs_equivalent(expected_max_difference=5e-4)
+
+    def test_model_cpu_offload_forward_pass(self):  # parent test, run with an explicit tolerance
+        super().test_model_cpu_offload_forward_pass(expected_max_diff=5e-4)
diff --git a/tests/pipelines/kandinsky2_2/test_kandinsky_controlnet.py b/tests/pipelines/kandinsky2_2/test_kandinsky_controlnet.py
new file mode 100644
index 000000000000..cec209c7cfec
--- /dev/null
+++ b/tests/pipelines/kandinsky2_2/test_kandinsky_controlnet.py
@@ -0,0 +1,282 @@
+# coding=utf-8
+# Copyright 2023 HuggingFace Inc.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import gc
+import random
+import unittest
+
+import numpy as np
+import torch
+
+from diffusers import (
+    DDIMScheduler,
+    KandinskyV22ControlnetPipeline,
+    KandinskyV22PriorPipeline,
+    UNet2DConditionModel,
+    VQModel,
+)
+from diffusers.utils.testing_utils import (
+    enable_full_determinism,
+    floats_tensor,
+    load_image,
+    load_numpy,
+    nightly,
+    require_torch_gpu,
+    torch_device,
+)
+
+from ..test_pipelines_common import PipelineTesterMixin, assert_mean_pixel_difference
+
+
+enable_full_determinism()
+
+
+class KandinskyV22ControlnetPipelineFastTests(PipelineTesterMixin, unittest.TestCase):
+    pipeline_class = KandinskyV22ControlnetPipeline  # pipeline under test, built from small seeded models
+    params = ["image_embeds", "negative_image_embeds", "hint"]
+    batch_params = ["image_embeds", "negative_image_embeds", "hint"]
+    required_optional_params = [
+        "generator",
+        "height",
+        "width",
+        "latents",
+        "guidance_scale",
+        "num_inference_steps",
+        "return_dict",
+        "guidance_scale",  # NOTE(review): repeated entry, already listed above
+        "num_images_per_prompt",
+        "output_type",
+        "return_dict",  # NOTE(review): repeated entry, already listed above
+    ]
+    test_xformers_attention = False
+
+    @property
+    def text_embedder_hidden_size(self):  # width of the dummy image embeddings and the UNet encoder_hid_dim
+        return 32
+
+    @property
+    def time_input_dim(self):
+        return 32
+
+    @property
+    def block_out_channels_0(self):  # channel count of the first UNet block; the second block uses 2x this
+        return self.time_input_dim
+
+    @property
+    def time_embed_dim(self):
+        return self.time_input_dim * 4
+
+    @property
+    def cross_attention_dim(self):
+        return 100
+
+    @property
+    def dummy_unet(self):  # builds a new two-block UNet on every access
+        torch.manual_seed(0)  # seed the global RNG right before the model is constructed
+
+        model_kwargs = {
+            "in_channels": 8,
+            # 8 outputs: presumably mean + variance for 4 latent channels (not 2x "in_channels", also 8) - confirm
+            "out_channels": 8,
+            "addition_embed_type": "image_hint",
+            "down_block_types": ("ResnetDownsampleBlock2D", "SimpleCrossAttnDownBlock2D"),
+            "up_block_types": ("SimpleCrossAttnUpBlock2D", "ResnetUpsampleBlock2D"),
+            "mid_block_type": "UNetMidBlock2DSimpleCrossAttn",
+            "block_out_channels": (self.block_out_channels_0, self.block_out_channels_0 * 2),
+            "layers_per_block": 1,
+            "encoder_hid_dim": self.text_embedder_hidden_size,
+            "encoder_hid_dim_type": "image_proj",
+            "cross_attention_dim": self.cross_attention_dim,
+            "attention_head_dim": 4,
+            "resnet_time_scale_shift": "scale_shift",
+            "class_embed_type": None,
+        }
+
+        model = UNet2DConditionModel(**model_kwargs)
+        return model
+
+    @property
+    def dummy_movq_kwargs(self):  # constructor kwargs for the small VQModel used as "movq"
+        return {
+            "block_out_channels": [32, 32, 64, 64],
+            "down_block_types": [
+                "DownEncoderBlock2D",
+                "DownEncoderBlock2D",
+                "DownEncoderBlock2D",
+                "AttnDownEncoderBlock2D",
+            ],
+            "in_channels": 3,
+            "latent_channels": 4,
+            "layers_per_block": 1,
+            "norm_num_groups": 8,
+            "norm_type": "spatial",
+            "num_vq_embeddings": 12,
+            "out_channels": 3,
+            "up_block_types": ["AttnUpDecoderBlock2D", "UpDecoderBlock2D", "UpDecoderBlock2D", "UpDecoderBlock2D"],
+            "vq_embed_dim": 4,
+        }
+
+    @property
+    def dummy_movq(self):  # builds a new VQModel on every access
+        torch.manual_seed(0)  # seed the global RNG right before the model is constructed
+        model = VQModel(**self.dummy_movq_kwargs)
+        return model
+
+    def get_dummy_components(self):  # unet + DDIM scheduler + movq, keyed by pipeline argument name
+        unet = self.dummy_unet
+        movq = self.dummy_movq
+
+        scheduler = DDIMScheduler(
+            num_train_timesteps=1000,
+            beta_schedule="linear",
+            beta_start=0.00085,
+            beta_end=0.012,
+            clip_sample=False,
+            set_alpha_to_one=False,
+            steps_offset=1,
+            prediction_type="epsilon",
+            thresholding=False,
+        )
+
+        components = {
+            "unet": unet,
+            "scheduler": scheduler,
+            "movq": movq,
+        }
+        return components
+
+    def get_dummy_inputs(self, device, seed=0):  # seeded embeddings, hint and generator for a 64x64 run
+        image_embeds = floats_tensor((1, self.text_embedder_hidden_size), rng=random.Random(seed)).to(device)
+        negative_image_embeds = floats_tensor((1, self.text_embedder_hidden_size), rng=random.Random(seed + 1)).to(
+            device
+        )
+
+        # create hint
+        hint = floats_tensor((1, 3, 64, 64), rng=random.Random(seed)).to(device)
+
+        if str(device).startswith("mps"):  # on mps, use the generator returned by torch.manual_seed
+            generator = torch.manual_seed(seed)
+        else:
+            generator = torch.Generator(device=device).manual_seed(seed)
+        inputs = {
+            "image_embeds": image_embeds,
+            "negative_image_embeds": negative_image_embeds,
+            "hint": hint,
+            "generator": generator,
+            "height": 64,
+            "width": 64,
+            "guidance_scale": 4.0,
+            "num_inference_steps": 2,
+            "output_type": "np",
+        }
+        return inputs
+
+    def test_kandinsky_controlnet(self):  # CPU run; dict and tuple outputs must both match the stored slice
+        device = "cpu"
+
+        components = self.get_dummy_components()
+
+        pipe = self.pipeline_class(**components)
+        pipe = pipe.to(device)
+
+        pipe.set_progress_bar_config(disable=None)
+
+        output = pipe(**self.get_dummy_inputs(device))
+        image = output.images
+
+        image_from_tuple = pipe(
+            **self.get_dummy_inputs(device),
+            return_dict=False,
+        )[0]
+
+        image_slice = image[0, -3:, -3:, -1]  # last 3x3 corner of the last channel, first image
+        image_from_tuple_slice = image_from_tuple[0, -3:, -3:, -1]
+
+        assert image.shape == (1, 64, 64, 3)
+
+        expected_slice = np.array(
+            [0.6959826, 0.868279, 0.7558092, 0.68769467, 0.85805804, 0.65977496, 0.44885302, 0.5959111, 0.4251595]
+        )
+
+        assert (
+            np.abs(image_slice.flatten() - expected_slice).max() < 1e-2
+        ), f" expected_slice {expected_slice}, but got {image_slice.flatten()}"
+
+        assert (
+            np.abs(image_from_tuple_slice.flatten() - expected_slice).max() < 1e-2
+        ), f" expected_slice {expected_slice}, but got {image_from_tuple_slice.flatten()}"
+
+    def test_float16_inference(self):  # parent test, run with an explicit tolerance
+        super().test_float16_inference(expected_max_diff=1e-1)
+
+
+@nightly
+@require_torch_gpu
+class KandinskyV22ControlnetPipelineIntegrationTests(unittest.TestCase):  # runs pretrained fp16 checkpoints
+    def tearDown(self):
+        # clean up the VRAM after each test
+        super().tearDown()
+        gc.collect()
+        torch.cuda.empty_cache()
+
+    def test_kandinsky_controlnet(self):  # prior -> controlnet decoder run compared against a stored array
+        expected_image = load_numpy(
+            "https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main"
+            "/kandinskyv22/kandinskyv22_controlnet_robotcat_fp16.npy"
+        )
+
+        hint = load_image(
+            "https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main"
+            "/kandinskyv22/hint_image_cat.png"
+        )
+        hint = torch.from_numpy(np.array(hint)).float() / 255.0  # assumes 8-bit pixels, giving values in [0, 1]
+        hint = hint.permute(2, 0, 1).unsqueeze(0)  # move the last axis first, then add a leading batch axis
+
+        pipe_prior = KandinskyV22PriorPipeline.from_pretrained(
+            "kandinsky-community/kandinsky-2-2-prior", torch_dtype=torch.float16
+        )
+        pipe_prior.to(torch_device)
+
+        pipeline = KandinskyV22ControlnetPipeline.from_pretrained(
+            "kandinsky-community/kandinsky-2-2-controlnet-depth", torch_dtype=torch.float16
+        )
+        pipeline = pipeline.to(torch_device)
+        pipeline.set_progress_bar_config(disable=None)
+
+        prompt = "A robot, 4k photo"
+
+        generator = torch.Generator(device=torch_device).manual_seed(0)  # same device as the pipelines
+        image_emb, zero_image_emb = pipe_prior(
+            prompt,
+            generator=generator,
+            num_inference_steps=5,
+            negative_prompt="",
+        ).to_tuple()
+
+        generator = torch.Generator(device=torch_device).manual_seed(0)  # fresh seed for the decoder call
+        output = pipeline(
+            image_embeds=image_emb,
+            negative_image_embeds=zero_image_emb,
+            hint=hint,
+            generator=generator,
+            num_inference_steps=100,
+            output_type="np",
+        )
+
+        image = output.images[0]
+
+        assert image.shape == (512, 512, 3)
+
+        assert_mean_pixel_difference(image, expected_image)
diff --git a/tests/pipelines/kandinsky2_2/test_kandinsky_controlnet_img2img.py b/tests/pipelines/kandinsky2_2/test_kandinsky_controlnet_img2img.py
new file mode 100644
index 000000000000..0c7b99580085
--- /dev/null
+++ b/tests/pipelines/kandinsky2_2/test_kandinsky_controlnet_img2img.py
@@ -0,0 +1,303 @@
+# coding=utf-8
+# Copyright 2023 HuggingFace Inc.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import gc
+import random
+import unittest
+
+import numpy as np
+import torch
+from PIL import Image
+
+from diffusers import (
+    DDIMScheduler,
+    KandinskyV22ControlnetImg2ImgPipeline,
+    KandinskyV22PriorEmb2EmbPipeline,
+    UNet2DConditionModel,
+    VQModel,
+)
+from diffusers.utils.testing_utils import (
+    enable_full_determinism,
+    floats_tensor,
+    load_image,
+    load_numpy,
+    require_torch_gpu,
+    slow,
+    torch_device,
+)
+
+from ..test_pipelines_common import PipelineTesterMixin, assert_mean_pixel_difference
+
+
+enable_full_determinism()
+
+
+class KandinskyV22ControlnetImg2ImgPipelineFastTests(PipelineTesterMixin, unittest.TestCase):
+    pipeline_class = KandinskyV22ControlnetImg2ImgPipeline  # pipeline under test, built from small seeded models
+    params = ["image_embeds", "negative_image_embeds", "image", "hint"]
+    batch_params = ["image_embeds", "negative_image_embeds", "image", "hint"]
+    required_optional_params = [
+        "generator",
+        "height",
+        "width",
+        "strength",
+        "guidance_scale",
+        "num_inference_steps",
+        "return_dict",
+        "guidance_scale",  # NOTE(review): repeated entry, already listed above
+        "num_images_per_prompt",
+        "output_type",
+        "return_dict",  # NOTE(review): repeated entry, already listed above
+    ]
+    test_xformers_attention = False
+
+    @property
+    def text_embedder_hidden_size(self):  # width of the dummy image embeddings and the UNet encoder_hid_dim
+        return 32
+
+    @property
+    def time_input_dim(self):
+        return 32
+
+    @property
+    def block_out_channels_0(self):  # channel count of the first UNet block; the second block uses 2x this
+        return self.time_input_dim
+
+    @property
+    def time_embed_dim(self):
+        return self.time_input_dim * 4
+
+    @property
+    def cross_attention_dim(self):
+        return 100
+
+    @property
+    def dummy_unet(self):  # builds a new two-block UNet on every access
+        torch.manual_seed(0)  # seed the global RNG right before the model is constructed
+
+        model_kwargs = {
+            "in_channels": 8,
+            # 8 outputs: presumably mean + variance for 4 latent channels (not 2x "in_channels", also 8) - confirm
+            "out_channels": 8,
+            "addition_embed_type": "image_hint",
+            "down_block_types": ("ResnetDownsampleBlock2D", "SimpleCrossAttnDownBlock2D"),
+            "up_block_types": ("SimpleCrossAttnUpBlock2D", "ResnetUpsampleBlock2D"),
+            "mid_block_type": "UNetMidBlock2DSimpleCrossAttn",
+            "block_out_channels": (self.block_out_channels_0, self.block_out_channels_0 * 2),
+            "layers_per_block": 1,
+            "encoder_hid_dim": self.text_embedder_hidden_size,
+            "encoder_hid_dim_type": "image_proj",
+            "cross_attention_dim": self.cross_attention_dim,
+            "attention_head_dim": 4,
+            "resnet_time_scale_shift": "scale_shift",
+            "class_embed_type": None,
+        }
+
+        model = UNet2DConditionModel(**model_kwargs)
+        return model
+
+    @property
+    def dummy_movq_kwargs(self):  # constructor kwargs for the small VQModel used as "movq"
+        return {
+            "block_out_channels": [32, 32, 64, 64],
+            "down_block_types": [
+                "DownEncoderBlock2D",
+                "DownEncoderBlock2D",
+                "DownEncoderBlock2D",
+                "AttnDownEncoderBlock2D",
+            ],
+            "in_channels": 3,
+            "latent_channels": 4,
+            "layers_per_block": 1,
+            "norm_num_groups": 8,
+            "norm_type": "spatial",
+            "num_vq_embeddings": 12,
+            "out_channels": 3,
+            "up_block_types": ["AttnUpDecoderBlock2D", "UpDecoderBlock2D", "UpDecoderBlock2D", "UpDecoderBlock2D"],
+            "vq_embed_dim": 4,
+        }
+
+    @property
+    def dummy_movq(self):  # builds a new VQModel on every access
+        torch.manual_seed(0)  # seed the global RNG right before the model is constructed
+        model = VQModel(**self.dummy_movq_kwargs)
+        return model
+
+    def get_dummy_components(self):  # unet + DDIM scheduler + movq, keyed by pipeline argument name
+        unet = self.dummy_unet
+        movq = self.dummy_movq
+
+        ddim_config = {
+            "num_train_timesteps": 1000,
+            "beta_schedule": "linear",
+            "beta_start": 0.00085,
+            "beta_end": 0.012,
+            "clip_sample": False,
+            "set_alpha_to_one": False,
+            "steps_offset": 0,
+            "prediction_type": "epsilon",
+            "thresholding": False,
+        }
+
+        scheduler = DDIMScheduler(**ddim_config)
+
+        components = {
+            "unet": unet,
+            "scheduler": scheduler,
+            "movq": movq,
+        }
+
+        return components
+
+    def get_dummy_inputs(self, device, seed=0):  # seeded embeddings, init image, hint and generator
+        image_embeds = floats_tensor((1, self.text_embedder_hidden_size), rng=random.Random(seed)).to(device)
+        negative_image_embeds = floats_tensor((1, self.text_embedder_hidden_size), rng=random.Random(seed + 1)).to(
+            device
+        )
+        # create init_image; NOTE(review): np.uint8() truncates, so floats in [0, 1) would all become 0 - confirm
+        image = floats_tensor((1, 3, 64, 64), rng=random.Random(seed)).to(device)
+        image = image.cpu().permute(0, 2, 3, 1)[0]  # channels moved last, first batch item kept
+        init_image = Image.fromarray(np.uint8(image)).convert("RGB").resize((256, 256))
+        # create hint
+        hint = floats_tensor((1, 3, 64, 64), rng=random.Random(seed)).to(device)
+
+        if str(device).startswith("mps"):  # on mps, use the generator returned by torch.manual_seed
+            generator = torch.manual_seed(seed)
+        else:
+            generator = torch.Generator(device=device).manual_seed(seed)
+        inputs = {
+            "image": init_image,
+            "image_embeds": image_embeds,
+            "negative_image_embeds": negative_image_embeds,
+            "hint": hint,
+            "generator": generator,
+            "height": 64,
+            "width": 64,
+            "num_inference_steps": 10,
+            "guidance_scale": 7.0,
+            "strength": 0.2,
+            "output_type": "np",
+        }
+        return inputs
+
+    def test_kandinsky_controlnet_img2img(self):  # CPU run; dict and tuple outputs must match the stored slice
+        device = "cpu"
+
+        components = self.get_dummy_components()
+
+        pipe = self.pipeline_class(**components)
+        pipe = pipe.to(device)
+
+        pipe.set_progress_bar_config(disable=None)
+
+        output = pipe(**self.get_dummy_inputs(device))
+        image = output.images
+
+        image_from_tuple = pipe(
+            **self.get_dummy_inputs(device),
+            return_dict=False,
+        )[0]
+
+        image_slice = image[0, -3:, -3:, -1]  # last 3x3 corner of the last channel, first image
+        image_from_tuple_slice = image_from_tuple[0, -3:, -3:, -1]
+
+        assert image.shape == (1, 64, 64, 3)
+
+        expected_slice = np.array(
+            [0.54985034, 0.55509365, 0.52561504, 0.5570494, 0.5593818, 0.5263979, 0.50285643, 0.5069846, 0.51196736]
+        )
+        assert (
+            np.abs(image_slice.flatten() - expected_slice).max() < 1e-2
+        ), f" expected_slice {expected_slice}, but got {image_slice.flatten()}"
+        assert (
+            np.abs(image_from_tuple_slice.flatten() - expected_slice).max() < 1e-2
+        ), f" expected_slice {expected_slice}, but got {image_from_tuple_slice.flatten()}"
+
+    def test_inference_batch_single_identical(self):  # parent test, run with an explicit tolerance
+        super().test_inference_batch_single_identical(expected_max_diff=1.75e-3)
+
+    def test_float16_inference(self):  # parent test, run with an explicit tolerance
+        super().test_float16_inference(expected_max_diff=2e-1)
+
+
+@slow
+@require_torch_gpu
+class KandinskyV22ControlnetImg2ImgPipelineIntegrationTests(unittest.TestCase):  # pretrained fp16 checkpoints
+    def tearDown(self):
+        # clean up the VRAM after each test
+        super().tearDown()
+        gc.collect()
+        torch.cuda.empty_cache()
+
+    def test_kandinsky_controlnet_img2img(self):  # emb2emb prior -> controlnet img2img vs a stored array
+        expected_image = load_numpy(
+            "https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main"
+            "/kandinskyv22/kandinskyv22_controlnet_img2img_robotcat_fp16.npy"
+        )
+
+        init_image = load_image(
+            "https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main/kandinsky/cat.png"
+        )
+        init_image = init_image.resize((512, 512))
+
+        hint = load_image(
+            "https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main"
+            "/kandinskyv22/hint_image_cat.png"
+        )
+        hint = torch.from_numpy(np.array(hint)).float() / 255.0  # assumes 8-bit pixels, giving values in [0, 1]
+        hint = hint.permute(2, 0, 1).unsqueeze(0)  # move the last axis first, then add a leading batch axis
+
+        prompt = "A robot, 4k photo"
+
+        pipe_prior = KandinskyV22PriorEmb2EmbPipeline.from_pretrained(
+            "kandinsky-community/kandinsky-2-2-prior", torch_dtype=torch.float16
+        )
+        pipe_prior.to(torch_device)
+
+        pipeline = KandinskyV22ControlnetImg2ImgPipeline.from_pretrained(
+            "kandinsky-community/kandinsky-2-2-controlnet-depth", torch_dtype=torch.float16
+        )
+        pipeline = pipeline.to(torch_device)
+
+        pipeline.set_progress_bar_config(disable=None)
+
+        generator = torch.Generator(device="cpu").manual_seed(0)  # one CPU generator shared by both calls below
+
+        image_emb, zero_image_emb = pipe_prior(
+            prompt,
+            image=init_image,
+            strength=0.85,
+            generator=generator,
+            negative_prompt="",
+        ).to_tuple()
+
+        output = pipeline(
+            image=init_image,
+            image_embeds=image_emb,
+            negative_image_embeds=zero_image_emb,
+            hint=hint,
+            generator=generator,
+            num_inference_steps=100,
+            height=512,
+            width=512,
+            strength=0.5,
+            output_type="np",
+        )
+
+        image = output.images[0]
+
+        assert image.shape == (512, 512, 3)
+
+        assert_mean_pixel_difference(image, expected_image)
diff --git a/tests/pipelines/kandinsky2_2/test_kandinsky_img2img.py b/tests/pipelines/kandinsky2_2/test_kandinsky_img2img.py
new file mode 100644
index 000000000000..9a5b596def58
--- /dev/null
+++ b/tests/pipelines/kandinsky2_2/test_kandinsky_img2img.py
@@ -0,0 +1,295 @@
+# coding=utf-8
+# Copyright 2023 HuggingFace Inc.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import gc
+import random
+import unittest
+
+import numpy as np
+import torch
+from PIL import Image
+
+from diffusers import (
+    DDIMScheduler,
+    KandinskyV22Img2ImgPipeline,
+    KandinskyV22PriorPipeline,
+    UNet2DConditionModel,
+    VQModel,
+)
+from diffusers.utils.testing_utils import (
+    enable_full_determinism,
+    floats_tensor,
+    load_image,
+    load_numpy,
+    require_torch_gpu,
+    slow,
+    torch_device,
+)
+
+from ..test_pipelines_common import PipelineTesterMixin, assert_mean_pixel_difference
+
+
+enable_full_determinism()
+
+
+class Dummies:
+    @property
+    def text_embedder_hidden_size(self):
+        return 32
+
+    @property
+    def time_input_dim(self):
+        return 32
+
+    @property
+    def block_out_channels_0(self):
+        return self.time_input_dim
+
+    @property
+    def time_embed_dim(self):
+        return self.time_input_dim * 4
+
+    @property
+    def cross_attention_dim(self):
+        return 32
+
+    @property
+    def dummy_unet(self):
+        torch.manual_seed(0)
+
+        model_kwargs = {
+            "in_channels": 4,
+            # out_channels is twice in_channels because the UNet predicts both mean and variance
+            "out_channels": 8,
+            "addition_embed_type": "image",
+            "down_block_types": ("ResnetDownsampleBlock2D", "SimpleCrossAttnDownBlock2D"),
+            "up_block_types": ("SimpleCrossAttnUpBlock2D", "ResnetUpsampleBlock2D"),
+            "mid_block_type": "UNetMidBlock2DSimpleCrossAttn",
+            "block_out_channels": (self.block_out_channels_0, self.block_out_channels_0 * 2),
+            "layers_per_block": 1,
+            "encoder_hid_dim": self.text_embedder_hidden_size,
+            "encoder_hid_dim_type": "image_proj",
+            "cross_attention_dim": self.cross_attention_dim,
+            "attention_head_dim": 4,
+            "resnet_time_scale_shift": "scale_shift",
+            "class_embed_type": None,
+        }
+
+        model = UNet2DConditionModel(**model_kwargs)
+        return model
+
+    @property
+    def dummy_movq_kwargs(self):
+        return {
+            "block_out_channels": [32, 64],
+            "down_block_types": ["DownEncoderBlock2D", "AttnDownEncoderBlock2D"],
+            "in_channels": 3,
+            "latent_channels": 4,
+            "layers_per_block": 1,
+            "norm_num_groups": 8,
+            "norm_type": "spatial",
+            "num_vq_embeddings": 12,
+            "out_channels": 3,
+            "up_block_types": [
+                "AttnUpDecoderBlock2D",
+                "UpDecoderBlock2D",
+            ],
+            "vq_embed_dim": 4,
+        }
+
+    @property
+    def dummy_movq(self):
+        torch.manual_seed(0)
+        model = VQModel(**self.dummy_movq_kwargs)
+        return model
+
+    def get_dummy_components(self):
+        unet = self.dummy_unet
+        movq = self.dummy_movq
+
+        ddim_config = {
+            "num_train_timesteps": 1000,
+            "beta_schedule": "linear",
+            "beta_start": 0.00085,
+            "beta_end": 0.012,
+            "clip_sample": False,
+            "set_alpha_to_one": False,
+            "steps_offset": 0,
+            "prediction_type": "epsilon",
+            "thresholding": False,
+        }
+
+        scheduler = DDIMScheduler(**ddim_config)
+
+        components = {
+            "unet": unet,
+            "scheduler": scheduler,
+            "movq": movq,
+        }
+
+        return components
+
+    def get_dummy_inputs(self, device, seed=0):
+        image_embeds = floats_tensor((1, self.text_embedder_hidden_size), rng=random.Random(seed)).to(device)
+        negative_image_embeds = floats_tensor((1, self.text_embedder_hidden_size), rng=random.Random(seed + 1)).to(
+            device
+        )
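+        # create init_image (NOTE(review): np.uint8() truncates floats; if floats_tensor yields values in [0, 1) this image is all zeros - confirm intended)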
+        image = floats_tensor((1, 3, 64, 64), rng=random.Random(seed)).to(device)
+        image = image.cpu().permute(0, 2, 3, 1)[0]
+        init_image = Image.fromarray(np.uint8(image)).convert("RGB").resize((256, 256))
+
+        if str(device).startswith("mps"):
+            generator = torch.manual_seed(seed)
+        else:
+            generator = torch.Generator(device=device).manual_seed(seed)
+        inputs = {
+            "image": init_image,
+            "image_embeds": image_embeds,
+            "negative_image_embeds": negative_image_embeds,
+            "generator": generator,
+            "height": 64,
+            "width": 64,
+            "num_inference_steps": 10,
+            "guidance_scale": 7.0,
+            "strength": 0.2,
+            "output_type": "np",
+        }
+        return inputs
+
+
+class KandinskyV22Img2ImgPipelineFastTests(PipelineTesterMixin, unittest.TestCase):
+    pipeline_class = KandinskyV22Img2ImgPipeline
+    params = ["image_embeds", "negative_image_embeds", "image"]
+    batch_params = [
+        "image_embeds",
+        "negative_image_embeds",
+        "image",
+    ]
+    required_optional_params = [
+        "generator",
+        "height",
+        "width",
+        "strength",
+        "guidance_scale",
+        "num_inference_steps",
+        "return_dict",
+        "guidance_scale",
+        "num_images_per_prompt",
+        "output_type",
+        "return_dict",
+    ]
+    test_xformers_attention = False
+
+    def get_dummy_components(self):
+        dummies = Dummies()
+        return dummies.get_dummy_components()
+
+    def get_dummy_inputs(self, device, seed=0):
+        dummies = Dummies()
+        return dummies.get_dummy_inputs(device=device, seed=seed)
+
+    def test_kandinsky_img2img(self):
+        device = "cpu"
+
+        components = self.get_dummy_components()
+
+        pipe = self.pipeline_class(**components)
+        pipe = pipe.to(device)
+
+        pipe.set_progress_bar_config(disable=None)
+
+        output = pipe(**self.get_dummy_inputs(device))
+        image = output.images
+
+        image_from_tuple = pipe(
+            **self.get_dummy_inputs(device),
+            return_dict=False,
+        )[0]
+
+        image_slice = image[0, -3:, -3:, -1]
+        image_from_tuple_slice = image_from_tuple[0, -3:, -3:, -1]
+
+        assert image.shape == (1, 64, 64, 3)
+
+        expected_slice = np.array([0.5712, 0.5443, 0.4725, 0.6195, 0.5184, 0.4651, 0.4473, 0.4590, 0.5016])
+        assert (
+            np.abs(image_slice.flatten() - expected_slice).max() < 1e-2
+        ), f" expected_slice {expected_slice}, but got {image_slice.flatten()}"
+        assert (
+            np.abs(image_from_tuple_slice.flatten() - expected_slice).max() < 1e-2
+        ), f" expected_slice {expected_slice}, but got {image_from_tuple_slice.flatten()}"
+
+    def test_float16_inference(self):
+        super().test_float16_inference(expected_max_diff=2e-1)
+
+
+@slow
+@require_torch_gpu
+class KandinskyV22Img2ImgPipelineIntegrationTests(unittest.TestCase):
+    def tearDown(self):
+        # clean up the VRAM after each test
+        super().tearDown()
+        gc.collect()
+        torch.cuda.empty_cache()
+
+    def test_kandinsky_img2img(self):
+        expected_image = load_numpy(
+            "https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main"
+            "/kandinskyv22/kandinskyv22_img2img_frog.npy"
+        )
+
+        init_image = load_image(
+            "https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main" "/kandinsky/cat.png"
+        )
+        prompt = "A red cartoon frog, 4k"
+
+        pipe_prior = KandinskyV22PriorPipeline.from_pretrained(
+            "kandinsky-community/kandinsky-2-2-prior", torch_dtype=torch.float16
+        )
+        pipe_prior.to(torch_device)
+
+        pipeline = KandinskyV22Img2ImgPipeline.from_pretrained(
+            "kandinsky-community/kandinsky-2-2-decoder", torch_dtype=torch.float16
+        )
+        pipeline = pipeline.to(torch_device)
+
+        pipeline.set_progress_bar_config(disable=None)
+
+        generator = torch.Generator(device="cpu").manual_seed(0)
+        image_emb, zero_image_emb = pipe_prior(
+            prompt,
+            generator=generator,
+            num_inference_steps=5,
+            negative_prompt="",
+        ).to_tuple()
+
+        output = pipeline(
+            image=init_image,
+            image_embeds=image_emb,
+            negative_image_embeds=zero_image_emb,
+            generator=generator,
+            num_inference_steps=100,
+            height=768,
+            width=768,
+            strength=0.2,
+            output_type="np",
+        )
+
+        image = output.images[0]
+
+        assert image.shape == (768, 768, 3)
+
+        assert_mean_pixel_difference(image, expected_image)
diff --git a/tests/pipelines/kandinsky2_2/test_kandinsky_inpaint.py b/tests/pipelines/kandinsky2_2/test_kandinsky_inpaint.py
new file mode 100644
index 000000000000..f40ec0d1f070
--- /dev/null
+++ b/tests/pipelines/kandinsky2_2/test_kandinsky_inpaint.py
@@ -0,0 +1,314 @@
+# coding=utf-8
+# Copyright 2023 HuggingFace Inc.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import gc
+import random
+import unittest
+
+import numpy as np
+import torch
+from PIL import Image
+
+from diffusers import (
+    DDIMScheduler,
+    KandinskyV22InpaintPipeline,
+    KandinskyV22PriorPipeline,
+    UNet2DConditionModel,
+    VQModel,
+)
+from diffusers.utils.testing_utils import (
+    enable_full_determinism,
+    floats_tensor,
+    load_image,
+    load_numpy,
+    require_torch_gpu,
+    slow,
+    torch_device,
+)
+
+from ..test_pipelines_common import PipelineTesterMixin, assert_mean_pixel_difference
+
+
+enable_full_determinism()
+
+
+class Dummies:
+    @property
+    def text_embedder_hidden_size(self):
+        return 32
+
+    @property
+    def time_input_dim(self):
+        return 32
+
+    @property
+    def block_out_channels_0(self):
+        return self.time_input_dim
+
+    @property
+    def time_embed_dim(self):
+        return self.time_input_dim * 4
+
+    @property
+    def cross_attention_dim(self):
+        return 32
+
+    @property
+    def dummy_unet(self):
+        torch.manual_seed(0)
+
+        model_kwargs = {
+            "in_channels": 9,
+            # out_channels is twice the 4 latent channels (not in_channels, which is 9 here) because the UNet predicts both mean and variance
+            "out_channels": 8,
+            "addition_embed_type": "image",
+            "down_block_types": ("ResnetDownsampleBlock2D", "SimpleCrossAttnDownBlock2D"),
+            "up_block_types": ("SimpleCrossAttnUpBlock2D", "ResnetUpsampleBlock2D"),
+            "mid_block_type": "UNetMidBlock2DSimpleCrossAttn",
+            "block_out_channels": (self.block_out_channels_0, self.block_out_channels_0 * 2),
+            "layers_per_block": 1,
+            "encoder_hid_dim": self.text_embedder_hidden_size,
+            "encoder_hid_dim_type": "image_proj",
+            "cross_attention_dim": self.cross_attention_dim,
+            "attention_head_dim": 4,
+            "resnet_time_scale_shift": "scale_shift",
+            "class_embed_type": None,
+        }
+
+        model = UNet2DConditionModel(**model_kwargs)
+        return model
+
+    @property
+    def dummy_movq_kwargs(self):
+        return {
+            "block_out_channels": [32, 64],
+            "down_block_types": ["DownEncoderBlock2D", "AttnDownEncoderBlock2D"],
+            "in_channels": 3,
+            "latent_channels": 4,
+            "layers_per_block": 1,
+            "norm_num_groups": 8,
+            "norm_type": "spatial",
+            "num_vq_embeddings": 12,
+            "out_channels": 3,
+            "up_block_types": [
+                "AttnUpDecoderBlock2D",
+                "UpDecoderBlock2D",
+            ],
+            "vq_embed_dim": 4,
+        }
+
+    @property
+    def dummy_movq(self):
+        torch.manual_seed(0)
+        model = VQModel(**self.dummy_movq_kwargs)
+        return model
+
+    def get_dummy_components(self):
+        unet = self.dummy_unet
+        movq = self.dummy_movq
+
+        scheduler = DDIMScheduler(
+            num_train_timesteps=1000,
+            beta_schedule="linear",
+            beta_start=0.00085,
+            beta_end=0.012,
+            clip_sample=False,
+            set_alpha_to_one=False,
+            steps_offset=1,
+            prediction_type="epsilon",
+            thresholding=False,
+        )
+
+        components = {
+            "unet": unet,
+            "scheduler": scheduler,
+            "movq": movq,
+        }
+
+        return components
+
+    def get_dummy_inputs(self, device, seed=0):
+        image_embeds = floats_tensor((1, self.text_embedder_hidden_size), rng=random.Random(seed)).to(device)
+        negative_image_embeds = floats_tensor((1, self.text_embedder_hidden_size), rng=random.Random(seed + 1)).to(
+            device
+        )
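+        # create init_image (NOTE(review): np.uint8() truncates floats; if floats_tensor yields values in [0, 1) this image is all zeros - confirm intended)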
+        image = floats_tensor((1, 3, 64, 64), rng=random.Random(seed)).to(device)
+        image = image.cpu().permute(0, 2, 3, 1)[0]
+        init_image = Image.fromarray(np.uint8(image)).convert("RGB").resize((256, 256))
+        # create mask
+        mask = np.zeros((64, 64), dtype=np.float32)
+        mask[:32, :32] = 1
+
+        if str(device).startswith("mps"):
+            generator = torch.manual_seed(seed)
+        else:
+            generator = torch.Generator(device=device).manual_seed(seed)
+        inputs = {
+            "image": init_image,
+            "mask_image": mask,
+            "image_embeds": image_embeds,
+            "negative_image_embeds": negative_image_embeds,
+            "generator": generator,
+            "height": 64,
+            "width": 64,
+            "num_inference_steps": 2,
+            "guidance_scale": 4.0,
+            "output_type": "np",
+        }
+        return inputs
+
+
+class KandinskyV22InpaintPipelineFastTests(PipelineTesterMixin, unittest.TestCase):
+    pipeline_class = KandinskyV22InpaintPipeline
+    params = ["image_embeds", "negative_image_embeds", "image", "mask_image"]
+    batch_params = [
+        "image_embeds",
+        "negative_image_embeds",
+        "image",
+        "mask_image",
+    ]
+    required_optional_params = [
+        "generator",
+        "height",
+        "width",
+        "latents",
+        "guidance_scale",
+        "num_inference_steps",
+        "return_dict",
+        "guidance_scale",
+        "num_images_per_prompt",
+        "output_type",
+        "return_dict",
+    ]
+    test_xformers_attention = False
+
+    def get_dummy_components(self):
+        dummies = Dummies()
+        return dummies.get_dummy_components()
+
+    def get_dummy_inputs(self, device, seed=0):
+        dummies = Dummies()
+        return dummies.get_dummy_inputs(device=device, seed=seed)
+
+    def test_kandinsky_inpaint(self):
+        device = "cpu"
+
+        components = self.get_dummy_components()
+
+        pipe = self.pipeline_class(**components)
+        pipe = pipe.to(device)
+
+        pipe.set_progress_bar_config(disable=None)
+
+        output = pipe(**self.get_dummy_inputs(device))
+        image = output.images
+
+        image_from_tuple = pipe(
+            **self.get_dummy_inputs(device),
+            return_dict=False,
+        )[0]
+
+        image_slice = image[0, -3:, -3:, -1]
+        image_from_tuple_slice = image_from_tuple[0, -3:, -3:, -1]
+
+        assert image.shape == (1, 64, 64, 3)
+
+        expected_slice = np.array(
+            [0.50775903, 0.49527195, 0.48824543, 0.50192237, 0.48644906, 0.49373814, 0.4780598, 0.47234827, 0.48327848]
+        )
+
+        assert (
+            np.abs(image_slice.flatten() - expected_slice).max() < 1e-2
+        ), f" expected_slice {expected_slice}, but got {image_slice.flatten()}"
+        assert (
+            np.abs(image_from_tuple_slice.flatten() - expected_slice).max() < 1e-2
+        ), f" expected_slice {expected_slice}, but got {image_from_tuple_slice.flatten()}"
+
+    def test_inference_batch_single_identical(self):
+        super().test_inference_batch_single_identical(expected_max_diff=3e-3)
+
+    def test_float16_inference(self):
+        super().test_float16_inference(expected_max_diff=5e-1)
+
+    def test_model_cpu_offload_forward_pass(self):  # run the mixin's CPU-offload check with a 5e-4 tolerance
+        super().test_model_cpu_offload_forward_pass(expected_max_diff=5e-4)
+
+    def test_save_load_optional_components(self):
+        super().test_save_load_optional_components(expected_max_difference=5e-4)
+
+    def test_sequential_cpu_offload_forward_pass(self):
+        super().test_sequential_cpu_offload_forward_pass(expected_max_diff=5e-4)
+
+
+@slow
+@require_torch_gpu
+class KandinskyV22InpaintPipelineIntegrationTests(unittest.TestCase):
+    def tearDown(self):
+        # clean up the VRAM after each test
+        super().tearDown()
+        gc.collect()
+        torch.cuda.empty_cache()
+
+    def test_kandinsky_inpaint(self):
+        expected_image = load_numpy(
+            "https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main"
+            "/kandinskyv22/kandinskyv22_inpaint_cat_with_hat_fp16.npy"
+        )
+
+        init_image = load_image(
+            "https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main" "/kandinsky/cat.png"
+        )
+        mask = np.zeros((768, 768), dtype=np.float32)
+        mask[:250, 250:-250] = 1
+
+        prompt = "a hat"
+
+        pipe_prior = KandinskyV22PriorPipeline.from_pretrained(
+            "kandinsky-community/kandinsky-2-2-prior", torch_dtype=torch.float16
+        )
+        pipe_prior.to(torch_device)
+
+        pipeline = KandinskyV22InpaintPipeline.from_pretrained(
+            "kandinsky-community/kandinsky-2-2-decoder-inpaint", torch_dtype=torch.float16
+        )
+        pipeline = pipeline.to(torch_device)
+        pipeline.set_progress_bar_config(disable=None)
+
+        generator = torch.Generator(device="cpu").manual_seed(0)
+        image_emb, zero_image_emb = pipe_prior(
+            prompt,
+            generator=generator,
+            num_inference_steps=5,
+            negative_prompt="",
+        ).to_tuple()
+
+        output = pipeline(
+            image=init_image,
+            mask_image=mask,
+            image_embeds=image_emb,
+            negative_image_embeds=zero_image_emb,
+            generator=generator,
+            num_inference_steps=100,
+            height=768,
+            width=768,
+            output_type="np",
+        )
+
+        image = output.images[0]
+
+        assert image.shape == (768, 768, 3)
+
+        assert_mean_pixel_difference(image, expected_image)
diff --git a/tests/pipelines/kandinsky2_2/test_kandinsky_prior.py b/tests/pipelines/kandinsky2_2/test_kandinsky_prior.py
new file mode 100644
index 000000000000..a0de5cceeb75
--- /dev/null
+++ b/tests/pipelines/kandinsky2_2/test_kandinsky_prior.py
@@ -0,0 +1,237 @@
+# coding=utf-8
+# Copyright 2023 HuggingFace Inc.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import unittest
+
+import numpy as np
+import torch
+from torch import nn
+from transformers import (
+    CLIPImageProcessor,
+    CLIPTextConfig,
+    CLIPTextModelWithProjection,
+    CLIPTokenizer,
+    CLIPVisionConfig,
+    CLIPVisionModelWithProjection,
+)
+
+from diffusers import KandinskyV22PriorPipeline, PriorTransformer, UnCLIPScheduler
+from diffusers.utils.testing_utils import enable_full_determinism, skip_mps, torch_device
+
+from ..test_pipelines_common import PipelineTesterMixin
+
+
+enable_full_determinism()
+
+
+class Dummies:
+    @property
+    def text_embedder_hidden_size(self):
+        return 32
+
+    @property
+    def time_input_dim(self):
+        return 32
+
+    @property
+    def block_out_channels_0(self):
+        return self.time_input_dim
+
+    @property
+    def time_embed_dim(self):
+        return self.time_input_dim * 4
+
+    @property
+    def cross_attention_dim(self):
+        return 100
+
+    @property
+    def dummy_tokenizer(self):
+        tokenizer = CLIPTokenizer.from_pretrained("hf-internal-testing/tiny-random-clip")
+        return tokenizer
+
+    @property
+    def dummy_text_encoder(self):
+        torch.manual_seed(0)
+        config = CLIPTextConfig(
+            bos_token_id=0,
+            eos_token_id=2,
+            hidden_size=self.text_embedder_hidden_size,
+            projection_dim=self.text_embedder_hidden_size,
+            intermediate_size=37,
+            layer_norm_eps=1e-05,
+            num_attention_heads=4,
+            num_hidden_layers=5,
+            pad_token_id=1,
+            vocab_size=1000,
+        )
+        return CLIPTextModelWithProjection(config)
+
+    @property
+    def dummy_prior(self):
+        torch.manual_seed(0)
+
+        model_kwargs = {
+            "num_attention_heads": 2,
+            "attention_head_dim": 12,
+            "embedding_dim": self.text_embedder_hidden_size,
+            "num_layers": 1,
+        }
+
+        model = PriorTransformer(**model_kwargs)
+        # clip_std and clip_mean are initialized to 0, so PriorTransformer.post_process_latents would always return 0; set clip_std to 1 so it doesn't
+        model.clip_std = nn.Parameter(torch.ones(model.clip_std.shape))
+        return model
+
+    @property
+    def dummy_image_encoder(self):
+        torch.manual_seed(0)
+        config = CLIPVisionConfig(
+            hidden_size=self.text_embedder_hidden_size,
+            image_size=224,
+            projection_dim=self.text_embedder_hidden_size,
+            intermediate_size=37,
+            num_attention_heads=4,
+            num_channels=3,
+            num_hidden_layers=5,
+            patch_size=14,
+        )
+
+        model = CLIPVisionModelWithProjection(config)
+        return model
+
+    @property
+    def dummy_image_processor(self):
+        image_processor = CLIPImageProcessor(
+            crop_size=224,
+            do_center_crop=True,
+            do_normalize=True,
+            do_resize=True,
+            image_mean=[0.48145466, 0.4578275, 0.40821073],
+            image_std=[0.26862954, 0.26130258, 0.27577711],
+            resample=3,
+            size=224,
+        )
+
+        return image_processor
+
+    def get_dummy_components(self):
+        prior = self.dummy_prior
+        image_encoder = self.dummy_image_encoder
+        text_encoder = self.dummy_text_encoder
+        tokenizer = self.dummy_tokenizer
+        image_processor = self.dummy_image_processor
+
+        scheduler = UnCLIPScheduler(
+            variance_type="fixed_small_log",
+            prediction_type="sample",
+            num_train_timesteps=1000,
+            clip_sample=True,
+            clip_sample_range=10.0,
+        )
+
+        components = {
+            "prior": prior,
+            "image_encoder": image_encoder,
+            "text_encoder": text_encoder,
+            "tokenizer": tokenizer,
+            "scheduler": scheduler,
+            "image_processor": image_processor,
+        }
+
+        return components
+
+    def get_dummy_inputs(self, device, seed=0):
+        if str(device).startswith("mps"):
+            generator = torch.manual_seed(seed)
+        else:
+            generator = torch.Generator(device=device).manual_seed(seed)
+        inputs = {
+            "prompt": "horse",
+            "generator": generator,
+            "guidance_scale": 4.0,
+            "num_inference_steps": 2,
+            "output_type": "np",
+        }
+        return inputs
+
+
+class KandinskyV22PriorPipelineFastTests(PipelineTesterMixin, unittest.TestCase):
+    pipeline_class = KandinskyV22PriorPipeline
+    params = ["prompt"]
+    batch_params = ["prompt", "negative_prompt"]
+    required_optional_params = [
+        "num_images_per_prompt",
+        "generator",
+        "num_inference_steps",
+        "latents",
+        "negative_prompt",
+        "guidance_scale",
+        "output_type",
+        "return_dict",
+    ]
+    test_xformers_attention = False
+
+    def get_dummy_components(self):
+        dummies = Dummies()
+        return dummies.get_dummy_components()
+
+    def get_dummy_inputs(self, device, seed=0):
+        dummies = Dummies()
+        return dummies.get_dummy_inputs(device=device, seed=seed)
+
+    def test_kandinsky_prior(self):
+        device = "cpu"
+
+        components = self.get_dummy_components()
+
+        pipe = self.pipeline_class(**components)
+        pipe = pipe.to(device)
+
+        pipe.set_progress_bar_config(disable=None)
+
+        output = pipe(**self.get_dummy_inputs(device))
+        image = output.image_embeds
+
+        image_from_tuple = pipe(
+            **self.get_dummy_inputs(device),
+            return_dict=False,
+        )[0]
+
+        image_slice = image[0, -10:]
+        image_from_tuple_slice = image_from_tuple[0, -10:]
+
+        assert image.shape == (1, 32)
+
+        expected_slice = np.array(
+            [-0.0532, 1.7120, 0.3656, -1.0852, -0.8946, -1.1756, 0.4348, 0.2482, 0.5146, -0.1156]
+        )
+
+        assert np.abs(image_slice.flatten() - expected_slice).max() < 1e-2
+        assert np.abs(image_from_tuple_slice.flatten() - expected_slice).max() < 1e-2
+
+    @skip_mps
+    def test_inference_batch_single_identical(self):
+        self._test_inference_batch_single_identical(expected_max_diff=1e-3)
+
+    @skip_mps
+    def test_attention_slicing_forward_pass(self):
+        test_max_difference = torch_device == "cpu"
+        test_mean_pixel_difference = False
+
+        self._test_attention_slicing_forward_pass(
+            test_max_difference=test_max_difference,
+            test_mean_pixel_difference=test_mean_pixel_difference,
+        )
diff --git a/tests/pipelines/kandinsky2_2/test_kandinsky_prior_emb2emb.py b/tests/pipelines/kandinsky2_2/test_kandinsky_prior_emb2emb.py
new file mode 100644
index 000000000000..89b603e9fc1d
--- /dev/null
+++ b/tests/pipelines/kandinsky2_2/test_kandinsky_prior_emb2emb.py
@@ -0,0 +1,247 @@
+# coding=utf-8
+# Copyright 2023 HuggingFace Inc.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import random
+import unittest
+
+import numpy as np
+import torch
+from PIL import Image
+from torch import nn
+from transformers import (
+    CLIPImageProcessor,
+    CLIPTextConfig,
+    CLIPTextModelWithProjection,
+    CLIPTokenizer,
+    CLIPVisionConfig,
+    CLIPVisionModelWithProjection,
+)
+
+from diffusers import KandinskyV22PriorEmb2EmbPipeline, PriorTransformer, UnCLIPScheduler
+from diffusers.utils.testing_utils import enable_full_determinism, floats_tensor, skip_mps, torch_device
+
+from ..test_pipelines_common import PipelineTesterMixin
+
+
+enable_full_determinism()
+
+
+class KandinskyV22PriorEmb2EmbPipelineFastTests(PipelineTesterMixin, unittest.TestCase):
+    pipeline_class = KandinskyV22PriorEmb2EmbPipeline
+    params = ["prompt", "image"]
+    batch_params = ["prompt", "image"]
+    required_optional_params = [
+        "num_images_per_prompt",
+        "strength",
+        "generator",
+        "num_inference_steps",
+        "negative_prompt",
+        "guidance_scale",
+        "output_type",
+        "return_dict",
+    ]
+    test_xformers_attention = False
+
+    @property
+    def text_embedder_hidden_size(self):
+        return 32
+
+    @property
+    def time_input_dim(self):
+        return 32
+
+    @property
+    def block_out_channels_0(self):
+        return self.time_input_dim
+
+    @property
+    def time_embed_dim(self):
+        return self.time_input_dim * 4
+
+    @property
+    def cross_attention_dim(self):
+        return 100
+
+    @property
+    def dummy_tokenizer(self):
+        tokenizer = CLIPTokenizer.from_pretrained("hf-internal-testing/tiny-random-clip")
+        return tokenizer
+
+    @property
+    def dummy_text_encoder(self):
+        torch.manual_seed(0)
+        config = CLIPTextConfig(
+            bos_token_id=0,
+            eos_token_id=2,
+            hidden_size=self.text_embedder_hidden_size,
+            projection_dim=self.text_embedder_hidden_size,
+            intermediate_size=37,
+            layer_norm_eps=1e-05,
+            num_attention_heads=4,
+            num_hidden_layers=5,
+            pad_token_id=1,
+            vocab_size=1000,
+        )
+        return CLIPTextModelWithProjection(config)
+
+    @property
+    def dummy_prior(self):
+        torch.manual_seed(0)
+
+        model_kwargs = {
+            "num_attention_heads": 2,
+            "attention_head_dim": 12,
+            "embedding_dim": self.text_embedder_hidden_size,
+            "num_layers": 1,
+        }
+
+        model = PriorTransformer(**model_kwargs)
+        # clip_std and clip_mean are initialized to 0, so PriorTransformer.post_process_latents would always return 0; set clip_std to 1 so it doesn't
+        model.clip_std = nn.Parameter(torch.ones(model.clip_std.shape))
+        return model
+
+    @property
+    def dummy_image_encoder(self):
+        torch.manual_seed(0)
+        config = CLIPVisionConfig(
+            hidden_size=self.text_embedder_hidden_size,
+            image_size=224,
+            projection_dim=self.text_embedder_hidden_size,
+            intermediate_size=37,
+            num_attention_heads=4,
+            num_channels=3,
+            num_hidden_layers=5,
+            patch_size=14,
+        )
+
+        model = CLIPVisionModelWithProjection(config)
+        return model
+
+    @property
+    def dummy_image_processor(self):
+        image_processor = CLIPImageProcessor(
+            crop_size=224,
+            do_center_crop=True,
+            do_normalize=True,
+            do_resize=True,
+            image_mean=[0.48145466, 0.4578275, 0.40821073],
+            image_std=[0.26862954, 0.26130258, 0.27577711],
+            resample=3,
+            size=224,
+        )
+
+        return image_processor
+
+    def get_dummy_components(self):
+        prior = self.dummy_prior
+        image_encoder = self.dummy_image_encoder
+        text_encoder = self.dummy_text_encoder
+        tokenizer = self.dummy_tokenizer
+        image_processor = self.dummy_image_processor
+
+        scheduler = UnCLIPScheduler(
+            variance_type="fixed_small_log",
+            prediction_type="sample",
+            num_train_timesteps=1000,
+            clip_sample=True,
+            clip_sample_range=10.0,
+        )
+
+        components = {
+            "prior": prior,
+            "image_encoder": image_encoder,
+            "text_encoder": text_encoder,
+            "tokenizer": tokenizer,
+            "scheduler": scheduler,
+            "image_processor": image_processor,
+        }
+
+        return components
+
+    def get_dummy_inputs(self, device, seed=0):
+        if str(device).startswith("mps"):
+            generator = torch.manual_seed(seed)
+        else:
+            generator = torch.Generator(device=device).manual_seed(seed)
+
+        image = floats_tensor((1, 3, 64, 64), rng=random.Random(seed)).to(device)
+        image = image.cpu().permute(0, 2, 3, 1)[0]
+        init_image = Image.fromarray(np.uint8(image)).convert("RGB").resize((256, 256))
+
+        inputs = {
+            "prompt": "horse",
+            "image": init_image,
+            "strength": 0.5,
+            "generator": generator,
+            "guidance_scale": 4.0,
+            "num_inference_steps": 2,
+            "output_type": "np",
+        }
+        return inputs
+
+    def test_kandinsky_prior_emb2emb(self):
+        device = "cpu"
+
+        components = self.get_dummy_components()
+
+        pipe = self.pipeline_class(**components)
+        pipe = pipe.to(device)
+
+        pipe.set_progress_bar_config(disable=None)
+
+        output = pipe(**self.get_dummy_inputs(device))
+        image = output.image_embeds
+
+        image_from_tuple = pipe(
+            **self.get_dummy_inputs(device),
+            return_dict=False,
+        )[0]
+
+        image_slice = image[0, -10:]
+        image_from_tuple_slice = image_from_tuple[0, -10:]
+
+        assert image.shape == (1, 32)
+
+        expected_slice = np.array(
+            [
+                0.1071284,
+                1.3330271,
+                0.61260223,
+                -0.6691065,
+                -0.3846852,
+                -1.0303661,
+                0.22716111,
+                0.03348901,
+                0.30040675,
+                -0.24805029,
+            ]
+        )
+
+        assert np.abs(image_slice.flatten() - expected_slice).max() < 1e-2
+        assert np.abs(image_from_tuple_slice.flatten() - expected_slice).max() < 1e-2
+
+    @skip_mps
+    def test_inference_batch_single_identical(self):
+        self._test_inference_batch_single_identical(expected_max_diff=1e-2)
+
+    @skip_mps
+    def test_attention_slicing_forward_pass(self):
+        test_max_difference = torch_device == "cpu"
+        test_mean_pixel_difference = False
+
+        self._test_attention_slicing_forward_pass(
+            test_max_difference=test_max_difference,
+            test_mean_pixel_difference=test_mean_pixel_difference,
+        )
diff --git a/tests/pipelines/text_to_video_synthesis/__init__.py b/tests/pipelines/text_to_video_synthesis/__init__.py
new file mode 100644
index 000000000000..e69de29bb2d1
diff --git a/tests/pipelines/text_to_video_synthesis/test_text_to_video.py b/tests/pipelines/text_to_video_synthesis/test_text_to_video.py
new file mode 100644
index 000000000000..2c47dc492da1
--- /dev/null
+++ b/tests/pipelines/text_to_video_synthesis/test_text_to_video.py
@@ -0,0 +1,195 @@
+# coding=utf-8
+# Copyright 2023 HuggingFace Inc.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import unittest
+
+import numpy as np
+import torch
+from transformers import CLIPTextConfig, CLIPTextModel, CLIPTokenizer
+
+from diffusers import (
+    AutoencoderKL,
+    DDIMScheduler,
+    TextToVideoSDPipeline,
+    UNet3DConditionModel,
+)
+from diffusers.utils import is_xformers_available
+from diffusers.utils.testing_utils import (
+    enable_full_determinism,
+    load_numpy,
+    require_torch_gpu,
+    skip_mps,
+    slow,
+    torch_device,
+)
+
+from ..pipeline_params import TEXT_TO_IMAGE_BATCH_PARAMS, TEXT_TO_IMAGE_PARAMS
+from ..test_pipelines_common import PipelineTesterMixin
+
+
+enable_full_determinism()
+
+
+@skip_mps
+class TextToVideoSDPipelineFastTests(PipelineTesterMixin, unittest.TestCase):
+    pipeline_class = TextToVideoSDPipeline
+    params = TEXT_TO_IMAGE_PARAMS
+    batch_params = TEXT_TO_IMAGE_BATCH_PARAMS
+    # No `output_type`.
+    required_optional_params = frozenset(
+        [
+            "num_inference_steps",
+            "generator",
+            "latents",
+            "return_dict",
+            "callback",
+            "callback_steps",
+        ]
+    )
+
+    def get_dummy_components(self):
+        torch.manual_seed(0)
+        unet = UNet3DConditionModel(
+            block_out_channels=(32, 32),
+            layers_per_block=2,
+            sample_size=32,
+            in_channels=4,
+            out_channels=4,
+            down_block_types=("CrossAttnDownBlock3D", "DownBlock3D"),
+            up_block_types=("UpBlock3D", "CrossAttnUpBlock3D"),
+            cross_attention_dim=4,
+            attention_head_dim=4,
+        )
+        scheduler = DDIMScheduler(
+            beta_start=0.00085,
+            beta_end=0.012,
+            beta_schedule="scaled_linear",
+            clip_sample=False,
+            set_alpha_to_one=False,
+        )
+        torch.manual_seed(0)
+        vae = AutoencoderKL(
+            block_out_channels=(32,),
+            in_channels=3,
+            out_channels=3,
+            down_block_types=["DownEncoderBlock2D"],
+            up_block_types=["UpDecoderBlock2D"],
+            latent_channels=4,
+            sample_size=32,
+        )
+        torch.manual_seed(0)
+        text_encoder_config = CLIPTextConfig(
+            bos_token_id=0,
+            eos_token_id=2,
+            hidden_size=4,
+            intermediate_size=16,
+            layer_norm_eps=1e-05,
+            num_attention_heads=2,
+            num_hidden_layers=2,
+            pad_token_id=1,
+            vocab_size=1000,
+            hidden_act="gelu",
+            projection_dim=32,
+        )
+        text_encoder = CLIPTextModel(text_encoder_config)
+        tokenizer = CLIPTokenizer.from_pretrained("hf-internal-testing/tiny-random-clip")
+
+        components = {
+            "unet": unet,
+            "scheduler": scheduler,
+            "vae": vae,
+            "text_encoder": text_encoder,
+            "tokenizer": tokenizer,
+        }
+        return components
+
+    def get_dummy_inputs(self, device, seed=0):
+        if str(device).startswith("mps"):
+            generator = torch.manual_seed(seed)
+        else:
+            generator = torch.Generator(device=device).manual_seed(seed)
+        inputs = {
+            "prompt": "A painting of a squirrel eating a burger",
+            "generator": generator,
+            "num_inference_steps": 2,
+            "guidance_scale": 6.0,
+            "output_type": "pt",
+        }
+        return inputs
+
+    def test_text_to_video_default_case(self):
+        device = "cpu"  # ensure determinism for the device-dependent torch.Generator
+        components = self.get_dummy_components()
+        sd_pipe = TextToVideoSDPipeline(**components)
+        sd_pipe = sd_pipe.to(device)
+        sd_pipe.set_progress_bar_config(disable=None)
+
+        inputs = self.get_dummy_inputs(device)
+        inputs["output_type"] = "np"
+        frames = sd_pipe(**inputs).frames
+        image_slice = frames[0][-3:, -3:, -1]
+
+        assert frames[0].shape == (32, 32, 3)
+        expected_slice = np.array([91.0, 152.0, 66.0, 192.0, 94.0, 126.0, 101.0, 123.0, 152.0])
+
+        assert np.abs(image_slice.flatten() - expected_slice).max() < 1e-2
+
+    def test_attention_slicing_forward_pass(self):
+        self._test_attention_slicing_forward_pass(test_mean_pixel_difference=False, expected_max_diff=3e-3)
+
+    @unittest.skipIf(
+        torch_device != "cuda" or not is_xformers_available(),
+        reason="XFormers attention is only available with CUDA and `xformers` installed",
+    )
+    def test_xformers_attention_forwardGenerator_pass(self):
+        self._test_xformers_attention_forwardGenerator_pass(test_mean_pixel_difference=False, expected_max_diff=1e-2)
+
+    # TODO(sayakpaul): figure out batching for this pipeline, then enable this test.
+    @unittest.skip(reason="Batching needs to be properly figured out first for this pipeline.")
+    def test_inference_batch_consistent(self):
+        pass
+
+    # TODO(sayakpaul): figure out batching for this pipeline, then enable this test.
+    @unittest.skip(reason="Batching needs to be properly figured out first for this pipeline.")
+    def test_inference_batch_single_identical(self):
+        pass
+
+    @unittest.skip(reason="`num_images_per_prompt` argument is not supported for this pipeline.")
+    def test_num_images_per_prompt(self):
+        pass
+
+    def test_progress_bar(self):
+        return super().test_progress_bar()
+
+
+@slow
+@skip_mps
+@require_torch_gpu
+class TextToVideoSDPipelineSlowTests(unittest.TestCase):
+    def test_two_step_model(self):
+        expected_video = load_numpy(
+            "https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main/text_to_video/video_2step.npy"
+        )
+
+        pipe = TextToVideoSDPipeline.from_pretrained("damo-vilab/text-to-video-ms-1.7b")
+        pipe = pipe.to(torch_device)
+
+        prompt = "Spiderman is surfing"
+        generator = torch.Generator(device="cpu").manual_seed(0)
+
+        video_frames = pipe(prompt, generator=generator, num_inference_steps=2, output_type="pt").frames
+        video = video_frames.cpu().numpy()
+
+        assert np.abs(expected_video - video).mean() < 5e-2
diff --git a/tests/pipelines/text_to_video_synthesis/test_text_to_video_zero.py b/tests/pipelines/text_to_video_synthesis/test_text_to_video_zero.py
new file mode 100644
index 000000000000..02fb43a0b65b
--- /dev/null
+++ b/tests/pipelines/text_to_video_synthesis/test_text_to_video_zero.py
@@ -0,0 +1,42 @@
+# coding=utf-8
+# Copyright 2023 HuggingFace Inc.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import unittest
+
+import torch
+
+from diffusers import DDIMScheduler, TextToVideoZeroPipeline
+from diffusers.utils.testing_utils import load_pt, require_torch_gpu, slow
+
+from ..test_pipelines_common import assert_mean_pixel_difference
+
+
+@slow
+@require_torch_gpu
+class TextToVideoZeroPipelineSlowTests(unittest.TestCase):
+    def test_full_model(self):
+        model_id = "runwayml/stable-diffusion-v1-5"
+        pipe = TextToVideoZeroPipeline.from_pretrained(model_id, torch_dtype=torch.float16).to("cuda")
+        pipe.scheduler = DDIMScheduler.from_config(pipe.scheduler.config)
+        generator = torch.Generator(device="cuda").manual_seed(0)
+
+        prompt = "A bear is playing a guitar on Times Square"
+        result = pipe(prompt=prompt, generator=generator).images
+
+        expected_result = load_pt(
+            "https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main/text-to-video/A bear is playing a guitar on Times Square.pt"
+        )
+
+        assert_mean_pixel_difference(result, expected_result)
diff --git a/tests/pipelines/text_to_video_synthesis/test_video_to_video.py b/tests/pipelines/text_to_video_synthesis/test_video_to_video.py
new file mode 100644
index 000000000000..f057eb34997e
--- /dev/null
+++ b/tests/pipelines/text_to_video_synthesis/test_video_to_video.py
@@ -0,0 +1,204 @@
+# coding=utf-8
+# Copyright 2023 HuggingFace Inc.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import random
+import unittest
+
+import numpy as np
+import torch
+from transformers import CLIPTextConfig, CLIPTextModel, CLIPTokenizer
+
+from diffusers import (
+    AutoencoderKL,
+    DDIMScheduler,
+    UNet3DConditionModel,
+    VideoToVideoSDPipeline,
+)
+from diffusers.utils import is_xformers_available
+from diffusers.utils.testing_utils import (
+    enable_full_determinism,
+    floats_tensor,
+    skip_mps,
+    slow,
+    torch_device,
+)
+
+from ..pipeline_params import (
+    TEXT_GUIDED_IMAGE_VARIATION_BATCH_PARAMS,
+    TEXT_GUIDED_IMAGE_VARIATION_PARAMS,
+)
+from ..test_pipelines_common import PipelineTesterMixin
+
+
+enable_full_determinism()
+
+
+@skip_mps
+class VideoToVideoSDPipelineFastTests(PipelineTesterMixin, unittest.TestCase):
+    pipeline_class = VideoToVideoSDPipeline
+    params = TEXT_GUIDED_IMAGE_VARIATION_PARAMS.union({"video"}) - {"image", "width", "height"}
+    batch_params = TEXT_GUIDED_IMAGE_VARIATION_BATCH_PARAMS.union({"video"}) - {"image"}
+    test_attention_slicing = False
+
+    # No `output_type`: this pipeline-specific set is used instead of the
+    # `PipelineTesterMixin.required_optional_params` default.
+    required_optional_params = frozenset(
+        [
+            "num_inference_steps",
+            "generator",
+            "latents",
+            "return_dict",
+            "callback",
+            "callback_steps",
+        ]
+    )
+
+    def get_dummy_components(self):
+        torch.manual_seed(0)
+        unet = UNet3DConditionModel(
+            block_out_channels=(32, 64, 64, 64),
+            layers_per_block=2,
+            sample_size=32,
+            in_channels=4,
+            out_channels=4,
+            down_block_types=("CrossAttnDownBlock3D", "CrossAttnDownBlock3D", "CrossAttnDownBlock3D", "DownBlock3D"),
+            up_block_types=("UpBlock3D", "CrossAttnUpBlock3D", "CrossAttnUpBlock3D", "CrossAttnUpBlock3D"),
+            cross_attention_dim=32,
+            attention_head_dim=4,
+        )
+        scheduler = DDIMScheduler(
+            beta_start=0.00085,
+            beta_end=0.012,
+            beta_schedule="scaled_linear",
+            clip_sample=True,
+            set_alpha_to_one=False,
+        )
+        torch.manual_seed(0)
+        vae = AutoencoderKL(
+            block_out_channels=[32, 64],
+            in_channels=3,
+            out_channels=3,
+            down_block_types=["DownEncoderBlock2D", "DownEncoderBlock2D"],
+            up_block_types=["UpDecoderBlock2D", "UpDecoderBlock2D"],
+            latent_channels=4,
+            sample_size=128,
+        )
+        torch.manual_seed(0)
+        text_encoder_config = CLIPTextConfig(
+            bos_token_id=0,
+            eos_token_id=2,
+            hidden_size=32,
+            intermediate_size=37,
+            layer_norm_eps=1e-05,
+            num_attention_heads=4,
+            num_hidden_layers=5,
+            pad_token_id=1,
+            vocab_size=1000,
+            hidden_act="gelu",
+            projection_dim=512,
+        )
+        text_encoder = CLIPTextModel(text_encoder_config)
+        tokenizer = CLIPTokenizer.from_pretrained("hf-internal-testing/tiny-random-clip")
+
+        components = {
+            "unet": unet,
+            "scheduler": scheduler,
+            "vae": vae,
+            "text_encoder": text_encoder,
+            "tokenizer": tokenizer,
+        }
+        return components
+
+    def get_dummy_inputs(self, device, seed=0):
+        # 3 frames
+        video = floats_tensor((1, 3, 3, 32, 32), rng=random.Random(seed)).to(device)
+
+        if str(device).startswith("mps"):
+            generator = torch.manual_seed(seed)
+        else:
+            generator = torch.Generator(device=device).manual_seed(seed)
+        inputs = {
+            "prompt": "A painting of a squirrel eating a burger",
+            "video": video,
+            "generator": generator,
+            "num_inference_steps": 2,
+            "guidance_scale": 6.0,
+            "output_type": "pt",
+        }
+        return inputs
+
+    def test_text_to_video_default_case(self):
+        device = "cpu"  # ensure determinism for the device-dependent torch.Generator
+        components = self.get_dummy_components()
+        sd_pipe = VideoToVideoSDPipeline(**components)
+        sd_pipe = sd_pipe.to(device)
+        sd_pipe.set_progress_bar_config(disable=None)
+
+        inputs = self.get_dummy_inputs(device)
+        inputs["output_type"] = "np"
+        frames = sd_pipe(**inputs).frames
+        image_slice = frames[0][-3:, -3:, -1]
+
+        assert frames[0].shape == (32, 32, 3)
+        expected_slice = np.array([106, 117, 113, 174, 137, 112, 148, 151, 131])
+
+        assert np.abs(image_slice.flatten() - expected_slice).max() < 1e-2
+
+    def test_save_load_optional_components(self):
+        super().test_save_load_optional_components(expected_max_difference=0.001)
+
+    @unittest.skipIf(
+        torch_device != "cuda" or not is_xformers_available(),
+        reason="XFormers attention is only available with CUDA and `xformers` installed",
+    )
+    def test_xformers_attention_forwardGenerator_pass(self):
+        self._test_xformers_attention_forwardGenerator_pass(test_mean_pixel_difference=False, expected_max_diff=5e-3)
+
+    # TODO(sayakpaul): figure out batching for this pipeline, then enable this test.
+    @unittest.skip(reason="Batching needs to be properly figured out first for this pipeline.")
+    def test_inference_batch_consistent(self):
+        pass
+
+    # TODO(sayakpaul): figure out batching for this pipeline, then enable this test.
+    @unittest.skip(reason="Batching needs to be properly figured out first for this pipeline.")
+    def test_inference_batch_single_identical(self):
+        pass
+
+    @unittest.skip(reason="`num_images_per_prompt` argument is not supported for this pipeline.")
+    def test_num_images_per_prompt(self):
+        pass
+
+    def test_progress_bar(self):
+        return super().test_progress_bar()
+
+
+@slow
+@unittest.skipIf(torch_device != "cuda", reason="Requires a CUDA device: the inputs are moved to `cuda`.")
+class VideoToVideoSDPipelineSlowTests(unittest.TestCase):
+    def test_two_step_model(self):
+        pipe = VideoToVideoSDPipeline.from_pretrained("cerspense/zeroscope_v2_XL", torch_dtype=torch.float16)
+        pipe.enable_model_cpu_offload()
+
+        # 10 frames
+        generator = torch.Generator(device="cpu").manual_seed(0)
+        video = torch.randn((1, 10, 3, 1024, 576), generator=generator)
+        video = video.to("cuda")
+
+        prompt = "Spiderman is surfing"
+
+        video_frames = pipe(prompt, video=video, generator=generator, num_inference_steps=3, output_type="pt").frames
+
+        expected_array = np.array([-1.0458984, -1.1279297, -0.9663086, -0.91503906, -0.75097656])
+        assert np.abs(video_frames.cpu().numpy()[0, 0, 0, 0, -5:] - expected_array).sum() < 1e-2
diff --git a/utils/fetch_torch_cuda_pipeline_test_matrix.py b/utils/fetch_torch_cuda_pipeline_test_matrix.py
index d665ed81b756..41a9c1c8270d 100644
--- a/utils/fetch_torch_cuda_pipeline_test_matrix.py
+++ b/utils/fetch_torch_cuda_pipeline_test_matrix.py
@@ -1,11 +1,15 @@
 import json
+import logging
 import os
 from collections import defaultdict
+from pathlib import Path
 
 from huggingface_hub import HfApi, ModelFilter
 
 import diffusers
 
+
+PATH_TO_REPO = Path(__file__).parent.parent.resolve()
 ALWAYS_TEST_PIPELINE_MODULES = [
     "controlnet",
     "stable_diffusion",
@@ -17,8 +21,9 @@
     "text_to_video_synthesis",
     "wuerstchen",
 ]
-PIPELINE_USAGE_CUTOFF = int(os.getenv("PIPELINE_USAGE_CUTOFF", 10000))
+PIPELINE_USAGE_CUTOFF = int(os.getenv("PIPELINE_USAGE_CUTOFF", 50000))
 
+logger = logging.getLogger(__name__)
 api = HfApi()
 filter = ModelFilter(library="diffusers")
 
@@ -57,7 +62,11 @@ def fetch_pipeline_objects():
 
 
 def fetch_pipeline_modules_to_test():
-    pipeline_objects = fetch_pipeline_objects()
+    try:
+        pipeline_objects = fetch_pipeline_objects()
+    except Exception as e:  # any fetch failure is logged, then re-raised with its cause chained
+        logger.error(e)
+        raise RuntimeError("Unable to fetch model list from HuggingFace Hub.") from e
 
     test_modules = []
     for pipeline_name in pipeline_objects:
@@ -76,6 +85,12 @@ def main():
     test_modules = list(set(test_modules))
     print(json.dumps(test_modules))
 
+    save_path = PATH_TO_REPO / "reports"  # PATH_TO_REPO is already a pathlib.Path
+    save_path.mkdir(parents=True, exist_ok=True)
+
+    with open(save_path / "test-pipelines.json", "w") as f:
+        json.dump({"pipeline_test_modules": test_modules}, f)
+
 
 if __name__ == "__main__":
     main()

From 9c0f65feb172433bfd59a7732e6732f747abad8d Mon Sep 17 00:00:00 2001
From: Dhruv Nair <dhruv.nair@gmail.com>
Date: Thu, 21 Sep 2023 06:26:56 +0000
Subject: [PATCH 07/22] rename tests to match modules

---
 tests/pipelines/kandinsky_v22/__init__.py     |   0
 .../pipelines/kandinsky_v22/test_kandinsky.py | 271 -------------
 .../kandinsky_v22/test_kandinsky_combined.py  | 365 ------------------
 .../test_kandinsky_controlnet.py              | 282 --------------
 .../test_kandinsky_controlnet_img2img.py      | 303 ---------------
 .../kandinsky_v22/test_kandinsky_img2img.py   | 295 --------------
 .../kandinsky_v22/test_kandinsky_inpaint.py   | 314 ---------------
 .../kandinsky_v22/test_kandinsky_prior.py     | 237 ------------
 .../test_kandinsky_prior_emb2emb.py           | 247 ------------
 tests/pipelines/text_to_video/__init__.py     |   0
 .../text_to_video/test_text_to_video.py       | 195 ----------
 .../text_to_video/test_text_to_video_zero.py  |  42 --
 .../text_to_video/test_video_to_video.py      | 204 ----------
 13 files changed, 2755 deletions(-)
 delete mode 100644 tests/pipelines/kandinsky_v22/__init__.py
 delete mode 100644 tests/pipelines/kandinsky_v22/test_kandinsky.py
 delete mode 100644 tests/pipelines/kandinsky_v22/test_kandinsky_combined.py
 delete mode 100644 tests/pipelines/kandinsky_v22/test_kandinsky_controlnet.py
 delete mode 100644 tests/pipelines/kandinsky_v22/test_kandinsky_controlnet_img2img.py
 delete mode 100644 tests/pipelines/kandinsky_v22/test_kandinsky_img2img.py
 delete mode 100644 tests/pipelines/kandinsky_v22/test_kandinsky_inpaint.py
 delete mode 100644 tests/pipelines/kandinsky_v22/test_kandinsky_prior.py
 delete mode 100644 tests/pipelines/kandinsky_v22/test_kandinsky_prior_emb2emb.py
 delete mode 100644 tests/pipelines/text_to_video/__init__.py
 delete mode 100644 tests/pipelines/text_to_video/test_text_to_video.py
 delete mode 100644 tests/pipelines/text_to_video/test_text_to_video_zero.py
 delete mode 100644 tests/pipelines/text_to_video/test_video_to_video.py

diff --git a/tests/pipelines/kandinsky_v22/__init__.py b/tests/pipelines/kandinsky_v22/__init__.py
deleted file mode 100644
index e69de29bb2d1..000000000000
diff --git a/tests/pipelines/kandinsky_v22/test_kandinsky.py b/tests/pipelines/kandinsky_v22/test_kandinsky.py
deleted file mode 100644
index 65dbf0a708eb..000000000000
--- a/tests/pipelines/kandinsky_v22/test_kandinsky.py
+++ /dev/null
@@ -1,271 +0,0 @@
-# coding=utf-8
-# Copyright 2023 HuggingFace Inc.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-import gc
-import random
-import unittest
-
-import numpy as np
-import torch
-
-from diffusers import DDIMScheduler, KandinskyV22Pipeline, KandinskyV22PriorPipeline, UNet2DConditionModel, VQModel
-from diffusers.utils.testing_utils import (
-    enable_full_determinism,
-    floats_tensor,
-    load_numpy,
-    require_torch_gpu,
-    slow,
-    torch_device,
-)
-
-from ..test_pipelines_common import PipelineTesterMixin, assert_mean_pixel_difference
-
-
-enable_full_determinism()
-
-
-class Dummies:
-    @property
-    def text_embedder_hidden_size(self):
-        return 32
-
-    @property
-    def time_input_dim(self):
-        return 32
-
-    @property
-    def block_out_channels_0(self):
-        return self.time_input_dim
-
-    @property
-    def time_embed_dim(self):
-        return self.time_input_dim * 4
-
-    @property
-    def cross_attention_dim(self):
-        return 32
-
-    @property
-    def dummy_unet(self):
-        torch.manual_seed(0)
-
-        model_kwargs = {
-            "in_channels": 4,
-            # Out channels is double in channels because predicts mean and variance
-            "out_channels": 8,
-            "addition_embed_type": "image",
-            "down_block_types": ("ResnetDownsampleBlock2D", "SimpleCrossAttnDownBlock2D"),
-            "up_block_types": ("SimpleCrossAttnUpBlock2D", "ResnetUpsampleBlock2D"),
-            "mid_block_type": "UNetMidBlock2DSimpleCrossAttn",
-            "block_out_channels": (self.block_out_channels_0, self.block_out_channels_0 * 2),
-            "layers_per_block": 1,
-            "encoder_hid_dim": self.text_embedder_hidden_size,
-            "encoder_hid_dim_type": "image_proj",
-            "cross_attention_dim": self.cross_attention_dim,
-            "attention_head_dim": 4,
-            "resnet_time_scale_shift": "scale_shift",
-            "class_embed_type": None,
-        }
-
-        model = UNet2DConditionModel(**model_kwargs)
-        return model
-
-    @property
-    def dummy_movq_kwargs(self):
-        return {
-            "block_out_channels": [32, 64],
-            "down_block_types": ["DownEncoderBlock2D", "AttnDownEncoderBlock2D"],
-            "in_channels": 3,
-            "latent_channels": 4,
-            "layers_per_block": 1,
-            "norm_num_groups": 8,
-            "norm_type": "spatial",
-            "num_vq_embeddings": 12,
-            "out_channels": 3,
-            "up_block_types": [
-                "AttnUpDecoderBlock2D",
-                "UpDecoderBlock2D",
-            ],
-            "vq_embed_dim": 4,
-        }
-
-    @property
-    def dummy_movq(self):
-        torch.manual_seed(0)
-        model = VQModel(**self.dummy_movq_kwargs)
-        return model
-
-    def get_dummy_components(self):
-        unet = self.dummy_unet
-        movq = self.dummy_movq
-
-        scheduler = DDIMScheduler(
-            num_train_timesteps=1000,
-            beta_schedule="linear",
-            beta_start=0.00085,
-            beta_end=0.012,
-            clip_sample=False,
-            set_alpha_to_one=False,
-            steps_offset=1,
-            prediction_type="epsilon",
-            thresholding=False,
-        )
-
-        components = {
-            "unet": unet,
-            "scheduler": scheduler,
-            "movq": movq,
-        }
-        return components
-
-    def get_dummy_inputs(self, device, seed=0):
-        image_embeds = floats_tensor((1, self.text_embedder_hidden_size), rng=random.Random(seed)).to(device)
-        negative_image_embeds = floats_tensor((1, self.text_embedder_hidden_size), rng=random.Random(seed + 1)).to(
-            device
-        )
-        if str(device).startswith("mps"):
-            generator = torch.manual_seed(seed)
-        else:
-            generator = torch.Generator(device=device).manual_seed(seed)
-        inputs = {
-            "image_embeds": image_embeds,
-            "negative_image_embeds": negative_image_embeds,
-            "generator": generator,
-            "height": 64,
-            "width": 64,
-            "guidance_scale": 4.0,
-            "num_inference_steps": 2,
-            "output_type": "np",
-        }
-        return inputs
-
-
-class KandinskyV22PipelineFastTests(PipelineTesterMixin, unittest.TestCase):
-    pipeline_class = KandinskyV22Pipeline
-    params = [
-        "image_embeds",
-        "negative_image_embeds",
-    ]
-    batch_params = ["image_embeds", "negative_image_embeds"]
-    required_optional_params = [
-        "generator",
-        "height",
-        "width",
-        "latents",
-        "guidance_scale",
-        "num_inference_steps",
-        "return_dict",
-        "guidance_scale",
-        "num_images_per_prompt",
-        "output_type",
-        "return_dict",
-    ]
-    test_xformers_attention = False
-
-    def get_dummy_inputs(self, device, seed=0):
-        dummies = Dummies()
-        return dummies.get_dummy_inputs(device=device, seed=seed)
-
-    def get_dummy_components(self):
-        dummies = Dummies()
-        return dummies.get_dummy_components()
-
-    def test_kandinsky(self):
-        device = "cpu"
-
-        components = self.get_dummy_components()
-
-        pipe = self.pipeline_class(**components)
-        pipe = pipe.to(device)
-
-        pipe.set_progress_bar_config(disable=None)
-
-        output = pipe(**self.get_dummy_inputs(device))
-        image = output.images
-
-        image_from_tuple = pipe(
-            **self.get_dummy_inputs(device),
-            return_dict=False,
-        )[0]
-
-        image_slice = image[0, -3:, -3:, -1]
-        image_from_tuple_slice = image_from_tuple[0, -3:, -3:, -1]
-
-        assert image.shape == (1, 64, 64, 3)
-
-        expected_slice = np.array([0.3420, 0.9505, 0.3919, 1.0000, 0.5188, 0.3109, 0.6139, 0.5624, 0.6811])
-
-        assert (
-            np.abs(image_slice.flatten() - expected_slice).max() < 1e-2
-        ), f" expected_slice {expected_slice}, but got {image_slice.flatten()}"
-
-        assert (
-            np.abs(image_from_tuple_slice.flatten() - expected_slice).max() < 1e-2
-        ), f" expected_slice {expected_slice}, but got {image_from_tuple_slice.flatten()}"
-
-    def test_float16_inference(self):
-        super().test_float16_inference(expected_max_diff=1e-1)
-
-
-@slow
-@require_torch_gpu
-class KandinskyV22PipelineIntegrationTests(unittest.TestCase):
-    def tearDown(self):
-        # clean up the VRAM after each test
-        super().tearDown()
-        gc.collect()
-        torch.cuda.empty_cache()
-
-    def test_kandinsky_text2img(self):
-        expected_image = load_numpy(
-            "https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main"
-            "/kandinskyv22/kandinskyv22_text2img_cat_fp16.npy"
-        )
-
-        pipe_prior = KandinskyV22PriorPipeline.from_pretrained(
-            "kandinsky-community/kandinsky-2-2-prior", torch_dtype=torch.float16
-        )
-        pipe_prior.to(torch_device)
-
-        pipeline = KandinskyV22Pipeline.from_pretrained(
-            "kandinsky-community/kandinsky-2-2-decoder", torch_dtype=torch.float16
-        )
-        pipeline = pipeline.to(torch_device)
-        pipeline.set_progress_bar_config(disable=None)
-
-        prompt = "red cat, 4k photo"
-
-        generator = torch.Generator(device="cuda").manual_seed(0)
-        image_emb, zero_image_emb = pipe_prior(
-            prompt,
-            generator=generator,
-            num_inference_steps=5,
-            negative_prompt="",
-        ).to_tuple()
-
-        generator = torch.Generator(device="cuda").manual_seed(0)
-        output = pipeline(
-            image_embeds=image_emb,
-            negative_image_embeds=zero_image_emb,
-            generator=generator,
-            num_inference_steps=100,
-            output_type="np",
-        )
-
-        image = output.images[0]
-
-        assert image.shape == (512, 512, 3)
-
-        assert_mean_pixel_difference(image, expected_image)
diff --git a/tests/pipelines/kandinsky_v22/test_kandinsky_combined.py b/tests/pipelines/kandinsky_v22/test_kandinsky_combined.py
deleted file mode 100644
index b90f59cc4966..000000000000
--- a/tests/pipelines/kandinsky_v22/test_kandinsky_combined.py
+++ /dev/null
@@ -1,365 +0,0 @@
-# coding=utf-8
-# Copyright 2023 HuggingFace Inc.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-import unittest
-
-import numpy as np
-
-from diffusers import (
-    KandinskyV22CombinedPipeline,
-    KandinskyV22Img2ImgCombinedPipeline,
-    KandinskyV22InpaintCombinedPipeline,
-)
-from diffusers.utils.testing_utils import enable_full_determinism, require_torch_gpu, torch_device
-
-from ..test_pipelines_common import PipelineTesterMixin
-from .test_kandinsky import Dummies
-from .test_kandinsky_img2img import Dummies as Img2ImgDummies
-from .test_kandinsky_inpaint import Dummies as InpaintDummies
-from .test_kandinsky_prior import Dummies as PriorDummies
-
-
-enable_full_determinism()
-
-
-class KandinskyV22PipelineCombinedFastTests(PipelineTesterMixin, unittest.TestCase):
-    pipeline_class = KandinskyV22CombinedPipeline
-    params = [
-        "prompt",
-    ]
-    batch_params = ["prompt", "negative_prompt"]
-    required_optional_params = [
-        "generator",
-        "height",
-        "width",
-        "latents",
-        "guidance_scale",
-        "negative_prompt",
-        "num_inference_steps",
-        "return_dict",
-        "guidance_scale",
-        "num_images_per_prompt",
-        "output_type",
-        "return_dict",
-    ]
-    test_xformers_attention = True
-
-    def get_dummy_components(self):
-        dummy = Dummies()
-        prior_dummy = PriorDummies()
-        components = dummy.get_dummy_components()
-
-        components.update({f"prior_{k}": v for k, v in prior_dummy.get_dummy_components().items()})
-        return components
-
-    def get_dummy_inputs(self, device, seed=0):
-        prior_dummy = PriorDummies()
-        inputs = prior_dummy.get_dummy_inputs(device=device, seed=seed)
-        inputs.update(
-            {
-                "height": 64,
-                "width": 64,
-            }
-        )
-        return inputs
-
-    def test_kandinsky(self):
-        device = "cpu"
-
-        components = self.get_dummy_components()
-
-        pipe = self.pipeline_class(**components)
-        pipe = pipe.to(device)
-
-        pipe.set_progress_bar_config(disable=None)
-
-        output = pipe(**self.get_dummy_inputs(device))
-        image = output.images
-
-        image_from_tuple = pipe(
-            **self.get_dummy_inputs(device),
-            return_dict=False,
-        )[0]
-
-        image_slice = image[0, -3:, -3:, -1]
-        image_from_tuple_slice = image_from_tuple[0, -3:, -3:, -1]
-
-        assert image.shape == (1, 64, 64, 3)
-
-        expected_slice = np.array([0.3013, 0.0471, 0.5176, 0.1817, 0.2566, 0.7076, 0.6712, 0.4421, 0.7503])
-
-        assert (
-            np.abs(image_slice.flatten() - expected_slice).max() < 1e-2
-        ), f" expected_slice {expected_slice}, but got {image_slice.flatten()}"
-        assert (
-            np.abs(image_from_tuple_slice.flatten() - expected_slice).max() < 1e-2
-        ), f" expected_slice {expected_slice}, but got {image_from_tuple_slice.flatten()}"
-
-    @require_torch_gpu
-    def test_offloads(self):
-        pipes = []
-        components = self.get_dummy_components()
-        sd_pipe = self.pipeline_class(**components).to(torch_device)
-        pipes.append(sd_pipe)
-
-        components = self.get_dummy_components()
-        sd_pipe = self.pipeline_class(**components)
-        sd_pipe.enable_model_cpu_offload()
-        pipes.append(sd_pipe)
-
-        components = self.get_dummy_components()
-        sd_pipe = self.pipeline_class(**components)
-        sd_pipe.enable_sequential_cpu_offload()
-        pipes.append(sd_pipe)
-
-        image_slices = []
-        for pipe in pipes:
-            inputs = self.get_dummy_inputs(torch_device)
-            image = pipe(**inputs).images
-
-            image_slices.append(image[0, -3:, -3:, -1].flatten())
-
-        assert np.abs(image_slices[0] - image_slices[1]).max() < 1e-3
-        assert np.abs(image_slices[0] - image_slices[2]).max() < 1e-3
-
-    def test_inference_batch_single_identical(self):
-        super().test_inference_batch_single_identical(expected_max_diff=1e-2)
-
-    def test_float16_inference(self):
-        super().test_float16_inference(expected_max_diff=1e-1)
-
-    def test_dict_tuple_outputs_equivalent(self):
-        super().test_dict_tuple_outputs_equivalent(expected_max_difference=5e-4)
-
-    def test_model_cpu_offload_forward_pass(self):
-        super().test_model_cpu_offload_forward_pass(expected_max_diff=5e-4)
-
-
-class KandinskyV22PipelineImg2ImgCombinedFastTests(PipelineTesterMixin, unittest.TestCase):
-    pipeline_class = KandinskyV22Img2ImgCombinedPipeline
-    params = ["prompt", "image"]
-    batch_params = ["prompt", "negative_prompt", "image"]
-    required_optional_params = [
-        "generator",
-        "height",
-        "width",
-        "latents",
-        "guidance_scale",
-        "negative_prompt",
-        "num_inference_steps",
-        "return_dict",
-        "guidance_scale",
-        "num_images_per_prompt",
-        "output_type",
-        "return_dict",
-    ]
-    test_xformers_attention = False
-
-    def get_dummy_components(self):
-        dummy = Img2ImgDummies()
-        prior_dummy = PriorDummies()
-        components = dummy.get_dummy_components()
-
-        components.update({f"prior_{k}": v for k, v in prior_dummy.get_dummy_components().items()})
-        return components
-
-    def get_dummy_inputs(self, device, seed=0):
-        prior_dummy = PriorDummies()
-        dummy = Img2ImgDummies()
-        inputs = prior_dummy.get_dummy_inputs(device=device, seed=seed)
-        inputs.update(dummy.get_dummy_inputs(device=device, seed=seed))
-        inputs.pop("image_embeds")
-        inputs.pop("negative_image_embeds")
-        return inputs
-
-    def test_kandinsky(self):
-        device = "cpu"
-
-        components = self.get_dummy_components()
-
-        pipe = self.pipeline_class(**components)
-        pipe = pipe.to(device)
-
-        pipe.set_progress_bar_config(disable=None)
-
-        output = pipe(**self.get_dummy_inputs(device))
-        image = output.images
-
-        image_from_tuple = pipe(
-            **self.get_dummy_inputs(device),
-            return_dict=False,
-        )[0]
-
-        image_slice = image[0, -3:, -3:, -1]
-        image_from_tuple_slice = image_from_tuple[0, -3:, -3:, -1]
-
-        assert image.shape == (1, 64, 64, 3)
-
-        expected_slice = np.array([0.4353, 0.4710, 0.5128, 0.4806, 0.5054, 0.5348, 0.5224, 0.4603, 0.5025])
-
-        assert (
-            np.abs(image_slice.flatten() - expected_slice).max() < 1e-2
-        ), f" expected_slice {expected_slice}, but got {image_slice.flatten()}"
-        assert (
-            np.abs(image_from_tuple_slice.flatten() - expected_slice).max() < 1e-2
-        ), f" expected_slice {expected_slice}, but got {image_from_tuple_slice.flatten()}"
-
-    @require_torch_gpu
-    def test_offloads(self):
-        pipes = []
-        components = self.get_dummy_components()
-        sd_pipe = self.pipeline_class(**components).to(torch_device)
-        pipes.append(sd_pipe)
-
-        components = self.get_dummy_components()
-        sd_pipe = self.pipeline_class(**components)
-        sd_pipe.enable_model_cpu_offload()
-        pipes.append(sd_pipe)
-
-        components = self.get_dummy_components()
-        sd_pipe = self.pipeline_class(**components)
-        sd_pipe.enable_sequential_cpu_offload()
-        pipes.append(sd_pipe)
-
-        image_slices = []
-        for pipe in pipes:
-            inputs = self.get_dummy_inputs(torch_device)
-            image = pipe(**inputs).images
-
-            image_slices.append(image[0, -3:, -3:, -1].flatten())
-
-        assert np.abs(image_slices[0] - image_slices[1]).max() < 1e-3
-        assert np.abs(image_slices[0] - image_slices[2]).max() < 1e-3
-
-    def test_inference_batch_single_identical(self):
-        super().test_inference_batch_single_identical(expected_max_diff=1e-2)
-
-    def test_float16_inference(self):
-        super().test_float16_inference(expected_max_diff=1e-1)
-
-    def test_dict_tuple_outputs_equivalent(self):
-        super().test_dict_tuple_outputs_equivalent(expected_max_difference=5e-4)
-
-    def test_model_cpu_offload_forward_pass(self):
-        super().test_model_cpu_offload_forward_pass(expected_max_diff=5e-4)
-
-
-class KandinskyV22PipelineInpaintCombinedFastTests(PipelineTesterMixin, unittest.TestCase):
-    pipeline_class = KandinskyV22InpaintCombinedPipeline
-    params = ["prompt", "image", "mask_image"]
-    batch_params = ["prompt", "negative_prompt", "image", "mask_image"]
-    required_optional_params = [
-        "generator",
-        "height",
-        "width",
-        "latents",
-        "guidance_scale",
-        "negative_prompt",
-        "num_inference_steps",
-        "return_dict",
-        "guidance_scale",
-        "num_images_per_prompt",
-        "output_type",
-        "return_dict",
-    ]
-    test_xformers_attention = False
-
-    def get_dummy_components(self):
-        dummy = InpaintDummies()
-        prior_dummy = PriorDummies()
-        components = dummy.get_dummy_components()
-
-        components.update({f"prior_{k}": v for k, v in prior_dummy.get_dummy_components().items()})
-        return components
-
-    def get_dummy_inputs(self, device, seed=0):
-        prior_dummy = PriorDummies()
-        dummy = InpaintDummies()
-        inputs = prior_dummy.get_dummy_inputs(device=device, seed=seed)
-        inputs.update(dummy.get_dummy_inputs(device=device, seed=seed))
-        inputs.pop("image_embeds")
-        inputs.pop("negative_image_embeds")
-        return inputs
-
-    def test_kandinsky(self):
-        device = "cpu"
-
-        components = self.get_dummy_components()
-
-        pipe = self.pipeline_class(**components)
-        pipe = pipe.to(device)
-
-        pipe.set_progress_bar_config(disable=None)
-
-        output = pipe(**self.get_dummy_inputs(device))
-        image = output.images
-
-        image_from_tuple = pipe(
-            **self.get_dummy_inputs(device),
-            return_dict=False,
-        )[0]
-
-        image_slice = image[0, -3:, -3:, -1]
-        image_from_tuple_slice = image_from_tuple[0, -3:, -3:, -1]
-
-        assert image.shape == (1, 64, 64, 3)
-
-        expected_slice = np.array([0.5039, 0.4926, 0.4898, 0.4978, 0.4838, 0.4942, 0.4738, 0.4702, 0.4816])
-
-        assert (
-            np.abs(image_slice.flatten() - expected_slice).max() < 1e-2
-        ), f" expected_slice {expected_slice}, but got {image_slice.flatten()}"
-        assert (
-            np.abs(image_from_tuple_slice.flatten() - expected_slice).max() < 1e-2
-        ), f" expected_slice {expected_slice}, but got {image_from_tuple_slice.flatten()}"
-
-    @require_torch_gpu
-    def test_offloads(self):
-        pipes = []
-        components = self.get_dummy_components()
-        sd_pipe = self.pipeline_class(**components).to(torch_device)
-        pipes.append(sd_pipe)
-
-        components = self.get_dummy_components()
-        sd_pipe = self.pipeline_class(**components)
-        sd_pipe.enable_model_cpu_offload()
-        pipes.append(sd_pipe)
-
-        components = self.get_dummy_components()
-        sd_pipe = self.pipeline_class(**components)
-        sd_pipe.enable_sequential_cpu_offload()
-        pipes.append(sd_pipe)
-
-        image_slices = []
-        for pipe in pipes:
-            inputs = self.get_dummy_inputs(torch_device)
-            image = pipe(**inputs).images
-
-            image_slices.append(image[0, -3:, -3:, -1].flatten())
-
-        assert np.abs(image_slices[0] - image_slices[1]).max() < 1e-3
-        assert np.abs(image_slices[0] - image_slices[2]).max() < 1e-3
-
-    def test_inference_batch_single_identical(self):
-        super().test_inference_batch_single_identical(expected_max_diff=1e-2)
-
-    def test_float16_inference(self):
-        super().test_float16_inference(expected_max_diff=5e-1)
-
-    def test_dict_tuple_outputs_equivalent(self):
-        super().test_dict_tuple_outputs_equivalent(expected_max_difference=5e-4)
-
-    def test_model_cpu_offload_forward_pass(self):
-        super().test_model_cpu_offload_forward_pass(expected_max_diff=5e-4)
diff --git a/tests/pipelines/kandinsky_v22/test_kandinsky_controlnet.py b/tests/pipelines/kandinsky_v22/test_kandinsky_controlnet.py
deleted file mode 100644
index cec209c7cfec..000000000000
--- a/tests/pipelines/kandinsky_v22/test_kandinsky_controlnet.py
+++ /dev/null
@@ -1,282 +0,0 @@
-# coding=utf-8
-# Copyright 2023 HuggingFace Inc.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-import gc
-import random
-import unittest
-
-import numpy as np
-import torch
-
-from diffusers import (
-    DDIMScheduler,
-    KandinskyV22ControlnetPipeline,
-    KandinskyV22PriorPipeline,
-    UNet2DConditionModel,
-    VQModel,
-)
-from diffusers.utils.testing_utils import (
-    enable_full_determinism,
-    floats_tensor,
-    load_image,
-    load_numpy,
-    nightly,
-    require_torch_gpu,
-    torch_device,
-)
-
-from ..test_pipelines_common import PipelineTesterMixin, assert_mean_pixel_difference
-
-
-enable_full_determinism()
-
-
-class KandinskyV22ControlnetPipelineFastTests(PipelineTesterMixin, unittest.TestCase):
-    pipeline_class = KandinskyV22ControlnetPipeline
-    params = ["image_embeds", "negative_image_embeds", "hint"]
-    batch_params = ["image_embeds", "negative_image_embeds", "hint"]
-    required_optional_params = [
-        "generator",
-        "height",
-        "width",
-        "latents",
-        "guidance_scale",
-        "num_inference_steps",
-        "return_dict",
-        "guidance_scale",
-        "num_images_per_prompt",
-        "output_type",
-        "return_dict",
-    ]
-    test_xformers_attention = False
-
-    @property
-    def text_embedder_hidden_size(self):
-        return 32
-
-    @property
-    def time_input_dim(self):
-        return 32
-
-    @property
-    def block_out_channels_0(self):
-        return self.time_input_dim
-
-    @property
-    def time_embed_dim(self):
-        return self.time_input_dim * 4
-
-    @property
-    def cross_attention_dim(self):
-        return 100
-
-    @property
-    def dummy_unet(self):
-        torch.manual_seed(0)
-
-        model_kwargs = {
-            "in_channels": 8,
-            # Out channels is double in channels because predicts mean and variance
-            "out_channels": 8,
-            "addition_embed_type": "image_hint",
-            "down_block_types": ("ResnetDownsampleBlock2D", "SimpleCrossAttnDownBlock2D"),
-            "up_block_types": ("SimpleCrossAttnUpBlock2D", "ResnetUpsampleBlock2D"),
-            "mid_block_type": "UNetMidBlock2DSimpleCrossAttn",
-            "block_out_channels": (self.block_out_channels_0, self.block_out_channels_0 * 2),
-            "layers_per_block": 1,
-            "encoder_hid_dim": self.text_embedder_hidden_size,
-            "encoder_hid_dim_type": "image_proj",
-            "cross_attention_dim": self.cross_attention_dim,
-            "attention_head_dim": 4,
-            "resnet_time_scale_shift": "scale_shift",
-            "class_embed_type": None,
-        }
-
-        model = UNet2DConditionModel(**model_kwargs)
-        return model
-
-    @property
-    def dummy_movq_kwargs(self):
-        return {
-            "block_out_channels": [32, 32, 64, 64],
-            "down_block_types": [
-                "DownEncoderBlock2D",
-                "DownEncoderBlock2D",
-                "DownEncoderBlock2D",
-                "AttnDownEncoderBlock2D",
-            ],
-            "in_channels": 3,
-            "latent_channels": 4,
-            "layers_per_block": 1,
-            "norm_num_groups": 8,
-            "norm_type": "spatial",
-            "num_vq_embeddings": 12,
-            "out_channels": 3,
-            "up_block_types": ["AttnUpDecoderBlock2D", "UpDecoderBlock2D", "UpDecoderBlock2D", "UpDecoderBlock2D"],
-            "vq_embed_dim": 4,
-        }
-
-    @property
-    def dummy_movq(self):
-        torch.manual_seed(0)
-        model = VQModel(**self.dummy_movq_kwargs)
-        return model
-
-    def get_dummy_components(self):
-        unet = self.dummy_unet
-        movq = self.dummy_movq
-
-        scheduler = DDIMScheduler(
-            num_train_timesteps=1000,
-            beta_schedule="linear",
-            beta_start=0.00085,
-            beta_end=0.012,
-            clip_sample=False,
-            set_alpha_to_one=False,
-            steps_offset=1,
-            prediction_type="epsilon",
-            thresholding=False,
-        )
-
-        components = {
-            "unet": unet,
-            "scheduler": scheduler,
-            "movq": movq,
-        }
-        return components
-
-    def get_dummy_inputs(self, device, seed=0):
-        image_embeds = floats_tensor((1, self.text_embedder_hidden_size), rng=random.Random(seed)).to(device)
-        negative_image_embeds = floats_tensor((1, self.text_embedder_hidden_size), rng=random.Random(seed + 1)).to(
-            device
-        )
-
-        # create hint
-        hint = floats_tensor((1, 3, 64, 64), rng=random.Random(seed)).to(device)
-
-        if str(device).startswith("mps"):
-            generator = torch.manual_seed(seed)
-        else:
-            generator = torch.Generator(device=device).manual_seed(seed)
-        inputs = {
-            "image_embeds": image_embeds,
-            "negative_image_embeds": negative_image_embeds,
-            "hint": hint,
-            "generator": generator,
-            "height": 64,
-            "width": 64,
-            "guidance_scale": 4.0,
-            "num_inference_steps": 2,
-            "output_type": "np",
-        }
-        return inputs
-
-    def test_kandinsky_controlnet(self):
-        device = "cpu"
-
-        components = self.get_dummy_components()
-
-        pipe = self.pipeline_class(**components)
-        pipe = pipe.to(device)
-
-        pipe.set_progress_bar_config(disable=None)
-
-        output = pipe(**self.get_dummy_inputs(device))
-        image = output.images
-
-        image_from_tuple = pipe(
-            **self.get_dummy_inputs(device),
-            return_dict=False,
-        )[0]
-
-        image_slice = image[0, -3:, -3:, -1]
-        image_from_tuple_slice = image_from_tuple[0, -3:, -3:, -1]
-
-        assert image.shape == (1, 64, 64, 3)
-
-        expected_slice = np.array(
-            [0.6959826, 0.868279, 0.7558092, 0.68769467, 0.85805804, 0.65977496, 0.44885302, 0.5959111, 0.4251595]
-        )
-
-        assert (
-            np.abs(image_slice.flatten() - expected_slice).max() < 1e-2
-        ), f" expected_slice {expected_slice}, but got {image_slice.flatten()}"
-
-        assert (
-            np.abs(image_from_tuple_slice.flatten() - expected_slice).max() < 1e-2
-        ), f" expected_slice {expected_slice}, but got {image_from_tuple_slice.flatten()}"
-
-    def test_float16_inference(self):
-        super().test_float16_inference(expected_max_diff=1e-1)
-
-
-@nightly
-@require_torch_gpu
-class KandinskyV22ControlnetPipelineIntegrationTests(unittest.TestCase):
-    def tearDown(self):
-        # clean up the VRAM after each test
-        super().tearDown()
-        gc.collect()
-        torch.cuda.empty_cache()
-
-    def test_kandinsky_controlnet(self):
-        expected_image = load_numpy(
-            "https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main"
-            "/kandinskyv22/kandinskyv22_controlnet_robotcat_fp16.npy"
-        )
-
-        hint = load_image(
-            "https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main"
-            "/kandinskyv22/hint_image_cat.png"
-        )
-        hint = torch.from_numpy(np.array(hint)).float() / 255.0
-        hint = hint.permute(2, 0, 1).unsqueeze(0)
-
-        pipe_prior = KandinskyV22PriorPipeline.from_pretrained(
-            "kandinsky-community/kandinsky-2-2-prior", torch_dtype=torch.float16
-        )
-        pipe_prior.to(torch_device)
-
-        pipeline = KandinskyV22ControlnetPipeline.from_pretrained(
-            "kandinsky-community/kandinsky-2-2-controlnet-depth", torch_dtype=torch.float16
-        )
-        pipeline = pipeline.to(torch_device)
-        pipeline.set_progress_bar_config(disable=None)
-
-        prompt = "A robot, 4k photo"
-
-        generator = torch.Generator(device="cuda").manual_seed(0)
-        image_emb, zero_image_emb = pipe_prior(
-            prompt,
-            generator=generator,
-            num_inference_steps=5,
-            negative_prompt="",
-        ).to_tuple()
-
-        generator = torch.Generator(device="cuda").manual_seed(0)
-        output = pipeline(
-            image_embeds=image_emb,
-            negative_image_embeds=zero_image_emb,
-            hint=hint,
-            generator=generator,
-            num_inference_steps=100,
-            output_type="np",
-        )
-
-        image = output.images[0]
-
-        assert image.shape == (512, 512, 3)
-
-        assert_mean_pixel_difference(image, expected_image)
diff --git a/tests/pipelines/kandinsky_v22/test_kandinsky_controlnet_img2img.py b/tests/pipelines/kandinsky_v22/test_kandinsky_controlnet_img2img.py
deleted file mode 100644
index 0c7b99580085..000000000000
--- a/tests/pipelines/kandinsky_v22/test_kandinsky_controlnet_img2img.py
+++ /dev/null
@@ -1,303 +0,0 @@
-# coding=utf-8
-# Copyright 2023 HuggingFace Inc.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-import gc
-import random
-import unittest
-
-import numpy as np
-import torch
-from PIL import Image
-
-from diffusers import (
-    DDIMScheduler,
-    KandinskyV22ControlnetImg2ImgPipeline,
-    KandinskyV22PriorEmb2EmbPipeline,
-    UNet2DConditionModel,
-    VQModel,
-)
-from diffusers.utils.testing_utils import (
-    enable_full_determinism,
-    floats_tensor,
-    load_image,
-    load_numpy,
-    require_torch_gpu,
-    slow,
-    torch_device,
-)
-
-from ..test_pipelines_common import PipelineTesterMixin, assert_mean_pixel_difference
-
-
-enable_full_determinism()
-
-
-class KandinskyV22ControlnetImg2ImgPipelineFastTests(PipelineTesterMixin, unittest.TestCase):
-    pipeline_class = KandinskyV22ControlnetImg2ImgPipeline
-    params = ["image_embeds", "negative_image_embeds", "image", "hint"]
-    batch_params = ["image_embeds", "negative_image_embeds", "image", "hint"]
-    required_optional_params = [
-        "generator",
-        "height",
-        "width",
-        "strength",
-        "guidance_scale",
-        "num_inference_steps",
-        "return_dict",
-        "guidance_scale",
-        "num_images_per_prompt",
-        "output_type",
-        "return_dict",
-    ]
-    test_xformers_attention = False
-
-    @property
-    def text_embedder_hidden_size(self):
-        return 32
-
-    @property
-    def time_input_dim(self):
-        return 32
-
-    @property
-    def block_out_channels_0(self):
-        return self.time_input_dim
-
-    @property
-    def time_embed_dim(self):
-        return self.time_input_dim * 4
-
-    @property
-    def cross_attention_dim(self):
-        return 100
-
-    @property
-    def dummy_unet(self):
-        torch.manual_seed(0)
-
-        model_kwargs = {
-            "in_channels": 8,
-            # Out channels is double in channels because predicts mean and variance
-            "out_channels": 8,
-            "addition_embed_type": "image_hint",
-            "down_block_types": ("ResnetDownsampleBlock2D", "SimpleCrossAttnDownBlock2D"),
-            "up_block_types": ("SimpleCrossAttnUpBlock2D", "ResnetUpsampleBlock2D"),
-            "mid_block_type": "UNetMidBlock2DSimpleCrossAttn",
-            "block_out_channels": (self.block_out_channels_0, self.block_out_channels_0 * 2),
-            "layers_per_block": 1,
-            "encoder_hid_dim": self.text_embedder_hidden_size,
-            "encoder_hid_dim_type": "image_proj",
-            "cross_attention_dim": self.cross_attention_dim,
-            "attention_head_dim": 4,
-            "resnet_time_scale_shift": "scale_shift",
-            "class_embed_type": None,
-        }
-
-        model = UNet2DConditionModel(**model_kwargs)
-        return model
-
-    @property
-    def dummy_movq_kwargs(self):
-        return {
-            "block_out_channels": [32, 32, 64, 64],
-            "down_block_types": [
-                "DownEncoderBlock2D",
-                "DownEncoderBlock2D",
-                "DownEncoderBlock2D",
-                "AttnDownEncoderBlock2D",
-            ],
-            "in_channels": 3,
-            "latent_channels": 4,
-            "layers_per_block": 1,
-            "norm_num_groups": 8,
-            "norm_type": "spatial",
-            "num_vq_embeddings": 12,
-            "out_channels": 3,
-            "up_block_types": ["AttnUpDecoderBlock2D", "UpDecoderBlock2D", "UpDecoderBlock2D", "UpDecoderBlock2D"],
-            "vq_embed_dim": 4,
-        }
-
-    @property
-    def dummy_movq(self):
-        torch.manual_seed(0)
-        model = VQModel(**self.dummy_movq_kwargs)
-        return model
-
-    def get_dummy_components(self):
-        unet = self.dummy_unet
-        movq = self.dummy_movq
-
-        ddim_config = {
-            "num_train_timesteps": 1000,
-            "beta_schedule": "linear",
-            "beta_start": 0.00085,
-            "beta_end": 0.012,
-            "clip_sample": False,
-            "set_alpha_to_one": False,
-            "steps_offset": 0,
-            "prediction_type": "epsilon",
-            "thresholding": False,
-        }
-
-        scheduler = DDIMScheduler(**ddim_config)
-
-        components = {
-            "unet": unet,
-            "scheduler": scheduler,
-            "movq": movq,
-        }
-
-        return components
-
-    def get_dummy_inputs(self, device, seed=0):
-        image_embeds = floats_tensor((1, self.text_embedder_hidden_size), rng=random.Random(seed)).to(device)
-        negative_image_embeds = floats_tensor((1, self.text_embedder_hidden_size), rng=random.Random(seed + 1)).to(
-            device
-        )
-        # create init_image
-        image = floats_tensor((1, 3, 64, 64), rng=random.Random(seed)).to(device)
-        image = image.cpu().permute(0, 2, 3, 1)[0]
-        init_image = Image.fromarray(np.uint8(image)).convert("RGB").resize((256, 256))
-        # create hint
-        hint = floats_tensor((1, 3, 64, 64), rng=random.Random(seed)).to(device)
-
-        if str(device).startswith("mps"):
-            generator = torch.manual_seed(seed)
-        else:
-            generator = torch.Generator(device=device).manual_seed(seed)
-        inputs = {
-            "image": init_image,
-            "image_embeds": image_embeds,
-            "negative_image_embeds": negative_image_embeds,
-            "hint": hint,
-            "generator": generator,
-            "height": 64,
-            "width": 64,
-            "num_inference_steps": 10,
-            "guidance_scale": 7.0,
-            "strength": 0.2,
-            "output_type": "np",
-        }
-        return inputs
-
-    def test_kandinsky_controlnet_img2img(self):
-        device = "cpu"
-
-        components = self.get_dummy_components()
-
-        pipe = self.pipeline_class(**components)
-        pipe = pipe.to(device)
-
-        pipe.set_progress_bar_config(disable=None)
-
-        output = pipe(**self.get_dummy_inputs(device))
-        image = output.images
-
-        image_from_tuple = pipe(
-            **self.get_dummy_inputs(device),
-            return_dict=False,
-        )[0]
-
-        image_slice = image[0, -3:, -3:, -1]
-        image_from_tuple_slice = image_from_tuple[0, -3:, -3:, -1]
-
-        assert image.shape == (1, 64, 64, 3)
-
-        expected_slice = np.array(
-            [0.54985034, 0.55509365, 0.52561504, 0.5570494, 0.5593818, 0.5263979, 0.50285643, 0.5069846, 0.51196736]
-        )
-        assert (
-            np.abs(image_slice.flatten() - expected_slice).max() < 1e-2
-        ), f" expected_slice {expected_slice}, but got {image_slice.flatten()}"
-        assert (
-            np.abs(image_from_tuple_slice.flatten() - expected_slice).max() < 1e-2
-        ), f" expected_slice {expected_slice}, but got {image_from_tuple_slice.flatten()}"
-
-    def test_inference_batch_single_identical(self):
-        super().test_inference_batch_single_identical(expected_max_diff=1.75e-3)
-
-    def test_float16_inference(self):
-        super().test_float16_inference(expected_max_diff=2e-1)
-
-
-@slow
-@require_torch_gpu
-class KandinskyV22ControlnetImg2ImgPipelineIntegrationTests(unittest.TestCase):
-    def tearDown(self):
-        # clean up the VRAM after each test
-        super().tearDown()
-        gc.collect()
-        torch.cuda.empty_cache()
-
-    def test_kandinsky_controlnet_img2img(self):
-        expected_image = load_numpy(
-            "https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main"
-            "/kandinskyv22/kandinskyv22_controlnet_img2img_robotcat_fp16.npy"
-        )
-
-        init_image = load_image(
-            "https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main" "/kandinsky/cat.png"
-        )
-        init_image = init_image.resize((512, 512))
-
-        hint = load_image(
-            "https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main"
-            "/kandinskyv22/hint_image_cat.png"
-        )
-        hint = torch.from_numpy(np.array(hint)).float() / 255.0
-        hint = hint.permute(2, 0, 1).unsqueeze(0)
-
-        prompt = "A robot, 4k photo"
-
-        pipe_prior = KandinskyV22PriorEmb2EmbPipeline.from_pretrained(
-            "kandinsky-community/kandinsky-2-2-prior", torch_dtype=torch.float16
-        )
-        pipe_prior.to(torch_device)
-
-        pipeline = KandinskyV22ControlnetImg2ImgPipeline.from_pretrained(
-            "kandinsky-community/kandinsky-2-2-controlnet-depth", torch_dtype=torch.float16
-        )
-        pipeline = pipeline.to(torch_device)
-
-        pipeline.set_progress_bar_config(disable=None)
-
-        generator = torch.Generator(device="cpu").manual_seed(0)
-
-        image_emb, zero_image_emb = pipe_prior(
-            prompt,
-            image=init_image,
-            strength=0.85,
-            generator=generator,
-            negative_prompt="",
-        ).to_tuple()
-
-        output = pipeline(
-            image=init_image,
-            image_embeds=image_emb,
-            negative_image_embeds=zero_image_emb,
-            hint=hint,
-            generator=generator,
-            num_inference_steps=100,
-            height=512,
-            width=512,
-            strength=0.5,
-            output_type="np",
-        )
-
-        image = output.images[0]
-
-        assert image.shape == (512, 512, 3)
-
-        assert_mean_pixel_difference(image, expected_image)
diff --git a/tests/pipelines/kandinsky_v22/test_kandinsky_img2img.py b/tests/pipelines/kandinsky_v22/test_kandinsky_img2img.py
deleted file mode 100644
index 9a5b596def58..000000000000
--- a/tests/pipelines/kandinsky_v22/test_kandinsky_img2img.py
+++ /dev/null
@@ -1,295 +0,0 @@
-# coding=utf-8
-# Copyright 2023 HuggingFace Inc.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-import gc
-import random
-import unittest
-
-import numpy as np
-import torch
-from PIL import Image
-
-from diffusers import (
-    DDIMScheduler,
-    KandinskyV22Img2ImgPipeline,
-    KandinskyV22PriorPipeline,
-    UNet2DConditionModel,
-    VQModel,
-)
-from diffusers.utils.testing_utils import (
-    enable_full_determinism,
-    floats_tensor,
-    load_image,
-    load_numpy,
-    require_torch_gpu,
-    slow,
-    torch_device,
-)
-
-from ..test_pipelines_common import PipelineTesterMixin, assert_mean_pixel_difference
-
-
-enable_full_determinism()
-
-
-class Dummies:
-    @property
-    def text_embedder_hidden_size(self):
-        return 32
-
-    @property
-    def time_input_dim(self):
-        return 32
-
-    @property
-    def block_out_channels_0(self):
-        return self.time_input_dim
-
-    @property
-    def time_embed_dim(self):
-        return self.time_input_dim * 4
-
-    @property
-    def cross_attention_dim(self):
-        return 32
-
-    @property
-    def dummy_unet(self):
-        torch.manual_seed(0)
-
-        model_kwargs = {
-            "in_channels": 4,
-            # Out channels is double in channels because predicts mean and variance
-            "out_channels": 8,
-            "addition_embed_type": "image",
-            "down_block_types": ("ResnetDownsampleBlock2D", "SimpleCrossAttnDownBlock2D"),
-            "up_block_types": ("SimpleCrossAttnUpBlock2D", "ResnetUpsampleBlock2D"),
-            "mid_block_type": "UNetMidBlock2DSimpleCrossAttn",
-            "block_out_channels": (self.block_out_channels_0, self.block_out_channels_0 * 2),
-            "layers_per_block": 1,
-            "encoder_hid_dim": self.text_embedder_hidden_size,
-            "encoder_hid_dim_type": "image_proj",
-            "cross_attention_dim": self.cross_attention_dim,
-            "attention_head_dim": 4,
-            "resnet_time_scale_shift": "scale_shift",
-            "class_embed_type": None,
-        }
-
-        model = UNet2DConditionModel(**model_kwargs)
-        return model
-
-    @property
-    def dummy_movq_kwargs(self):
-        return {
-            "block_out_channels": [32, 64],
-            "down_block_types": ["DownEncoderBlock2D", "AttnDownEncoderBlock2D"],
-            "in_channels": 3,
-            "latent_channels": 4,
-            "layers_per_block": 1,
-            "norm_num_groups": 8,
-            "norm_type": "spatial",
-            "num_vq_embeddings": 12,
-            "out_channels": 3,
-            "up_block_types": [
-                "AttnUpDecoderBlock2D",
-                "UpDecoderBlock2D",
-            ],
-            "vq_embed_dim": 4,
-        }
-
-    @property
-    def dummy_movq(self):
-        torch.manual_seed(0)
-        model = VQModel(**self.dummy_movq_kwargs)
-        return model
-
-    def get_dummy_components(self):
-        unet = self.dummy_unet
-        movq = self.dummy_movq
-
-        ddim_config = {
-            "num_train_timesteps": 1000,
-            "beta_schedule": "linear",
-            "beta_start": 0.00085,
-            "beta_end": 0.012,
-            "clip_sample": False,
-            "set_alpha_to_one": False,
-            "steps_offset": 0,
-            "prediction_type": "epsilon",
-            "thresholding": False,
-        }
-
-        scheduler = DDIMScheduler(**ddim_config)
-
-        components = {
-            "unet": unet,
-            "scheduler": scheduler,
-            "movq": movq,
-        }
-
-        return components
-
-    def get_dummy_inputs(self, device, seed=0):
-        image_embeds = floats_tensor((1, self.text_embedder_hidden_size), rng=random.Random(seed)).to(device)
-        negative_image_embeds = floats_tensor((1, self.text_embedder_hidden_size), rng=random.Random(seed + 1)).to(
-            device
-        )
-        # create init_image
-        image = floats_tensor((1, 3, 64, 64), rng=random.Random(seed)).to(device)
-        image = image.cpu().permute(0, 2, 3, 1)[0]
-        init_image = Image.fromarray(np.uint8(image)).convert("RGB").resize((256, 256))
-
-        if str(device).startswith("mps"):
-            generator = torch.manual_seed(seed)
-        else:
-            generator = torch.Generator(device=device).manual_seed(seed)
-        inputs = {
-            "image": init_image,
-            "image_embeds": image_embeds,
-            "negative_image_embeds": negative_image_embeds,
-            "generator": generator,
-            "height": 64,
-            "width": 64,
-            "num_inference_steps": 10,
-            "guidance_scale": 7.0,
-            "strength": 0.2,
-            "output_type": "np",
-        }
-        return inputs
-
-
-class KandinskyV22Img2ImgPipelineFastTests(PipelineTesterMixin, unittest.TestCase):
-    pipeline_class = KandinskyV22Img2ImgPipeline
-    params = ["image_embeds", "negative_image_embeds", "image"]
-    batch_params = [
-        "image_embeds",
-        "negative_image_embeds",
-        "image",
-    ]
-    required_optional_params = [
-        "generator",
-        "height",
-        "width",
-        "strength",
-        "guidance_scale",
-        "num_inference_steps",
-        "return_dict",
-        "guidance_scale",
-        "num_images_per_prompt",
-        "output_type",
-        "return_dict",
-    ]
-    test_xformers_attention = False
-
-    def get_dummy_components(self):
-        dummies = Dummies()
-        return dummies.get_dummy_components()
-
-    def get_dummy_inputs(self, device, seed=0):
-        dummies = Dummies()
-        return dummies.get_dummy_inputs(device=device, seed=seed)
-
-    def test_kandinsky_img2img(self):
-        device = "cpu"
-
-        components = self.get_dummy_components()
-
-        pipe = self.pipeline_class(**components)
-        pipe = pipe.to(device)
-
-        pipe.set_progress_bar_config(disable=None)
-
-        output = pipe(**self.get_dummy_inputs(device))
-        image = output.images
-
-        image_from_tuple = pipe(
-            **self.get_dummy_inputs(device),
-            return_dict=False,
-        )[0]
-
-        image_slice = image[0, -3:, -3:, -1]
-        image_from_tuple_slice = image_from_tuple[0, -3:, -3:, -1]
-
-        assert image.shape == (1, 64, 64, 3)
-
-        expected_slice = np.array([0.5712, 0.5443, 0.4725, 0.6195, 0.5184, 0.4651, 0.4473, 0.4590, 0.5016])
-        assert (
-            np.abs(image_slice.flatten() - expected_slice).max() < 1e-2
-        ), f" expected_slice {expected_slice}, but got {image_slice.flatten()}"
-        assert (
-            np.abs(image_from_tuple_slice.flatten() - expected_slice).max() < 1e-2
-        ), f" expected_slice {expected_slice}, but got {image_from_tuple_slice.flatten()}"
-
-    def test_float16_inference(self):
-        super().test_float16_inference(expected_max_diff=2e-1)
-
-
-@slow
-@require_torch_gpu
-class KandinskyV22Img2ImgPipelineIntegrationTests(unittest.TestCase):
-    def tearDown(self):
-        # clean up the VRAM after each test
-        super().tearDown()
-        gc.collect()
-        torch.cuda.empty_cache()
-
-    def test_kandinsky_img2img(self):
-        expected_image = load_numpy(
-            "https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main"
-            "/kandinskyv22/kandinskyv22_img2img_frog.npy"
-        )
-
-        init_image = load_image(
-            "https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main" "/kandinsky/cat.png"
-        )
-        prompt = "A red cartoon frog, 4k"
-
-        pipe_prior = KandinskyV22PriorPipeline.from_pretrained(
-            "kandinsky-community/kandinsky-2-2-prior", torch_dtype=torch.float16
-        )
-        pipe_prior.to(torch_device)
-
-        pipeline = KandinskyV22Img2ImgPipeline.from_pretrained(
-            "kandinsky-community/kandinsky-2-2-decoder", torch_dtype=torch.float16
-        )
-        pipeline = pipeline.to(torch_device)
-
-        pipeline.set_progress_bar_config(disable=None)
-
-        generator = torch.Generator(device="cpu").manual_seed(0)
-        image_emb, zero_image_emb = pipe_prior(
-            prompt,
-            generator=generator,
-            num_inference_steps=5,
-            negative_prompt="",
-        ).to_tuple()
-
-        output = pipeline(
-            image=init_image,
-            image_embeds=image_emb,
-            negative_image_embeds=zero_image_emb,
-            generator=generator,
-            num_inference_steps=100,
-            height=768,
-            width=768,
-            strength=0.2,
-            output_type="np",
-        )
-
-        image = output.images[0]
-
-        assert image.shape == (768, 768, 3)
-
-        assert_mean_pixel_difference(image, expected_image)
diff --git a/tests/pipelines/kandinsky_v22/test_kandinsky_inpaint.py b/tests/pipelines/kandinsky_v22/test_kandinsky_inpaint.py
deleted file mode 100644
index f40ec0d1f070..000000000000
--- a/tests/pipelines/kandinsky_v22/test_kandinsky_inpaint.py
+++ /dev/null
@@ -1,314 +0,0 @@
-# coding=utf-8
-# Copyright 2023 HuggingFace Inc.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-import gc
-import random
-import unittest
-
-import numpy as np
-import torch
-from PIL import Image
-
-from diffusers import (
-    DDIMScheduler,
-    KandinskyV22InpaintPipeline,
-    KandinskyV22PriorPipeline,
-    UNet2DConditionModel,
-    VQModel,
-)
-from diffusers.utils.testing_utils import (
-    enable_full_determinism,
-    floats_tensor,
-    load_image,
-    load_numpy,
-    require_torch_gpu,
-    slow,
-    torch_device,
-)
-
-from ..test_pipelines_common import PipelineTesterMixin, assert_mean_pixel_difference
-
-
-enable_full_determinism()
-
-
-class Dummies:
-    @property
-    def text_embedder_hidden_size(self):
-        return 32
-
-    @property
-    def time_input_dim(self):
-        return 32
-
-    @property
-    def block_out_channels_0(self):
-        return self.time_input_dim
-
-    @property
-    def time_embed_dim(self):
-        return self.time_input_dim * 4
-
-    @property
-    def cross_attention_dim(self):
-        return 32
-
-    @property
-    def dummy_unet(self):
-        torch.manual_seed(0)
-
-        model_kwargs = {
-            "in_channels": 9,
-            # Out channels is double in channels because predicts mean and variance
-            "out_channels": 8,
-            "addition_embed_type": "image",
-            "down_block_types": ("ResnetDownsampleBlock2D", "SimpleCrossAttnDownBlock2D"),
-            "up_block_types": ("SimpleCrossAttnUpBlock2D", "ResnetUpsampleBlock2D"),
-            "mid_block_type": "UNetMidBlock2DSimpleCrossAttn",
-            "block_out_channels": (self.block_out_channels_0, self.block_out_channels_0 * 2),
-            "layers_per_block": 1,
-            "encoder_hid_dim": self.text_embedder_hidden_size,
-            "encoder_hid_dim_type": "image_proj",
-            "cross_attention_dim": self.cross_attention_dim,
-            "attention_head_dim": 4,
-            "resnet_time_scale_shift": "scale_shift",
-            "class_embed_type": None,
-        }
-
-        model = UNet2DConditionModel(**model_kwargs)
-        return model
-
-    @property
-    def dummy_movq_kwargs(self):
-        return {
-            "block_out_channels": [32, 64],
-            "down_block_types": ["DownEncoderBlock2D", "AttnDownEncoderBlock2D"],
-            "in_channels": 3,
-            "latent_channels": 4,
-            "layers_per_block": 1,
-            "norm_num_groups": 8,
-            "norm_type": "spatial",
-            "num_vq_embeddings": 12,
-            "out_channels": 3,
-            "up_block_types": [
-                "AttnUpDecoderBlock2D",
-                "UpDecoderBlock2D",
-            ],
-            "vq_embed_dim": 4,
-        }
-
-    @property
-    def dummy_movq(self):
-        torch.manual_seed(0)
-        model = VQModel(**self.dummy_movq_kwargs)
-        return model
-
-    def get_dummy_components(self):
-        unet = self.dummy_unet
-        movq = self.dummy_movq
-
-        scheduler = DDIMScheduler(
-            num_train_timesteps=1000,
-            beta_schedule="linear",
-            beta_start=0.00085,
-            beta_end=0.012,
-            clip_sample=False,
-            set_alpha_to_one=False,
-            steps_offset=1,
-            prediction_type="epsilon",
-            thresholding=False,
-        )
-
-        components = {
-            "unet": unet,
-            "scheduler": scheduler,
-            "movq": movq,
-        }
-
-        return components
-
-    def get_dummy_inputs(self, device, seed=0):
-        image_embeds = floats_tensor((1, self.text_embedder_hidden_size), rng=random.Random(seed)).to(device)
-        negative_image_embeds = floats_tensor((1, self.text_embedder_hidden_size), rng=random.Random(seed + 1)).to(
-            device
-        )
-        # create init_image
-        image = floats_tensor((1, 3, 64, 64), rng=random.Random(seed)).to(device)
-        image = image.cpu().permute(0, 2, 3, 1)[0]
-        init_image = Image.fromarray(np.uint8(image)).convert("RGB").resize((256, 256))
-        # create mask
-        mask = np.zeros((64, 64), dtype=np.float32)
-        mask[:32, :32] = 1
-
-        if str(device).startswith("mps"):
-            generator = torch.manual_seed(seed)
-        else:
-            generator = torch.Generator(device=device).manual_seed(seed)
-        inputs = {
-            "image": init_image,
-            "mask_image": mask,
-            "image_embeds": image_embeds,
-            "negative_image_embeds": negative_image_embeds,
-            "generator": generator,
-            "height": 64,
-            "width": 64,
-            "num_inference_steps": 2,
-            "guidance_scale": 4.0,
-            "output_type": "np",
-        }
-        return inputs
-
-
-class KandinskyV22InpaintPipelineFastTests(PipelineTesterMixin, unittest.TestCase):
-    pipeline_class = KandinskyV22InpaintPipeline
-    params = ["image_embeds", "negative_image_embeds", "image", "mask_image"]
-    batch_params = [
-        "image_embeds",
-        "negative_image_embeds",
-        "image",
-        "mask_image",
-    ]
-    required_optional_params = [
-        "generator",
-        "height",
-        "width",
-        "latents",
-        "guidance_scale",
-        "num_inference_steps",
-        "return_dict",
-        "guidance_scale",
-        "num_images_per_prompt",
-        "output_type",
-        "return_dict",
-    ]
-    test_xformers_attention = False
-
-    def get_dummy_components(self):
-        dummies = Dummies()
-        return dummies.get_dummy_components()
-
-    def get_dummy_inputs(self, device, seed=0):
-        dummies = Dummies()
-        return dummies.get_dummy_inputs(device=device, seed=seed)
-
-    def test_kandinsky_inpaint(self):
-        device = "cpu"
-
-        components = self.get_dummy_components()
-
-        pipe = self.pipeline_class(**components)
-        pipe = pipe.to(device)
-
-        pipe.set_progress_bar_config(disable=None)
-
-        output = pipe(**self.get_dummy_inputs(device))
-        image = output.images
-
-        image_from_tuple = pipe(
-            **self.get_dummy_inputs(device),
-            return_dict=False,
-        )[0]
-
-        image_slice = image[0, -3:, -3:, -1]
-        image_from_tuple_slice = image_from_tuple[0, -3:, -3:, -1]
-
-        assert image.shape == (1, 64, 64, 3)
-
-        expected_slice = np.array(
-            [0.50775903, 0.49527195, 0.48824543, 0.50192237, 0.48644906, 0.49373814, 0.4780598, 0.47234827, 0.48327848]
-        )
-
-        assert (
-            np.abs(image_slice.flatten() - expected_slice).max() < 1e-2
-        ), f" expected_slice {expected_slice}, but got {image_slice.flatten()}"
-        assert (
-            np.abs(image_from_tuple_slice.flatten() - expected_slice).max() < 1e-2
-        ), f" expected_slice {expected_slice}, but got {image_from_tuple_slice.flatten()}"
-
-    def test_inference_batch_single_identical(self):
-        super().test_inference_batch_single_identical(expected_max_diff=3e-3)
-
-    def test_float16_inference(self):
-        super().test_float16_inference(expected_max_diff=5e-1)
-
-    def test_model_cpu_offload_forward_pass(self):
-        super().test_inference_batch_single_identical(expected_max_diff=5e-4)
-
-    def test_save_load_optional_components(self):
-        super().test_save_load_optional_components(expected_max_difference=5e-4)
-
-    def test_sequential_cpu_offload_forward_pass(self):
-        super().test_sequential_cpu_offload_forward_pass(expected_max_diff=5e-4)
-
-
-@slow
-@require_torch_gpu
-class KandinskyV22InpaintPipelineIntegrationTests(unittest.TestCase):
-    def tearDown(self):
-        # clean up the VRAM after each test
-        super().tearDown()
-        gc.collect()
-        torch.cuda.empty_cache()
-
-    def test_kandinsky_inpaint(self):
-        expected_image = load_numpy(
-            "https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main"
-            "/kandinskyv22/kandinskyv22_inpaint_cat_with_hat_fp16.npy"
-        )
-
-        init_image = load_image(
-            "https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main" "/kandinsky/cat.png"
-        )
-        mask = np.zeros((768, 768), dtype=np.float32)
-        mask[:250, 250:-250] = 1
-
-        prompt = "a hat"
-
-        pipe_prior = KandinskyV22PriorPipeline.from_pretrained(
-            "kandinsky-community/kandinsky-2-2-prior", torch_dtype=torch.float16
-        )
-        pipe_prior.to(torch_device)
-
-        pipeline = KandinskyV22InpaintPipeline.from_pretrained(
-            "kandinsky-community/kandinsky-2-2-decoder-inpaint", torch_dtype=torch.float16
-        )
-        pipeline = pipeline.to(torch_device)
-        pipeline.set_progress_bar_config(disable=None)
-
-        generator = torch.Generator(device="cpu").manual_seed(0)
-        image_emb, zero_image_emb = pipe_prior(
-            prompt,
-            generator=generator,
-            num_inference_steps=5,
-            negative_prompt="",
-        ).to_tuple()
-
-        output = pipeline(
-            image=init_image,
-            mask_image=mask,
-            image_embeds=image_emb,
-            negative_image_embeds=zero_image_emb,
-            generator=generator,
-            num_inference_steps=100,
-            height=768,
-            width=768,
-            output_type="np",
-        )
-
-        image = output.images[0]
-
-        assert image.shape == (768, 768, 3)
-
-        assert_mean_pixel_difference(image, expected_image)
diff --git a/tests/pipelines/kandinsky_v22/test_kandinsky_prior.py b/tests/pipelines/kandinsky_v22/test_kandinsky_prior.py
deleted file mode 100644
index a0de5cceeb75..000000000000
--- a/tests/pipelines/kandinsky_v22/test_kandinsky_prior.py
+++ /dev/null
@@ -1,237 +0,0 @@
-# coding=utf-8
-# Copyright 2023 HuggingFace Inc.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-import unittest
-
-import numpy as np
-import torch
-from torch import nn
-from transformers import (
-    CLIPImageProcessor,
-    CLIPTextConfig,
-    CLIPTextModelWithProjection,
-    CLIPTokenizer,
-    CLIPVisionConfig,
-    CLIPVisionModelWithProjection,
-)
-
-from diffusers import KandinskyV22PriorPipeline, PriorTransformer, UnCLIPScheduler
-from diffusers.utils.testing_utils import enable_full_determinism, skip_mps, torch_device
-
-from ..test_pipelines_common import PipelineTesterMixin
-
-
-enable_full_determinism()
-
-
-class Dummies:
-    @property
-    def text_embedder_hidden_size(self):
-        return 32
-
-    @property
-    def time_input_dim(self):
-        return 32
-
-    @property
-    def block_out_channels_0(self):
-        return self.time_input_dim
-
-    @property
-    def time_embed_dim(self):
-        return self.time_input_dim * 4
-
-    @property
-    def cross_attention_dim(self):
-        return 100
-
-    @property
-    def dummy_tokenizer(self):
-        tokenizer = CLIPTokenizer.from_pretrained("hf-internal-testing/tiny-random-clip")
-        return tokenizer
-
-    @property
-    def dummy_text_encoder(self):
-        torch.manual_seed(0)
-        config = CLIPTextConfig(
-            bos_token_id=0,
-            eos_token_id=2,
-            hidden_size=self.text_embedder_hidden_size,
-            projection_dim=self.text_embedder_hidden_size,
-            intermediate_size=37,
-            layer_norm_eps=1e-05,
-            num_attention_heads=4,
-            num_hidden_layers=5,
-            pad_token_id=1,
-            vocab_size=1000,
-        )
-        return CLIPTextModelWithProjection(config)
-
-    @property
-    def dummy_prior(self):
-        torch.manual_seed(0)
-
-        model_kwargs = {
-            "num_attention_heads": 2,
-            "attention_head_dim": 12,
-            "embedding_dim": self.text_embedder_hidden_size,
-            "num_layers": 1,
-        }
-
-        model = PriorTransformer(**model_kwargs)
-        # clip_std and clip_mean is initialized to be 0 so PriorTransformer.post_process_latents will always return 0 - set clip_std to be 1 so it won't return 0
-        model.clip_std = nn.Parameter(torch.ones(model.clip_std.shape))
-        return model
-
-    @property
-    def dummy_image_encoder(self):
-        torch.manual_seed(0)
-        config = CLIPVisionConfig(
-            hidden_size=self.text_embedder_hidden_size,
-            image_size=224,
-            projection_dim=self.text_embedder_hidden_size,
-            intermediate_size=37,
-            num_attention_heads=4,
-            num_channels=3,
-            num_hidden_layers=5,
-            patch_size=14,
-        )
-
-        model = CLIPVisionModelWithProjection(config)
-        return model
-
-    @property
-    def dummy_image_processor(self):
-        image_processor = CLIPImageProcessor(
-            crop_size=224,
-            do_center_crop=True,
-            do_normalize=True,
-            do_resize=True,
-            image_mean=[0.48145466, 0.4578275, 0.40821073],
-            image_std=[0.26862954, 0.26130258, 0.27577711],
-            resample=3,
-            size=224,
-        )
-
-        return image_processor
-
-    def get_dummy_components(self):
-        prior = self.dummy_prior
-        image_encoder = self.dummy_image_encoder
-        text_encoder = self.dummy_text_encoder
-        tokenizer = self.dummy_tokenizer
-        image_processor = self.dummy_image_processor
-
-        scheduler = UnCLIPScheduler(
-            variance_type="fixed_small_log",
-            prediction_type="sample",
-            num_train_timesteps=1000,
-            clip_sample=True,
-            clip_sample_range=10.0,
-        )
-
-        components = {
-            "prior": prior,
-            "image_encoder": image_encoder,
-            "text_encoder": text_encoder,
-            "tokenizer": tokenizer,
-            "scheduler": scheduler,
-            "image_processor": image_processor,
-        }
-
-        return components
-
-    def get_dummy_inputs(self, device, seed=0):
-        if str(device).startswith("mps"):
-            generator = torch.manual_seed(seed)
-        else:
-            generator = torch.Generator(device=device).manual_seed(seed)
-        inputs = {
-            "prompt": "horse",
-            "generator": generator,
-            "guidance_scale": 4.0,
-            "num_inference_steps": 2,
-            "output_type": "np",
-        }
-        return inputs
-
-
-class KandinskyV22PriorPipelineFastTests(PipelineTesterMixin, unittest.TestCase):
-    pipeline_class = KandinskyV22PriorPipeline
-    params = ["prompt"]
-    batch_params = ["prompt", "negative_prompt"]
-    required_optional_params = [
-        "num_images_per_prompt",
-        "generator",
-        "num_inference_steps",
-        "latents",
-        "negative_prompt",
-        "guidance_scale",
-        "output_type",
-        "return_dict",
-    ]
-    test_xformers_attention = False
-
-    def get_dummy_components(self):
-        dummies = Dummies()
-        return dummies.get_dummy_components()
-
-    def get_dummy_inputs(self, device, seed=0):
-        dummies = Dummies()
-        return dummies.get_dummy_inputs(device=device, seed=seed)
-
-    def test_kandinsky_prior(self):
-        device = "cpu"
-
-        components = self.get_dummy_components()
-
-        pipe = self.pipeline_class(**components)
-        pipe = pipe.to(device)
-
-        pipe.set_progress_bar_config(disable=None)
-
-        output = pipe(**self.get_dummy_inputs(device))
-        image = output.image_embeds
-
-        image_from_tuple = pipe(
-            **self.get_dummy_inputs(device),
-            return_dict=False,
-        )[0]
-
-        image_slice = image[0, -10:]
-        image_from_tuple_slice = image_from_tuple[0, -10:]
-
-        assert image.shape == (1, 32)
-
-        expected_slice = np.array(
-            [-0.0532, 1.7120, 0.3656, -1.0852, -0.8946, -1.1756, 0.4348, 0.2482, 0.5146, -0.1156]
-        )
-
-        assert np.abs(image_slice.flatten() - expected_slice).max() < 1e-2
-        assert np.abs(image_from_tuple_slice.flatten() - expected_slice).max() < 1e-2
-
-    @skip_mps
-    def test_inference_batch_single_identical(self):
-        self._test_inference_batch_single_identical(expected_max_diff=1e-3)
-
-    @skip_mps
-    def test_attention_slicing_forward_pass(self):
-        test_max_difference = torch_device == "cpu"
-        test_mean_pixel_difference = False
-
-        self._test_attention_slicing_forward_pass(
-            test_max_difference=test_max_difference,
-            test_mean_pixel_difference=test_mean_pixel_difference,
-        )
diff --git a/tests/pipelines/kandinsky_v22/test_kandinsky_prior_emb2emb.py b/tests/pipelines/kandinsky_v22/test_kandinsky_prior_emb2emb.py
deleted file mode 100644
index 89b603e9fc1d..000000000000
--- a/tests/pipelines/kandinsky_v22/test_kandinsky_prior_emb2emb.py
+++ /dev/null
@@ -1,247 +0,0 @@
-# coding=utf-8
-# Copyright 2023 HuggingFace Inc.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-import random
-import unittest
-
-import numpy as np
-import torch
-from PIL import Image
-from torch import nn
-from transformers import (
-    CLIPImageProcessor,
-    CLIPTextConfig,
-    CLIPTextModelWithProjection,
-    CLIPTokenizer,
-    CLIPVisionConfig,
-    CLIPVisionModelWithProjection,
-)
-
-from diffusers import KandinskyV22PriorEmb2EmbPipeline, PriorTransformer, UnCLIPScheduler
-from diffusers.utils.testing_utils import enable_full_determinism, floats_tensor, skip_mps, torch_device
-
-from ..test_pipelines_common import PipelineTesterMixin
-
-
-enable_full_determinism()
-
-
-class KandinskyV22PriorEmb2EmbPipelineFastTests(PipelineTesterMixin, unittest.TestCase):
-    pipeline_class = KandinskyV22PriorEmb2EmbPipeline
-    params = ["prompt", "image"]
-    batch_params = ["prompt", "image"]
-    required_optional_params = [
-        "num_images_per_prompt",
-        "strength",
-        "generator",
-        "num_inference_steps",
-        "negative_prompt",
-        "guidance_scale",
-        "output_type",
-        "return_dict",
-    ]
-    test_xformers_attention = False
-
-    @property
-    def text_embedder_hidden_size(self):
-        return 32
-
-    @property
-    def time_input_dim(self):
-        return 32
-
-    @property
-    def block_out_channels_0(self):
-        return self.time_input_dim
-
-    @property
-    def time_embed_dim(self):
-        return self.time_input_dim * 4
-
-    @property
-    def cross_attention_dim(self):
-        return 100
-
-    @property
-    def dummy_tokenizer(self):
-        tokenizer = CLIPTokenizer.from_pretrained("hf-internal-testing/tiny-random-clip")
-        return tokenizer
-
-    @property
-    def dummy_text_encoder(self):
-        torch.manual_seed(0)
-        config = CLIPTextConfig(
-            bos_token_id=0,
-            eos_token_id=2,
-            hidden_size=self.text_embedder_hidden_size,
-            projection_dim=self.text_embedder_hidden_size,
-            intermediate_size=37,
-            layer_norm_eps=1e-05,
-            num_attention_heads=4,
-            num_hidden_layers=5,
-            pad_token_id=1,
-            vocab_size=1000,
-        )
-        return CLIPTextModelWithProjection(config)
-
-    @property
-    def dummy_prior(self):
-        torch.manual_seed(0)
-
-        model_kwargs = {
-            "num_attention_heads": 2,
-            "attention_head_dim": 12,
-            "embedding_dim": self.text_embedder_hidden_size,
-            "num_layers": 1,
-        }
-
-        model = PriorTransformer(**model_kwargs)
-        # clip_std and clip_mean is initialized to be 0 so PriorTransformer.post_process_latents will always return 0 - set clip_std to be 1 so it won't return 0
-        model.clip_std = nn.Parameter(torch.ones(model.clip_std.shape))
-        return model
-
-    @property
-    def dummy_image_encoder(self):
-        torch.manual_seed(0)
-        config = CLIPVisionConfig(
-            hidden_size=self.text_embedder_hidden_size,
-            image_size=224,
-            projection_dim=self.text_embedder_hidden_size,
-            intermediate_size=37,
-            num_attention_heads=4,
-            num_channels=3,
-            num_hidden_layers=5,
-            patch_size=14,
-        )
-
-        model = CLIPVisionModelWithProjection(config)
-        return model
-
-    @property
-    def dummy_image_processor(self):
-        image_processor = CLIPImageProcessor(
-            crop_size=224,
-            do_center_crop=True,
-            do_normalize=True,
-            do_resize=True,
-            image_mean=[0.48145466, 0.4578275, 0.40821073],
-            image_std=[0.26862954, 0.26130258, 0.27577711],
-            resample=3,
-            size=224,
-        )
-
-        return image_processor
-
-    def get_dummy_components(self):
-        prior = self.dummy_prior
-        image_encoder = self.dummy_image_encoder
-        text_encoder = self.dummy_text_encoder
-        tokenizer = self.dummy_tokenizer
-        image_processor = self.dummy_image_processor
-
-        scheduler = UnCLIPScheduler(
-            variance_type="fixed_small_log",
-            prediction_type="sample",
-            num_train_timesteps=1000,
-            clip_sample=True,
-            clip_sample_range=10.0,
-        )
-
-        components = {
-            "prior": prior,
-            "image_encoder": image_encoder,
-            "text_encoder": text_encoder,
-            "tokenizer": tokenizer,
-            "scheduler": scheduler,
-            "image_processor": image_processor,
-        }
-
-        return components
-
-    def get_dummy_inputs(self, device, seed=0):
-        if str(device).startswith("mps"):
-            generator = torch.manual_seed(seed)
-        else:
-            generator = torch.Generator(device=device).manual_seed(seed)
-
-        image = floats_tensor((1, 3, 64, 64), rng=random.Random(seed)).to(device)
-        image = image.cpu().permute(0, 2, 3, 1)[0]
-        init_image = Image.fromarray(np.uint8(image)).convert("RGB").resize((256, 256))
-
-        inputs = {
-            "prompt": "horse",
-            "image": init_image,
-            "strength": 0.5,
-            "generator": generator,
-            "guidance_scale": 4.0,
-            "num_inference_steps": 2,
-            "output_type": "np",
-        }
-        return inputs
-
-    def test_kandinsky_prior_emb2emb(self):
-        device = "cpu"
-
-        components = self.get_dummy_components()
-
-        pipe = self.pipeline_class(**components)
-        pipe = pipe.to(device)
-
-        pipe.set_progress_bar_config(disable=None)
-
-        output = pipe(**self.get_dummy_inputs(device))
-        image = output.image_embeds
-
-        image_from_tuple = pipe(
-            **self.get_dummy_inputs(device),
-            return_dict=False,
-        )[0]
-
-        image_slice = image[0, -10:]
-        image_from_tuple_slice = image_from_tuple[0, -10:]
-
-        assert image.shape == (1, 32)
-
-        expected_slice = np.array(
-            [
-                0.1071284,
-                1.3330271,
-                0.61260223,
-                -0.6691065,
-                -0.3846852,
-                -1.0303661,
-                0.22716111,
-                0.03348901,
-                0.30040675,
-                -0.24805029,
-            ]
-        )
-
-        assert np.abs(image_slice.flatten() - expected_slice).max() < 1e-2
-        assert np.abs(image_from_tuple_slice.flatten() - expected_slice).max() < 1e-2
-
-    @skip_mps
-    def test_inference_batch_single_identical(self):
-        self._test_inference_batch_single_identical(expected_max_diff=1e-2)
-
-    @skip_mps
-    def test_attention_slicing_forward_pass(self):
-        test_max_difference = torch_device == "cpu"
-        test_mean_pixel_difference = False
-
-        self._test_attention_slicing_forward_pass(
-            test_max_difference=test_max_difference,
-            test_mean_pixel_difference=test_mean_pixel_difference,
-        )
diff --git a/tests/pipelines/text_to_video/__init__.py b/tests/pipelines/text_to_video/__init__.py
deleted file mode 100644
index e69de29bb2d1..000000000000
diff --git a/tests/pipelines/text_to_video/test_text_to_video.py b/tests/pipelines/text_to_video/test_text_to_video.py
deleted file mode 100644
index 2c47dc492da1..000000000000
--- a/tests/pipelines/text_to_video/test_text_to_video.py
+++ /dev/null
@@ -1,195 +0,0 @@
-# coding=utf-8
-# Copyright 2023 HuggingFace Inc.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-import unittest
-
-import numpy as np
-import torch
-from transformers import CLIPTextConfig, CLIPTextModel, CLIPTokenizer
-
-from diffusers import (
-    AutoencoderKL,
-    DDIMScheduler,
-    TextToVideoSDPipeline,
-    UNet3DConditionModel,
-)
-from diffusers.utils import is_xformers_available
-from diffusers.utils.testing_utils import (
-    enable_full_determinism,
-    load_numpy,
-    require_torch_gpu,
-    skip_mps,
-    slow,
-    torch_device,
-)
-
-from ..pipeline_params import TEXT_TO_IMAGE_BATCH_PARAMS, TEXT_TO_IMAGE_PARAMS
-from ..test_pipelines_common import PipelineTesterMixin
-
-
-enable_full_determinism()
-
-
-@skip_mps
-class TextToVideoSDPipelineFastTests(PipelineTesterMixin, unittest.TestCase):
-    pipeline_class = TextToVideoSDPipeline
-    params = TEXT_TO_IMAGE_PARAMS
-    batch_params = TEXT_TO_IMAGE_BATCH_PARAMS
-    # No `output_type`.
-    required_optional_params = frozenset(
-        [
-            "num_inference_steps",
-            "generator",
-            "latents",
-            "return_dict",
-            "callback",
-            "callback_steps",
-        ]
-    )
-
-    def get_dummy_components(self):
-        torch.manual_seed(0)
-        unet = UNet3DConditionModel(
-            block_out_channels=(32, 32),
-            layers_per_block=2,
-            sample_size=32,
-            in_channels=4,
-            out_channels=4,
-            down_block_types=("CrossAttnDownBlock3D", "DownBlock3D"),
-            up_block_types=("UpBlock3D", "CrossAttnUpBlock3D"),
-            cross_attention_dim=4,
-            attention_head_dim=4,
-        )
-        scheduler = DDIMScheduler(
-            beta_start=0.00085,
-            beta_end=0.012,
-            beta_schedule="scaled_linear",
-            clip_sample=False,
-            set_alpha_to_one=False,
-        )
-        torch.manual_seed(0)
-        vae = AutoencoderKL(
-            block_out_channels=(32,),
-            in_channels=3,
-            out_channels=3,
-            down_block_types=["DownEncoderBlock2D"],
-            up_block_types=["UpDecoderBlock2D"],
-            latent_channels=4,
-            sample_size=32,
-        )
-        torch.manual_seed(0)
-        text_encoder_config = CLIPTextConfig(
-            bos_token_id=0,
-            eos_token_id=2,
-            hidden_size=4,
-            intermediate_size=16,
-            layer_norm_eps=1e-05,
-            num_attention_heads=2,
-            num_hidden_layers=2,
-            pad_token_id=1,
-            vocab_size=1000,
-            hidden_act="gelu",
-            projection_dim=32,
-        )
-        text_encoder = CLIPTextModel(text_encoder_config)
-        tokenizer = CLIPTokenizer.from_pretrained("hf-internal-testing/tiny-random-clip")
-
-        components = {
-            "unet": unet,
-            "scheduler": scheduler,
-            "vae": vae,
-            "text_encoder": text_encoder,
-            "tokenizer": tokenizer,
-        }
-        return components
-
-    def get_dummy_inputs(self, device, seed=0):
-        if str(device).startswith("mps"):
-            generator = torch.manual_seed(seed)
-        else:
-            generator = torch.Generator(device=device).manual_seed(seed)
-        inputs = {
-            "prompt": "A painting of a squirrel eating a burger",
-            "generator": generator,
-            "num_inference_steps": 2,
-            "guidance_scale": 6.0,
-            "output_type": "pt",
-        }
-        return inputs
-
-    def test_text_to_video_default_case(self):
-        device = "cpu"  # ensure determinism for the device-dependent torch.Generator
-        components = self.get_dummy_components()
-        sd_pipe = TextToVideoSDPipeline(**components)
-        sd_pipe = sd_pipe.to(device)
-        sd_pipe.set_progress_bar_config(disable=None)
-
-        inputs = self.get_dummy_inputs(device)
-        inputs["output_type"] = "np"
-        frames = sd_pipe(**inputs).frames
-        image_slice = frames[0][-3:, -3:, -1]
-
-        assert frames[0].shape == (32, 32, 3)
-        expected_slice = np.array([91.0, 152.0, 66.0, 192.0, 94.0, 126.0, 101.0, 123.0, 152.0])
-
-        assert np.abs(image_slice.flatten() - expected_slice).max() < 1e-2
-
-    def test_attention_slicing_forward_pass(self):
-        self._test_attention_slicing_forward_pass(test_mean_pixel_difference=False, expected_max_diff=3e-3)
-
-    @unittest.skipIf(
-        torch_device != "cuda" or not is_xformers_available(),
-        reason="XFormers attention is only available with CUDA and `xformers` installed",
-    )
-    def test_xformers_attention_forwardGenerator_pass(self):
-        self._test_xformers_attention_forwardGenerator_pass(test_mean_pixel_difference=False, expected_max_diff=1e-2)
-
-    # (todo): sayakpaul
-    @unittest.skip(reason="Batching needs to be properly figured out first for this pipeline.")
-    def test_inference_batch_consistent(self):
-        pass
-
-    # (todo): sayakpaul
-    @unittest.skip(reason="Batching needs to be properly figured out first for this pipeline.")
-    def test_inference_batch_single_identical(self):
-        pass
-
-    @unittest.skip(reason="`num_images_per_prompt` argument is not supported for this pipeline.")
-    def test_num_images_per_prompt(self):
-        pass
-
-    def test_progress_bar(self):
-        return super().test_progress_bar()
-
-
-@slow
-@skip_mps
-@require_torch_gpu
-class TextToVideoSDPipelineSlowTests(unittest.TestCase):
-    def test_two_step_model(self):
-        expected_video = load_numpy(
-            "https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main/text_to_video/video_2step.npy"
-        )
-
-        pipe = TextToVideoSDPipeline.from_pretrained("damo-vilab/text-to-video-ms-1.7b")
-        pipe = pipe.to(torch_device)
-
-        prompt = "Spiderman is surfing"
-        generator = torch.Generator(device="cpu").manual_seed(0)
-
-        video_frames = pipe(prompt, generator=generator, num_inference_steps=2, output_type="pt").frames
-        video = video_frames.cpu().numpy()
-
-        assert np.abs(expected_video - video).mean() < 5e-2
diff --git a/tests/pipelines/text_to_video/test_text_to_video_zero.py b/tests/pipelines/text_to_video/test_text_to_video_zero.py
deleted file mode 100644
index 02fb43a0b65b..000000000000
--- a/tests/pipelines/text_to_video/test_text_to_video_zero.py
+++ /dev/null
@@ -1,42 +0,0 @@
-# coding=utf-8
-# Copyright 2023 HuggingFace Inc.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-import unittest
-
-import torch
-
-from diffusers import DDIMScheduler, TextToVideoZeroPipeline
-from diffusers.utils.testing_utils import load_pt, require_torch_gpu, slow
-
-from ..test_pipelines_common import assert_mean_pixel_difference
-
-
-@slow
-@require_torch_gpu
-class TextToVideoZeroPipelineSlowTests(unittest.TestCase):
-    def test_full_model(self):
-        model_id = "runwayml/stable-diffusion-v1-5"
-        pipe = TextToVideoZeroPipeline.from_pretrained(model_id, torch_dtype=torch.float16).to("cuda")
-        pipe.scheduler = DDIMScheduler.from_config(pipe.scheduler.config)
-        generator = torch.Generator(device="cuda").manual_seed(0)
-
-        prompt = "A bear is playing a guitar on Times Square"
-        result = pipe(prompt=prompt, generator=generator).images
-
-        expected_result = load_pt(
-            "https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main/text-to-video/A bear is playing a guitar on Times Square.pt"
-        )
-
-        assert_mean_pixel_difference(result, expected_result)
diff --git a/tests/pipelines/text_to_video/test_video_to_video.py b/tests/pipelines/text_to_video/test_video_to_video.py
deleted file mode 100644
index f057eb34997e..000000000000
--- a/tests/pipelines/text_to_video/test_video_to_video.py
+++ /dev/null
@@ -1,204 +0,0 @@
-# coding=utf-8
-# Copyright 2023 HuggingFace Inc.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-import random
-import unittest
-
-import numpy as np
-import torch
-from transformers import CLIPTextConfig, CLIPTextModel, CLIPTokenizer
-
-from diffusers import (
-    AutoencoderKL,
-    DDIMScheduler,
-    UNet3DConditionModel,
-    VideoToVideoSDPipeline,
-)
-from diffusers.utils import is_xformers_available
-from diffusers.utils.testing_utils import (
-    enable_full_determinism,
-    floats_tensor,
-    skip_mps,
-    slow,
-    torch_device,
-)
-
-from ..pipeline_params import (
-    TEXT_GUIDED_IMAGE_VARIATION_BATCH_PARAMS,
-    TEXT_GUIDED_IMAGE_VARIATION_PARAMS,
-)
-from ..test_pipelines_common import PipelineTesterMixin
-
-
-enable_full_determinism()
-
-
-@skip_mps
-class VideoToVideoSDPipelineFastTests(PipelineTesterMixin, unittest.TestCase):
-    pipeline_class = VideoToVideoSDPipeline
-    params = TEXT_GUIDED_IMAGE_VARIATION_PARAMS.union({"video"}) - {"image", "width", "height"}
-    batch_params = TEXT_GUIDED_IMAGE_VARIATION_BATCH_PARAMS.union({"video"}) - {"image"}
-    required_optional_params = PipelineTesterMixin.required_optional_params - {"latents"}
-    test_attention_slicing = False
-
-    # No `output_type`.
-    required_optional_params = frozenset(
-        [
-            "num_inference_steps",
-            "generator",
-            "latents",
-            "return_dict",
-            "callback",
-            "callback_steps",
-        ]
-    )
-
-    def get_dummy_components(self):
-        torch.manual_seed(0)
-        unet = UNet3DConditionModel(
-            block_out_channels=(32, 64, 64, 64),
-            layers_per_block=2,
-            sample_size=32,
-            in_channels=4,
-            out_channels=4,
-            down_block_types=("CrossAttnDownBlock3D", "CrossAttnDownBlock3D", "CrossAttnDownBlock3D", "DownBlock3D"),
-            up_block_types=("UpBlock3D", "CrossAttnUpBlock3D", "CrossAttnUpBlock3D", "CrossAttnUpBlock3D"),
-            cross_attention_dim=32,
-            attention_head_dim=4,
-        )
-        scheduler = DDIMScheduler(
-            beta_start=0.00085,
-            beta_end=0.012,
-            beta_schedule="scaled_linear",
-            clip_sample=True,
-            set_alpha_to_one=False,
-        )
-        torch.manual_seed(0)
-        vae = AutoencoderKL(
-            block_out_channels=[32, 64],
-            in_channels=3,
-            out_channels=3,
-            down_block_types=["DownEncoderBlock2D", "DownEncoderBlock2D"],
-            up_block_types=["UpDecoderBlock2D", "UpDecoderBlock2D"],
-            latent_channels=4,
-            sample_size=128,
-        )
-        torch.manual_seed(0)
-        text_encoder_config = CLIPTextConfig(
-            bos_token_id=0,
-            eos_token_id=2,
-            hidden_size=32,
-            intermediate_size=37,
-            layer_norm_eps=1e-05,
-            num_attention_heads=4,
-            num_hidden_layers=5,
-            pad_token_id=1,
-            vocab_size=1000,
-            hidden_act="gelu",
-            projection_dim=512,
-        )
-        text_encoder = CLIPTextModel(text_encoder_config)
-        tokenizer = CLIPTokenizer.from_pretrained("hf-internal-testing/tiny-random-clip")
-
-        components = {
-            "unet": unet,
-            "scheduler": scheduler,
-            "vae": vae,
-            "text_encoder": text_encoder,
-            "tokenizer": tokenizer,
-        }
-        return components
-
-    def get_dummy_inputs(self, device, seed=0):
-        # 3 frames
-        video = floats_tensor((1, 3, 3, 32, 32), rng=random.Random(seed)).to(device)
-
-        if str(device).startswith("mps"):
-            generator = torch.manual_seed(seed)
-        else:
-            generator = torch.Generator(device=device).manual_seed(seed)
-        inputs = {
-            "prompt": "A painting of a squirrel eating a burger",
-            "video": video,
-            "generator": generator,
-            "num_inference_steps": 2,
-            "guidance_scale": 6.0,
-            "output_type": "pt",
-        }
-        return inputs
-
-    def test_text_to_video_default_case(self):
-        device = "cpu"  # ensure determinism for the device-dependent torch.Generator
-        components = self.get_dummy_components()
-        sd_pipe = VideoToVideoSDPipeline(**components)
-        sd_pipe = sd_pipe.to(device)
-        sd_pipe.set_progress_bar_config(disable=None)
-
-        inputs = self.get_dummy_inputs(device)
-        inputs["output_type"] = "np"
-        frames = sd_pipe(**inputs).frames
-        image_slice = frames[0][-3:, -3:, -1]
-
-        assert frames[0].shape == (32, 32, 3)
-        expected_slice = np.array([106, 117, 113, 174, 137, 112, 148, 151, 131])
-
-        assert np.abs(image_slice.flatten() - expected_slice).max() < 1e-2
-
-    def test_save_load_optional_components(self):
-        super().test_save_load_optional_components(expected_max_difference=0.001)
-
-    @unittest.skipIf(
-        torch_device != "cuda" or not is_xformers_available(),
-        reason="XFormers attention is only available with CUDA and `xformers` installed",
-    )
-    def test_xformers_attention_forwardGenerator_pass(self):
-        self._test_xformers_attention_forwardGenerator_pass(test_mean_pixel_difference=False, expected_max_diff=5e-3)
-
-    # (todo): sayakpaul
-    @unittest.skip(reason="Batching needs to be properly figured out first for this pipeline.")
-    def test_inference_batch_consistent(self):
-        pass
-
-    # (todo): sayakpaul
-    @unittest.skip(reason="Batching needs to be properly figured out first for this pipeline.")
-    def test_inference_batch_single_identical(self):
-        pass
-
-    @unittest.skip(reason="`num_images_per_prompt` argument is not supported for this pipeline.")
-    def test_num_images_per_prompt(self):
-        pass
-
-    def test_progress_bar(self):
-        return super().test_progress_bar()
-
-
-@slow
-@skip_mps
-class VideoToVideoSDPipelineSlowTests(unittest.TestCase):
-    def test_two_step_model(self):
-        pipe = VideoToVideoSDPipeline.from_pretrained("cerspense/zeroscope_v2_XL", torch_dtype=torch.float16)
-        pipe.enable_model_cpu_offload()
-
-        # 10 frames
-        generator = torch.Generator(device="cpu").manual_seed(0)
-        video = torch.randn((1, 10, 3, 1024, 576), generator=generator)
-        video = video.to("cuda")
-
-        prompt = "Spiderman is surfing"
-
-        video_frames = pipe(prompt, video=video, generator=generator, num_inference_steps=3, output_type="pt").frames
-
-        expected_array = np.array([-1.0458984, -1.1279297, -0.9663086, -0.91503906, -0.75097656])
-        assert np.abs(video_frames.cpu().numpy()[0, 0, 0, 0, -5:] - expected_array).sum() < 1e-2

From 123bb9d4e99f527e8b0e986abf8a3d7d0673f30b Mon Sep 17 00:00:00 2001
From: Patrick von Platen <patrick.v.platen@gmail.com>
Date: Thu, 21 Sep 2023 23:57:09 +0200
Subject: [PATCH 08/22] test actions in pr

---
 .github/workflows/push_tests.yml | 6 +++++-
 1 file changed, 5 insertions(+), 1 deletion(-)

diff --git a/.github/workflows/push_tests.yml b/.github/workflows/push_tests.yml
index 3f816bca7285..cbbdc6e6a8e1 100644
--- a/.github/workflows/push_tests.yml
+++ b/.github/workflows/push_tests.yml
@@ -1,9 +1,13 @@
 name: Slow Tests on main
 
 on:
-  push:
+  pull_request:
     branches:
       - main
+  push:
+    branches:
+      - ci-*
+
 
 env:
   DIFFUSERS_IS_CI: yes

From b9ff251ef778052f21972cdb83a4c464abefbec8 Mon Sep 17 00:00:00 2001
From: Dhruv Nair <dhruv.nair@gmail.com>
Date: Fri, 22 Sep 2023 15:53:29 +0530
Subject: [PATCH 09/22] change runner to gpu

---
 .github/workflows/push_tests.yml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.github/workflows/push_tests.yml b/.github/workflows/push_tests.yml
index cbbdc6e6a8e1..bff3ac3bf11e 100644
--- a/.github/workflows/push_tests.yml
+++ b/.github/workflows/push_tests.yml
@@ -20,7 +20,7 @@ env:
 
 jobs:
   setup_torch_cuda_pipeline_matrix:
-    runs-on: docker-cpu
+    runs-on: docker-gpu
     container:
       image: diffusers/diffusers-pytorch-cpu
       options: --shm-size "16gb" --ipc host -v /mnt/hf_cache:/mnt/cache/

From 6d839073239c3938df6e13d0814b1a708bd27567 Mon Sep 17 00:00:00 2001
From: Dhruv Nair <dhruv.nair@gmail.com>
Date: Fri, 22 Sep 2023 10:29:35 +0000
Subject: [PATCH 10/22] clean up

---
 .github/workflows/push_tests.yml | 6 ------
 1 file changed, 6 deletions(-)

diff --git a/.github/workflows/push_tests.yml b/.github/workflows/push_tests.yml
index bff3ac3bf11e..f1663a60d5b1 100644
--- a/.github/workflows/push_tests.yml
+++ b/.github/workflows/push_tests.yml
@@ -63,11 +63,9 @@ jobs:
       matrix:
         module: ${{ fromJson(needs.setup_torch_cuda_pipeline_matrix.outputs.pipeline_test_matrix) }}
     runs-on: docker-gpu
-    framework: pytorch
     container:
       image: diffusers/diffusers-pytorch-cuda
       options: --shm-size "16gb" --ipc host -v /mnt/hf_cache:/mnt/cache/ --gpus 0
-
     steps:
       - name: Checkout diffusers
         uses: actions/checkout@v3
@@ -107,7 +105,6 @@ jobs:
 
   torch_cuda_tests:
     runs-on: docker-gpu
-    framework: pytorch
     report: torch_cuda
     container:
       image: diffusers/diffusers-onnxruntime-cuda
@@ -155,7 +152,6 @@ jobs:
 
   flax_tpu_tests:
     runs-on: docker-tpu
-    framework: flax
     report: flax_tpu
     container:
       image: diffusers/diffusers-flax-tpu
@@ -180,7 +176,6 @@ jobs:
         python utils/print_env.py
 
     - name: Run slow Flax TPU tests
-      if: ${{ framework == 'flax' }}
       env:
         HUGGING_FACE_HUB_TOKEN: ${{ secrets.HUGGING_FACE_HUB_TOKEN }}
       run: |
@@ -202,7 +197,6 @@ jobs:
 
   onnx_cuda_tests:
     runs-on: docker-gpu
-    framework: onnxruntime
     report: onnx_cuda
     container:
       image: diffusers/diffusers-onnxruntime-cuda

From 9983c6337a6d58ca5998b4ae8433805b483541ca Mon Sep 17 00:00:00 2001
From: Dhruv Nair <dhruv.nair@gmail.com>
Date: Fri, 22 Sep 2023 10:45:04 +0000
Subject: [PATCH 11/22] clean up

---
 .github/workflows/push_tests.yml | 24 +++++++++++-------------
 1 file changed, 11 insertions(+), 13 deletions(-)

diff --git a/.github/workflows/push_tests.yml b/.github/workflows/push_tests.yml
index f1663a60d5b1..5efeab75c381 100644
--- a/.github/workflows/push_tests.yml
+++ b/.github/workflows/push_tests.yml
@@ -90,7 +90,7 @@ jobs:
         run: |
           python -m pytest -n 1 --max-worker-restart=0 --dist=loadfile \
             -s -v -k "not Flax and not Onnx" \
-            --make-reports=tests_${{ matrix.module }}_cuda \
+            --make-reports=pipeline_${{ matrix.module }}_cuda \
             tests/pipelines/${{ matrix.module }}
       - name: Failure short reports
         if: ${{ failure() }}
@@ -100,12 +100,11 @@ jobs:
         if: ${{ always() }}
         uses: actions/upload-artifact@v2
         with:
-          name: ${{ matrix.module }}_test_reports
+          name: pipeline_${{ matrix.module }}_test_reports
           path: reports
 
   torch_cuda_tests:
     runs-on: docker-gpu
-    report: torch_cuda
     container:
       image: diffusers/diffusers-onnxruntime-cuda
       options: --shm-size "16gb" --ipc host -v /mnt/hf_cache:/mnt/cache/ --gpus 0
@@ -136,18 +135,18 @@ jobs:
       run: |
         python -m pytest -n 1 --max-worker-restart=0 --dist=loadfile \
           -s -v -k "not Flax and not Onnx" \
-          --make-reports=tests_torch_cuda \
+          --make-reports=torch_cuda \
           tests/models tests/schedulers tests/others
 
     - name: Failure short reports
       if: ${{ failure() }}
-      run: cat reports/tests_${{ report }}_failures_short.txt
+      run: cat reports/tests_torch_cuda_failures_short.txt
 
     - name: Test suite reports artifacts
       if: ${{ always() }}
       uses: actions/upload-artifact@v2
       with:
-        name: ${{ report }}_test_reports
+        name: torch_cuda_test_reports
         path: reports
 
   flax_tpu_tests:
@@ -181,7 +180,7 @@ jobs:
       run: |
         python -m pytest -n 0 \
           -s -v -k "Flax" \
-          --make-reports=tests_${{ report }} \
+          --make-reports=flax_tpu \
           tests/
 
     - name: Failure short reports
@@ -192,12 +191,11 @@ jobs:
       if: ${{ always() }}
       uses: actions/upload-artifact@v2
       with:
-        name: ${{ report }}_test_reports
+        name: flax_tpu_test_reports
         path: reports
 
   onnx_cuda_tests:
     runs-on: docker-gpu
-    report: onnx_cuda
     container:
       image: diffusers/diffusers-onnxruntime-cuda
       options: --shm-size "16gb" --ipc host -v /mnt/hf_cache:/mnt/cache/ --gpus 0
@@ -226,18 +224,18 @@ jobs:
       run: |
         python -m pytest -n 1 --max-worker-restart=0 --dist=loadfile \
           -s -v -k "Onnx" \
-          --make-reports=tests_${{ report }} \
+          --make-reports=onnx_cuda \
           tests/
 
     - name: Failure short reports
       if: ${{ failure() }}
-      run: cat reports/tests_${{ report }}_failures_short.txt
+      run: cat reports/tests_onnx_cuda_failures_short.txt
 
     - name: Test suite reports artifacts
       if: ${{ always() }}
       uses: actions/upload-artifact@v2
       with:
-        name: ${{ report }}_test_reports
+        name: onnx_cuda_test_reports
         path: reports
 
   run_examples_tests:
@@ -281,5 +279,5 @@ jobs:
       if: ${{ always() }}
       uses: actions/upload-artifact@v2
       with:
-        name: examples_test_reports
+        name: examples_torch_cuda_test_reports
         path: reports

From 31da7866c64bc6198a23e5c585957e5539cc63ca Mon Sep 17 00:00:00 2001
From: Dhruv Nair <dhruv.nair@gmail.com>
Date: Fri, 22 Sep 2023 10:48:12 +0000
Subject: [PATCH 12/22] clean up

---
 .github/workflows/push_tests.yml | 1 -
 1 file changed, 1 deletion(-)

diff --git a/.github/workflows/push_tests.yml b/.github/workflows/push_tests.yml
index 5efeab75c381..356e690a6dce 100644
--- a/.github/workflows/push_tests.yml
+++ b/.github/workflows/push_tests.yml
@@ -151,7 +151,6 @@ jobs:
 
   flax_tpu_tests:
     runs-on: docker-tpu
-    report: flax_tpu
     container:
       image: diffusers/diffusers-flax-tpu
       options: --shm-size "16gb" --ipc host -v /mnt/hf_cache:/mnt/cache/ --privileged

From c220f2046a720e6f94e7326583fdc2f1c607eef0 Mon Sep 17 00:00:00 2001
From: Dhruv Nair <dhruv.nair@gmail.com>
Date: Fri, 22 Sep 2023 10:52:28 +0000
Subject: [PATCH 13/22] fix report

---
 .github/workflows/push_tests.yml | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/.github/workflows/push_tests.yml b/.github/workflows/push_tests.yml
index 356e690a6dce..310e325c6232 100644
--- a/.github/workflows/push_tests.yml
+++ b/.github/workflows/push_tests.yml
@@ -94,7 +94,7 @@ jobs:
             tests/pipelines/${{ matrix.module }}
       - name: Failure short reports
         if: ${{ failure() }}
-        run: cat reports/tests_${{ matrix.module }}_failures_short.txt
+        run: cat reports/tests_pipeline_${{ matrix.module }}_failures_short.txt
 
       - name: Test suite reports artifacts
         if: ${{ always() }}
@@ -184,7 +184,7 @@ jobs:
 
     - name: Failure short reports
       if: ${{ failure() }}
-      run: cat reports/tests_${{ report }}_failures_short.txt
+      run: cat reports/tests_flax_tpu_failures_short.txt
 
     - name: Test suite reports artifacts
       if: ${{ always() }}

From 9b7eb4ce867e667f8f659ab58a449affcb472e1e Mon Sep 17 00:00:00 2001
From: Dhruv Nair <dhruv.nair@gmail.com>
Date: Fri, 22 Sep 2023 11:39:49 +0000
Subject: [PATCH 14/22] fix reporting

---
 .github/workflows/push_tests.yml | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

diff --git a/.github/workflows/push_tests.yml b/.github/workflows/push_tests.yml
index 310e325c6232..319aaa187c60 100644
--- a/.github/workflows/push_tests.yml
+++ b/.github/workflows/push_tests.yml
@@ -90,7 +90,7 @@ jobs:
         run: |
           python -m pytest -n 1 --max-worker-restart=0 --dist=loadfile \
             -s -v -k "not Flax and not Onnx" \
-            --make-reports=pipeline_${{ matrix.module }}_cuda \
+            --make-reports=tests_pipeline_${{ matrix.module }}_cuda \
             tests/pipelines/${{ matrix.module }}
       - name: Failure short reports
         if: ${{ failure() }}
@@ -106,7 +106,7 @@ jobs:
   torch_cuda_tests:
     runs-on: docker-gpu
     container:
-      image: diffusers/diffusers-onnxruntime-cuda
+      image: diffusers/diffusers-pytorch-cuda
       options: --shm-size "16gb" --ipc host -v /mnt/hf_cache:/mnt/cache/ --gpus 0
     defaults:
       run:
@@ -135,7 +135,7 @@ jobs:
       run: |
         python -m pytest -n 1 --max-worker-restart=0 --dist=loadfile \
           -s -v -k "not Flax and not Onnx" \
-          --make-reports=torch_cuda \
+          --make-reports=tests_torch_cuda \
           tests/models tests/schedulers tests/others
 
     - name: Failure short reports
@@ -179,7 +179,7 @@ jobs:
       run: |
         python -m pytest -n 0 \
           -s -v -k "Flax" \
-          --make-reports=flax_tpu \
+          --make-reports=tests_flax_tpu \
           tests/
 
     - name: Failure short reports
@@ -223,7 +223,7 @@ jobs:
       run: |
         python -m pytest -n 1 --max-worker-restart=0 --dist=loadfile \
           -s -v -k "Onnx" \
-          --make-reports=onnx_cuda \
+          --make-reports=tests_onnx_cuda \
           tests/
 
     - name: Failure short reports

From b9b51a744d1e16a786a7bd7e34e87c5137332966 Mon Sep 17 00:00:00 2001
From: Dhruv Nair <dhruv.nair@gmail.com>
Date: Fri, 22 Sep 2023 11:43:42 +0000
Subject: [PATCH 15/22] clean up

---
 .github/workflows/push_tests.yml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.github/workflows/push_tests.yml b/.github/workflows/push_tests.yml
index 319aaa187c60..426b5930a4ac 100644
--- a/.github/workflows/push_tests.yml
+++ b/.github/workflows/push_tests.yml
@@ -22,7 +22,7 @@ jobs:
   setup_torch_cuda_pipeline_matrix:
     runs-on: docker-gpu
     container:
-      image: diffusers/diffusers-pytorch-cpu
+      image: diffusers/diffusers-pytorch-cpu # this is a CPU image, but we need it to fetch the matrix
       options: --shm-size "16gb" --ipc host -v /mnt/hf_cache:/mnt/cache/
     outputs:
       pipeline_test_matrix: ${{ steps.fetch_pipeline_matrix.outputs.pipeline_test_matrix }}

From b8521609cec7adb3ab3a49b0dd129fe0c2f93efa Mon Sep 17 00:00:00 2001
From: Dhruv Nair <dhruv.nair@gmail.com>
Date: Fri, 22 Sep 2023 12:20:59 +0000
Subject: [PATCH 16/22] show test stats in failure reports

---
 .github/workflows/push_tests.yml | 26 ++++++++++++++++++--------
 1 file changed, 18 insertions(+), 8 deletions(-)

diff --git a/.github/workflows/push_tests.yml b/.github/workflows/push_tests.yml
index 426b5930a4ac..d432ee6ade27 100644
--- a/.github/workflows/push_tests.yml
+++ b/.github/workflows/push_tests.yml
@@ -23,7 +23,7 @@ jobs:
     runs-on: docker-gpu
     container:
       image: diffusers/diffusers-pytorch-cpu # this is a CPU image, but we need it to fetch the matrix
-      options: --shm-size "16gb" --ipc host -v /mnt/hf_cache:/mnt/cache/
+      options: --shm-size "16gb" --ipc host
     outputs:
       pipeline_test_matrix: ${{ steps.fetch_pipeline_matrix.outputs.pipeline_test_matrix }}
     steps:
@@ -94,7 +94,9 @@ jobs:
             tests/pipelines/${{ matrix.module }}
       - name: Failure short reports
         if: ${{ failure() }}
-        run: cat reports/tests_pipeline_${{ matrix.module }}_failures_short.txt
+        run: |
+          cat reports/tests_pipeline_${{ matrix.module }}_cuda_stats.txt
+          cat reports/tests_pipeline_${{ matrix.module }}_cuda_failures_short.txt
 
       - name: Test suite reports artifacts
         if: ${{ always() }}
@@ -140,7 +142,9 @@ jobs:
 
     - name: Failure short reports
       if: ${{ failure() }}
-      run: cat reports/tests_torch_cuda_failures_short.txt
+      run: |
+        cat reports/tests_torch_cuda_stats.txt
+        cat reports/tests_torch_cuda_failures_short.txt
 
     - name: Test suite reports artifacts
       if: ${{ always() }}
@@ -184,7 +188,9 @@ jobs:
 
     - name: Failure short reports
       if: ${{ failure() }}
-      run: cat reports/tests_flax_tpu_failures_short.txt
+      run: |
+        cat reports/tests_flax_tpu_stats.txt
+        cat reports/tests_flax_tpu_failures_short.txt
 
     - name: Test suite reports artifacts
       if: ${{ always() }}
@@ -228,7 +234,9 @@ jobs:
 
     - name: Failure short reports
       if: ${{ failure() }}
-      run: cat reports/tests_onnx_cuda_failures_short.txt
+      run: |
+        cat reports/tests_onnx_cuda_stats.txt
+        cat reports/tests_onnx_cuda_failures_short.txt
 
     - name: Test suite reports artifacts
       if: ${{ always() }}
@@ -272,11 +280,13 @@ jobs:
 
     - name: Failure short reports
       if: ${{ failure() }}
-      run: cat reports/examples_torch_cuda_failures_short.txt
+      run: |
+        cat reports/examples_torch_cuda_stats.txt
+        cat reports/examples_torch_cuda_failures_short.txt
 
     - name: Test suite reports artifacts
       if: ${{ always() }}
       uses: actions/upload-artifact@v2
       with:
-        name: examples_torch_cuda_test_reports
-        path: reports
+        name: examples_test_reports
+        path: reports
\ No newline at end of file

From 50b88f38f1069b908454a71fac989fd69464c07c Mon Sep 17 00:00:00 2001
From: Dhruv Nair <dhruv.nair@gmail.com>
Date: Fri, 22 Sep 2023 12:51:19 +0000
Subject: [PATCH 17/22] give names to jobs

---
 .github/workflows/push_tests.yml | 5 +++++
 1 file changed, 5 insertions(+)

diff --git a/.github/workflows/push_tests.yml b/.github/workflows/push_tests.yml
index d432ee6ade27..0157131867a6 100644
--- a/.github/workflows/push_tests.yml
+++ b/.github/workflows/push_tests.yml
@@ -20,6 +20,7 @@ env:
 
 jobs:
   setup_torch_cuda_pipeline_matrix:
+    name: Setup Torch Pipelines CUDA Slow Tests Matrix
     runs-on: docker-gpu
     container:
       image: diffusers/diffusers-pytorch-cpu # this is a CPU image, but we need it to fetch the matrix
@@ -56,6 +57,7 @@ jobs:
           path: reports
 
   torch_pipelines_cuda_tests:
+    name: Torch Pipelines CUDA Slow Tests
     needs: setup_torch_cuda_pipeline_matrix
     strategy:
       fail-fast: false
@@ -106,6 +108,7 @@ jobs:
           path: reports
 
   torch_cuda_tests:
+    name: Torch CUDA Tests
     runs-on: docker-gpu
     container:
       image: diffusers/diffusers-pytorch-cuda
@@ -154,6 +157,7 @@ jobs:
         path: reports
 
   flax_tpu_tests:
+    name: Flax TPU Tests
     runs-on: docker-tpu
     container:
       image: diffusers/diffusers-flax-tpu
@@ -200,6 +204,7 @@ jobs:
         path: reports
 
   onnx_cuda_tests:
+    name: ONNX CUDA Tests
     runs-on: docker-gpu
     container:
       image: diffusers/diffusers-onnxruntime-cuda

From 155a87678fdbaae067a89d1f7bed203a2b629b18 Mon Sep 17 00:00:00 2001
From: Dhruv Nair <dhruv.nair@gmail.com>
Date: Mon, 25 Sep 2023 10:52:09 +0000
Subject: [PATCH 18/22] add lora tests

---
 .github/workflows/push_tests.yml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.github/workflows/push_tests.yml b/.github/workflows/push_tests.yml
index 0157131867a6..40a5827cf653 100644
--- a/.github/workflows/push_tests.yml
+++ b/.github/workflows/push_tests.yml
@@ -141,7 +141,7 @@ jobs:
         python -m pytest -n 1 --max-worker-restart=0 --dist=loadfile \
           -s -v -k "not Flax and not Onnx" \
           --make-reports=tests_torch_cuda \
-          tests/models tests/schedulers tests/others
+          tests/models tests/schedulers tests/lora tests/others
 
     - name: Failure short reports
       if: ${{ failure() }}

From 2bff53848767cc2381971df330993f36f788adf6 Mon Sep 17 00:00:00 2001
From: Dhruv Nair <dhruv.nair@gmail.com>
Date: Tue, 26 Sep 2023 10:27:16 +0000
Subject: [PATCH 19/22] split torch cuda tests and add compile tests

---
 .github/workflows/push_tests.yml | 45 +++++++++++++++++++++++++++++++-
 1 file changed, 44 insertions(+), 1 deletion(-)

diff --git a/.github/workflows/push_tests.yml b/.github/workflows/push_tests.yml
index 40a5827cf653..0c277c32fa38 100644
--- a/.github/workflows/push_tests.yml
+++ b/.github/workflows/push_tests.yml
@@ -116,6 +116,9 @@ jobs:
     defaults:
       run:
         shell: bash
+    strategy:
+      matrix:
+        module: [models, schedulers, lora, others]
     steps:
     - name: Checkout diffusers
       uses: actions/checkout@v3
@@ -141,7 +144,7 @@ jobs:
         python -m pytest -n 1 --max-worker-restart=0 --dist=loadfile \
           -s -v -k "not Flax and not Onnx" \
           --make-reports=tests_torch_cuda \
-          tests/models tests/schedulers tests/lora tests/others
+          tests/${{ matrix.module }}
 
     - name: Failure short reports
       if: ${{ failure() }}
@@ -250,6 +253,46 @@ jobs:
         name: onnx_cuda_test_reports
         path: reports
 
+  run_torch_compile_tests:
+    name: PyTorch Compile CUDA tests
+
+    runs-on: docker-gpu
+
+    container:
+      image: diffusers/diffusers-pytorch-compile-cuda
+      options: --gpus 0 --shm-size "16gb" --ipc host -v /mnt/hf_cache:/mnt/cache/
+
+    steps:
+    - name: Checkout diffusers
+      uses: actions/checkout@v3
+      with:
+        fetch-depth: 2
+
+    - name: NVIDIA-SMI
+      run: |
+        nvidia-smi
+    - name: Install dependencies
+      run: |
+        python -m pip install -e .[quality,test,training]
+    - name: Environment
+      run: |
+        python utils/print_env.py
+    - name: Run example tests on GPU
+      env:
+        HUGGING_FACE_HUB_TOKEN: ${{ secrets.HUGGING_FACE_HUB_TOKEN }}
+      run: |
+        python -m pytest -n 1 --max-worker-restart=0 --dist=loadfile -s -v -k "compile" --make-reports=tests_torch_compile_cuda tests/
+    - name: Failure short reports
+      if: ${{ failure() }}
+      run: cat reports/tests_torch_compile_cuda_failures_short.txt
+
+    - name: Test suite reports artifacts
+      if: ${{ always() }}
+      uses: actions/upload-artifact@v2
+      with:
+        name: torch_compile_test_reports
+        path: reports
+
   run_examples_tests:
     name: Examples PyTorch CUDA tests on Ubuntu
 

From fabeeaaba36a401ca21c962e5a0b2dfbeff5327a Mon Sep 17 00:00:00 2001
From: Dhruv Nair <dhruv.nair@gmail.com>
Date: Thu, 28 Sep 2023 13:32:19 +0000
Subject: [PATCH 20/22] clean up

---
 .github/workflows/push_tests.yml | 44 --------------------------------
 1 file changed, 44 deletions(-)

diff --git a/.github/workflows/push_tests.yml b/.github/workflows/push_tests.yml
index 86b1c422b115..0c277c32fa38 100644
--- a/.github/workflows/push_tests.yml
+++ b/.github/workflows/push_tests.yml
@@ -293,50 +293,6 @@ jobs:
         name: torch_compile_test_reports
         path: reports
 
-  run_torch_compile_tests:
-    name: PyTorch Compile CUDA tests
-
-    runs-on: docker-gpu
-
-    container:
-      image: diffusers/diffusers-pytorch-compile-cuda
-      options: --gpus 0 --shm-size "16gb" --ipc host -v /mnt/hf_cache:/mnt/cache/
-
-    steps:
-    - name: Checkout diffusers
-      uses: actions/checkout@v3
-      with:
-        fetch-depth: 2
-
-    - name: NVIDIA-SMI
-      run: |
-        nvidia-smi
-
-    - name: Install dependencies
-      run: |
-        python -m pip install -e .[quality,test,training]
-
-    - name: Environment
-      run: |
-        python utils/print_env.py
-
-    - name: Run example tests on GPU
-      env:
-        HUGGING_FACE_HUB_TOKEN: ${{ secrets.HUGGING_FACE_HUB_TOKEN }}
-      run: |
-        python -m pytest -n 1 --max-worker-restart=0 --dist=loadfile -s -v -k "compile" --make-reports=tests_torch_compile_cuda tests/
-
-    - name: Failure short reports
-      if: ${{ failure() }}
-      run: cat reports/tests_torch_compile_cuda_failures_short.txt
-
-    - name: Test suite reports artifacts
-      if: ${{ always() }}
-      uses: actions/upload-artifact@v2
-      with:
-        name: torch_compile_test_reports
-        path: reports
-
   run_examples_tests:
     name: Examples PyTorch CUDA tests on Ubuntu
 

From d18455faa43e67168e431ca1fc8edaba9fa323aa Mon Sep 17 00:00:00 2001
From: Dhruv Nair <dhruv.nair@gmail.com>
Date: Fri, 29 Sep 2023 09:24:11 +0000
Subject: [PATCH 21/22] fix tests

---
 .../controlnet/pipeline_controlnet_blip_diffusion.py      | 4 ++--
 .../stable_diffusion/pipeline_stable_diffusion_upscale.py | 3 ++-
 tests/lora/test_lora_layers_old_backend.py                | 2 +-
 tests/pipelines/controlnet/test_controlnet_inpaint.py     | 3 ++-
 tests/pipelines/kandinsky2_2/test_kandinsky_controlnet.py | 3 +++
 tests/pipelines/test_pipelines_flax.py                    | 8 ++++----
 6 files changed, 14 insertions(+), 9 deletions(-)

diff --git a/src/diffusers/pipelines/controlnet/pipeline_controlnet_blip_diffusion.py b/src/diffusers/pipelines/controlnet/pipeline_controlnet_blip_diffusion.py
index e10a8624f068..58f003960e99 100644
--- a/src/diffusers/pipelines/controlnet/pipeline_controlnet_blip_diffusion.py
+++ b/src/diffusers/pipelines/controlnet/pipeline_controlnet_blip_diffusion.py
@@ -213,7 +213,7 @@ def prepare_control_image(
             do_center_crop=False,
             do_normalize=False,
             return_tensors="pt",
-        )["pixel_values"].to(self.device)
+        )["pixel_values"].to(device)
         image_batch_size = image.shape[0]
 
         if image_batch_size == 1:
@@ -365,7 +365,7 @@ def __call__(
             height=height,
             batch_size=batch_size,
             num_images_per_prompt=1,
-            device=self.device,
+            device=device,
             dtype=self.controlnet.dtype,
             do_classifier_free_guidance=do_classifier_free_guidance,
         )
diff --git a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_upscale.py b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_upscale.py
index f333de74990d..d791da2ea3bc 100644
--- a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_upscale.py
+++ b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_upscale.py
@@ -757,8 +757,9 @@ def __call__(
 
             if needs_upcasting:
                 self.upcast_vae()
-                latents = latents.to(next(iter(self.vae.post_quant_conv.parameters())).dtype)
 
+            # Ensure latents always have the same dtype as the VAE before decoding
+            latents = latents.to(next(iter(self.vae.post_quant_conv.parameters())).dtype)
             image = self.vae.decode(latents / self.vae.config.scaling_factor, return_dict=False)[0]
 
             # cast back to fp16 if needed
diff --git a/tests/lora/test_lora_layers_old_backend.py b/tests/lora/test_lora_layers_old_backend.py
index ae90f8b6a4b8..d616ef8c78b8 100644
--- a/tests/lora/test_lora_layers_old_backend.py
+++ b/tests/lora/test_lora_layers_old_backend.py
@@ -1554,7 +1554,7 @@ def test_lora_on_off(self, expected_max_diff=1e-3):
         torch_device != "cuda" or not is_xformers_available(),
         reason="XFormers attention is only available with CUDA and `xformers` installed",
     )
-    def test_lora_xformers_on_off(self, expected_max_diff=1e-4):
+    def test_lora_xformers_on_off(self, expected_max_diff=6e-4):
         # enable deterministic behavior for gradient checkpointing
         init_dict, inputs_dict = self.prepare_init_args_and_inputs_for_common()
 
diff --git a/tests/pipelines/controlnet/test_controlnet_inpaint.py b/tests/pipelines/controlnet/test_controlnet_inpaint.py
index 1ec1f493b9f0..a9140f3d5a31 100644
--- a/tests/pipelines/controlnet/test_controlnet_inpaint.py
+++ b/tests/pipelines/controlnet/test_controlnet_inpaint.py
@@ -39,6 +39,7 @@
     enable_full_determinism,
     floats_tensor,
     load_numpy,
+    numpy_cosine_similarity_distance,
     require_torch_gpu,
     slow,
     torch_device,
@@ -550,7 +551,7 @@ def make_inpaint_condition(image, image_mask):
             "https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main/sd_controlnet/boy_ray_ban.npy"
         )
 
-        assert np.abs(expected_image - image).max() < 0.9e-1
+        assert numpy_cosine_similarity_distance(expected_image.flatten(), image.flatten()) < 1e-2
 
     def test_load_local(self):
         controlnet = ControlNetModel.from_pretrained("lllyasviel/control_v11p_sd15_canny")
diff --git a/tests/pipelines/kandinsky2_2/test_kandinsky_controlnet.py b/tests/pipelines/kandinsky2_2/test_kandinsky_controlnet.py
index cec209c7cfec..74a912faa33f 100644
--- a/tests/pipelines/kandinsky2_2/test_kandinsky_controlnet.py
+++ b/tests/pipelines/kandinsky2_2/test_kandinsky_controlnet.py
@@ -221,6 +221,9 @@ def test_kandinsky_controlnet(self):
     def test_float16_inference(self):
         super().test_float16_inference(expected_max_diff=1e-1)
 
+    def test_inference_batch_single_identical(self):
+        super().test_inference_batch_single_identical(expected_max_diff=5e-4)
+
 
 @nightly
 @require_torch_gpu
diff --git a/tests/pipelines/test_pipelines_flax.py b/tests/pipelines/test_pipelines_flax.py
index 294dad5ff0f1..fa2283d7a6b9 100644
--- a/tests/pipelines/test_pipelines_flax.py
+++ b/tests/pipelines/test_pipelines_flax.py
@@ -110,7 +110,7 @@ def test_stable_diffusion_v1_4(self):
 
         assert images.shape == (num_samples, 1, 512, 512, 3)
         if jax.device_count() == 8:
-            assert np.abs((np.abs(images[0, 0, :2, :2, -2:], dtype=np.float32).sum() - 0.05652401)) < 1e-3
+            assert np.abs((np.abs(images[0, 0, :2, :2, -2:], dtype=np.float32).sum() - 0.05652401)) < 1e-2
             assert np.abs((np.abs(images, dtype=np.float32).sum() - 2383808.2)) < 5e-1
 
     def test_stable_diffusion_v1_4_bfloat_16(self):
@@ -139,7 +139,7 @@ def test_stable_diffusion_v1_4_bfloat_16(self):
 
         assert images.shape == (num_samples, 1, 512, 512, 3)
         if jax.device_count() == 8:
-            assert np.abs((np.abs(images[0, 0, :2, :2, -2:], dtype=np.float32).sum() - 0.04003906)) < 1e-3
+            assert np.abs((np.abs(images[0, 0, :2, :2, -2:], dtype=np.float32).sum() - 0.04003906)) < 5e-2
             assert np.abs((np.abs(images, dtype=np.float32).sum() - 2373516.75)) < 5e-1
 
     def test_stable_diffusion_v1_4_bfloat_16_with_safety(self):
@@ -168,7 +168,7 @@ def test_stable_diffusion_v1_4_bfloat_16_with_safety(self):
 
         assert images.shape == (num_samples, 1, 512, 512, 3)
         if jax.device_count() == 8:
-            assert np.abs((np.abs(images[0, 0, :2, :2, -2:], dtype=np.float32).sum() - 0.04003906)) < 1e-3
+            assert np.abs((np.abs(images[0, 0, :2, :2, -2:], dtype=np.float32).sum() - 0.04003906)) < 5e-2
             assert np.abs((np.abs(images, dtype=np.float32).sum() - 2373516.75)) < 5e-1
 
     def test_stable_diffusion_v1_4_bfloat_16_ddim(self):
@@ -212,7 +212,7 @@ def test_stable_diffusion_v1_4_bfloat_16_ddim(self):
 
         assert images.shape == (num_samples, 1, 512, 512, 3)
         if jax.device_count() == 8:
-            assert np.abs((np.abs(images[0, 0, :2, :2, -2:], dtype=np.float32).sum() - 0.045043945)) < 1e-3
+            assert np.abs((np.abs(images[0, 0, :2, :2, -2:], dtype=np.float32).sum() - 0.045043945)) < 5e-2
             assert np.abs((np.abs(images, dtype=np.float32).sum() - 2347693.5)) < 5e-1
 
     def test_jax_memory_efficient_attention(self):

From a9a5f0e2873df61f26bb244cd6460eb273efa211 Mon Sep 17 00:00:00 2001
From: Dhruv Nair <dhruv.nair@gmail.com>
Date: Mon, 2 Oct 2023 18:13:15 +0200
Subject: [PATCH 22/22] change push to run only on main

---
 .github/workflows/push_tests.yml | 5 +----
 1 file changed, 1 insertion(+), 4 deletions(-)

diff --git a/.github/workflows/push_tests.yml b/.github/workflows/push_tests.yml
index 0c277c32fa38..a15a5412c4e4 100644
--- a/.github/workflows/push_tests.yml
+++ b/.github/workflows/push_tests.yml
@@ -1,12 +1,9 @@
 name: Slow Tests on main
 
 on:
-  pull_request:
-    branches:
-      - main
   push:
     branches:
-      - ci-*
+      - main
 
 
 env: