From c40e2864fe7c63a105c5d105094c47cbe87483fb Mon Sep 17 00:00:00 2001 From: Shahmatov Arseniy <62886550+cene555@users.noreply.github.com> Date: Thu, 29 Jun 2023 23:20:31 +0300 Subject: [PATCH 01/51] Kandinsky2_2 --- src/diffusers/models/embeddings.py | 61 ++ src/diffusers/models/unet_2d_condition.py | 50 +- .../pipelines/kandinsky2_2/__init__.py | 8 + .../pipeline_kandinsky2_2_decoder.py | 320 +++++++++++ ...ipeline_kandinsky2_2_decoder_controlnet.py | 308 ++++++++++ ...kandinsky2_2_decoder_controlnet_img2img.py | 385 +++++++++++++ .../pipeline_kandinsky2_2_decoder_img2img.py | 382 ++++++++++++ ...ipeline_kandinsky2_2_decoder_inpainting.py | 487 ++++++++++++++++ .../pipeline_kandinsky2_2_prior.py | 542 ++++++++++++++++++ .../pipeline_kandinsky2_2_prior_emb2emb.py | 496 ++++++++++++++++ 10 files changed, 3036 insertions(+), 3 deletions(-) create mode 100644 src/diffusers/pipelines/kandinsky2_2/__init__.py create mode 100644 src/diffusers/pipelines/kandinsky2_2/pipeline_kandinsky2_2_decoder.py create mode 100644 src/diffusers/pipelines/kandinsky2_2/pipeline_kandinsky2_2_decoder_controlnet.py create mode 100644 src/diffusers/pipelines/kandinsky2_2/pipeline_kandinsky2_2_decoder_controlnet_img2img.py create mode 100644 src/diffusers/pipelines/kandinsky2_2/pipeline_kandinsky2_2_decoder_img2img.py create mode 100644 src/diffusers/pipelines/kandinsky2_2/pipeline_kandinsky2_2_decoder_inpainting.py create mode 100644 src/diffusers/pipelines/kandinsky2_2/pipeline_kandinsky2_2_prior.py create mode 100644 src/diffusers/pipelines/kandinsky2_2/pipeline_kandinsky2_2_prior_emb2emb.py diff --git a/src/diffusers/models/embeddings.py b/src/diffusers/models/embeddings.py index 4dd16f0dd5ff..348f37d9190f 100644 --- a/src/diffusers/models/embeddings.py +++ b/src/diffusers/models/embeddings.py @@ -375,6 +375,27 @@ def forward(self, text_embeds: torch.FloatTensor, image_embeds: torch.FloatTenso return torch.cat([image_text_embeds, text_embeds], dim=1) +class ImageProjection(nn.Module): + def __init__( + self, + image_embed_dim: int = 768, + cross_attention_dim: int = 768, + num_image_text_embeds: int = 32, + ): + super().__init__() + + self.num_image_text_embeds = num_image_text_embeds + self.image_embeds = nn.Linear(image_embed_dim, self.num_image_text_embeds * cross_attention_dim) + self.norm = nn.LayerNorm(cross_attention_dim) + def forward(self, image_embeds: torch.FloatTensor): + batch_size = image_embeds.shape[0] + + # image + image_embeds = self.image_embeds(image_embeds) + image_embeds = image_embeds.reshape(batch_size, self.num_image_text_embeds, -1) + image_embeds = self.norm(image_embeds) + return image_embeds + class CombinedTimestepLabelEmbeddings(nn.Module): def __init__(self, num_classes, embedding_dim, class_dropout_prob=0.1): @@ -428,6 +449,46 @@ def forward(self, text_embeds: torch.FloatTensor, image_embeds: torch.FloatTenso return time_image_embeds + time_text_embeds +class ImageTimeEmbedding(nn.Module): + def __init__(self, image_embed_dim: int = 768, time_embed_dim: int = 1536): + super().__init__() + self.image_proj = nn.Linear(image_embed_dim, time_embed_dim) + self.image_norm = nn.LayerNorm(time_embed_dim) + + def forward(self,image_embeds: torch.FloatTensor): + # image + time_image_embeds = self.image_proj(image_embeds) + time_image_embeds = self.image_norm(time_image_embeds) + return time_image_embeds + +class ImageHintTimeEmbedding(nn.Module): + def __init__(self, image_embed_dim: int = 768, time_embed_dim: int = 1536): + super().__init__() + self.image_proj = nn.Linear(image_embed_dim, 
time_embed_dim) + self.image_norm = nn.LayerNorm(time_embed_dim) + self.input_hint_block = nn.Sequential( + nn.Conv2d(3, 16, 3, padding=1), + nn.SiLU(), + nn.Conv2d(16, 16, 3, padding=1), + nn.SiLU(), + nn.Conv2d(16, 32, 3, padding=1, stride=2), + nn.SiLU(), + nn.Conv2d(32, 32, 3, padding=1), + nn.SiLU(), + nn.Conv2d(32, 96, 3, padding=1, stride=2), + nn.SiLU(), + nn.Conv2d(96, 96, 3, padding=1), + nn.SiLU(), + nn.Conv2d(96, 256, 3, padding=1, stride=2), + nn.SiLU(), + nn.Conv2d(256, 4, 3, padding=1) + ) + def forward(self,image_embeds: torch.FloatTensor, hint: torch.FloatTensor): + # image + time_image_embeds = self.image_proj(image_embeds) + time_image_embeds = self.image_norm(time_image_embeds) + hint = self.input_hint_block(hint) + return time_image_embeds, hint class AttentionPooling(nn.Module): # Copied from https://github.com/deep-floyd/IF/blob/2f91391f27dd3c468bf174be5805b4cc92980c0b/deepfloyd_if/model/nn.py#L54 diff --git a/src/diffusers/models/unet_2d_condition.py b/src/diffusers/models/unet_2d_condition.py index 7bca5c336c57..fce6307d9be9 100644 --- a/src/diffusers/models/unet_2d_condition.py +++ b/src/diffusers/models/unet_2d_condition.py @@ -26,7 +26,10 @@ from .embeddings import ( GaussianFourierProjection, TextImageProjection, + ImageProjection, TextImageTimeEmbedding, + ImageTimeEmbedding, + ImageHintTimeEmbedding, TextTimeEmbedding, TimestepEmbedding, Timesteps, @@ -296,7 +299,12 @@ def __init__( image_embed_dim=cross_attention_dim, cross_attention_dim=cross_attention_dim, ) - + elif encoder_hid_dim_type == "image_proj": + # Kadinsky 2.2 + self.encoder_hid_proj = ImageProjection( + image_embed_dim=encoder_hid_dim, + cross_attention_dim=cross_attention_dim, + ) elif encoder_hid_dim_type is not None: raise ValueError( f"encoder_hid_dim_type: {encoder_hid_dim_type} must be None, 'text_proj' or 'text_image_proj'." 
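For orientation, the new `ImageProjection` module added to `embeddings.py` above is what lets the Kandinsky 2.2 decoder run without its own text encoder: one CLIP image embedding per sample is expanded into a short sequence of tokens that the UNet cross-attention consumes in place of text-encoder hidden states. Below is a minimal, self-contained sketch of that shape transformation; the 768-dimensional embedding and 32 tokens are just the defaults from the class above, not values read off a released checkpoint.

```py
import torch
from torch import nn


class ImageProjectionSketch(nn.Module):
    # Standalone re-implementation of the ImageProjection module above,
    # kept only to illustrate the tensor shapes involved.
    def __init__(self, image_embed_dim=768, cross_attention_dim=768, num_image_text_embeds=32):
        super().__init__()
        self.num_image_text_embeds = num_image_text_embeds
        self.image_embeds = nn.Linear(image_embed_dim, num_image_text_embeds * cross_attention_dim)
        self.norm = nn.LayerNorm(cross_attention_dim)

    def forward(self, image_embeds):
        batch_size = image_embeds.shape[0]
        # (batch, image_embed_dim) -> (batch, num_image_text_embeds, cross_attention_dim)
        image_embeds = self.image_embeds(image_embeds)
        image_embeds = image_embeds.reshape(batch_size, self.num_image_text_embeds, -1)
        return self.norm(image_embeds)


proj = ImageProjectionSketch()
tokens = proj(torch.randn(2, 768))
print(tokens.shape)  # torch.Size([2, 32, 768])
```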
@@ -349,6 +357,16 @@ def __init__(
             self.add_embedding = TextImageTimeEmbedding(
                 text_embed_dim=cross_attention_dim, image_embed_dim=cross_attention_dim, time_embed_dim=time_embed_dim
             )
+        elif addition_embed_type == "image":
+            # Kandinsky 2.2
+            self.add_embedding = ImageTimeEmbedding(
+                image_embed_dim=encoder_hid_dim, time_embed_dim=time_embed_dim
+            )
+        elif addition_embed_type == "image_hint":
+            # Kandinsky 2.2
+            self.add_embedding = ImageHintTimeEmbedding(
+                image_embed_dim=encoder_hid_dim, time_embed_dim=time_embed_dim
+            )
         elif addition_embed_type is not None:
             raise ValueError(f"addition_embed_type: {addition_embed_type} must be None, 'text' or 'text_image'.")
 
@@ -789,10 +807,29 @@ def forward(
             image_embs = added_cond_kwargs.get("image_embeds")
             text_embs = added_cond_kwargs.get("text_embeds", encoder_hidden_states)
-
             aug_emb = self.add_embedding(text_embs, image_embs)
             emb = emb + aug_emb
+        elif self.config.addition_embed_type == "image":
+            # Kandinsky 2.2 - style
+            if "image_embeds" not in added_cond_kwargs:
+                raise ValueError(
+                    f"{self.__class__} has the config param `addition_embed_type` set to 'image' which requires the keyword argument `image_embeds` to be passed in `added_cond_kwargs`"
+                )
+            image_embs = added_cond_kwargs.get("image_embeds")
+            aug_emb = self.add_embedding(image_embs)
+            emb = emb + aug_emb
+        elif self.config.addition_embed_type == "image_hint":
+            # Kandinsky 2.2 - style
+            if "image_embeds" not in added_cond_kwargs or "hint" not in added_cond_kwargs:
+                raise ValueError(
+                    f"{self.__class__} has the config param `addition_embed_type` set to 'image_hint' which requires the keyword arguments `image_embeds` and `hint` to be passed in `added_cond_kwargs`"
+                )
+            image_embs = added_cond_kwargs.get("image_embeds")
+            hint = added_cond_kwargs.get("hint")
+            aug_emb, hint = self.add_embedding(image_embs, hint)
+            emb = emb + aug_emb
+            sample = torch.cat([sample, hint], dim=1)
 
         if self.time_embed_act is not None:
             emb = self.time_embed_act(emb)
 
@@ -807,7 +844,14 @@ def forward(
             image_embeds = added_cond_kwargs.get("image_embeds")
             encoder_hidden_states = self.encoder_hid_proj(encoder_hidden_states, image_embeds)
-
+        elif self.encoder_hid_proj is not None and self.config.encoder_hid_dim_type == "image_proj":
+            # Kandinsky 2.2 - style
+            if "image_embeds" not in added_cond_kwargs:
+                raise ValueError(
+                    f"{self.__class__} has the config param `encoder_hid_dim_type` set to 'image_proj' which requires the keyword argument `image_embeds` to be passed in `added_cond_kwargs`"
+                )
+            image_embeds = added_cond_kwargs.get("image_embeds")
+            encoder_hidden_states = self.encoder_hid_proj(image_embeds)
         # 2.
pre-process sample = self.conv_in(sample) diff --git a/src/diffusers/pipelines/kandinsky2_2/__init__.py b/src/diffusers/pipelines/kandinsky2_2/__init__.py new file mode 100644 index 000000000000..3874862b3ecd --- /dev/null +++ b/src/diffusers/pipelines/kandinsky2_2/__init__.py @@ -0,0 +1,8 @@ +from .pipeline_kandinsky2_2_decoder_controlnet_img2img import Kandinsky2_2_DecoderControlnetImg2ImgPipeline +from .pipeline_kandinsky2_2_decoder_controlnet import Kandinsky2_2_DecoderControlnetPipeline +from .pipeline_kandinsky2_2_decoder_img2img import Kandinsky2_2_DecoderImg2ImgPipeline +from .pipeline_kandinsky2_2_decoder_inpainting import Kandinsky2_2_DecoderInpaintPipeline +from .pipeline_kandinsky2_2_decoder import Kandinsky2_2_DecoderPipeline +from .pipeline_kandinsky2_2_prior_emb2emb import Kandinsky2_2PriorEmb2EmbPipeline +from .pipeline_kandinsky2_2_prior import Kandinsky2_2PriorPipeline + diff --git a/src/diffusers/pipelines/kandinsky2_2/pipeline_kandinsky2_2_decoder.py b/src/diffusers/pipelines/kandinsky2_2/pipeline_kandinsky2_2_decoder.py new file mode 100644 index 000000000000..08dffdb4e51e --- /dev/null +++ b/src/diffusers/pipelines/kandinsky2_2/pipeline_kandinsky2_2_decoder.py @@ -0,0 +1,320 @@ +# Copyright 2023 The HuggingFace Team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from typing import List, Optional, Union + +import torch + +from ...models import UNet2DConditionModel, VQModel +from ...pipelines import DiffusionPipeline +from ...pipelines.pipeline_utils import ImagePipelineOutput +from ...schedulers import DDPMScheduler +from ...utils import ( + is_accelerate_available, + is_accelerate_version, + logging, + randn_tensor, + replace_example_docstring, +) + + +logger = logging.get_logger(__name__) # pylint: disable=invalid-name + +EXAMPLE_DOC_STRING = """ + Examples: + ```py + >>> from diffusers import Kandinsky2_2_DecoderPipeline, Kandinsky2_2PriorPipeline + >>> import torch + >>> pipe_prior = Kandinsky2_2PriorPipeline.from_pretrained("kandinsky-community/kandinsky-2-2-prior") + >>> pipe_prior.to("cuda") + >>> prompt = "red cat, 4k photo" + >>> out = pipe_prior(prompt) + >>> image_emb = out.image_embeds + >>> zero_image_emb = out.negative_image_embeds + >>> pipe = Kandinsky2_2_DecoderPipeline.from_pretrained("kandinsky-community/kandinsky-2-2-decoder") + >>> pipe.to("cuda") + >>> image = pipe( + ... image_embeds=image_emb, + ... negative_image_embeds=zero_image_emb, + ... height=768, + ... width=768, + ... num_inference_steps=50, + ... ).images + >>> image[0].save("cat.png") + ``` +""" + + +def get_new_h_w(h, w, scale_factor=8): + new_h = h // scale_factor**2 + if h % scale_factor**2 != 0: + new_h += 1 + new_w = w // scale_factor**2 + if w % scale_factor**2 != 0: + new_w += 1 + return new_h * scale_factor, new_w * scale_factor + + +class Kandinsky2_2_DecoderPipeline(DiffusionPipeline): + """ + Pipeline for text-to-image generation using Kandinsky + This model inherits from [`DiffusionPipeline`]. 
Check the superclass documentation for the generic methods the + library implements for all the pipelines (such as downloading or saving, running on a particular device, etc.) + Args: + scheduler ([`DDPMScheduler`]): + A scheduler to be used in combination with `unet` to generate image latents. + unet ([`UNet2DConditionModel`]): + Conditional U-Net architecture to denoise the image embedding. + vae ([`VQModel`]): + MoVQ Decoder to generate the image from the latents. + """ + + def __init__( + self, + unet: UNet2DConditionModel, + scheduler: DDPMScheduler, + vae: VQModel, + ): + super().__init__() + + self.register_modules( + unet=unet, + scheduler=scheduler, + vae=vae, + ) + self.vae_scale_factor = 2 ** (len(self.vae.config.block_out_channels) - 1) + + def prepare_latents(self, shape, dtype, device, generator, latents, scheduler): + if latents is None: + latents = randn_tensor(shape, generator=generator, device=device, dtype=dtype) + else: + if latents.shape != shape: + raise ValueError(f"Unexpected latents shape, got {latents.shape}, expected {shape}") + latents = latents.to(device) + + latents = latents * scheduler.init_noise_sigma + return latents + + + def enable_sequential_cpu_offload(self, gpu_id=0): + r""" + Offloads all models to CPU using accelerate, significantly reducing memory usage. When called, the pipeline's + models have their state dicts saved to CPU and then are moved to a `torch.device('meta') and loaded to GPU only + when their specific submodule has its `forward` method called. + """ + if is_accelerate_available(): + from accelerate import cpu_offload + else: + raise ImportError("Please install accelerate via `pip install accelerate`") + + device = torch.device(f"cuda:{gpu_id}") + + models = [ + self.unet, + self.vae, + ] + for cpu_offloaded_model in models: + if cpu_offloaded_model is not None: + cpu_offload(cpu_offloaded_model, device) + + def enable_model_cpu_offload(self, gpu_id=0): + r""" + Offloads all models to CPU using accelerate, reducing memory usage with a low impact on performance. Compared + to `enable_sequential_cpu_offload`, this method moves one whole model at a time to the GPU when its `forward` + method is called, and the model remains in GPU until the next model runs. Memory savings are lower than with + `enable_sequential_cpu_offload`, but performance is much better due to the iterative execution of the `unet`. + """ + if is_accelerate_available() and is_accelerate_version(">=", "0.17.0.dev0"): + from accelerate import cpu_offload_with_hook + else: + raise ImportError("`enable_model_cpu_offload` requires `accelerate v0.17.0` or higher.") + + device = torch.device(f"cuda:{gpu_id}") + + if self.device.type != "cpu": + self.to("cpu", silence_dtype_warnings=True) + torch.cuda.empty_cache() # otherwise we don't see the memory savings (but they probably exist) + + hook = None + for cpu_offloaded_model in [self.unet, self.vae]: + _, hook = cpu_offload_with_hook(cpu_offloaded_model, device, prev_module_hook=hook) + + if self.safety_checker is not None: + _, hook = cpu_offload_with_hook(self.safety_checker, device, prev_module_hook=hook) + + # We'll offload the last model manually. + self.final_offload_hook = hook + + @property + # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline._execution_device + def _execution_device(self): + r""" + Returns the device on which the pipeline's models will be executed. 
After calling + `pipeline.enable_sequential_cpu_offload()` the execution device can only be inferred from Accelerate's module + hooks. + """ + if not hasattr(self.unet, "_hf_hook"): + return self.device + for module in self.unet.modules(): + if ( + hasattr(module, "_hf_hook") + and hasattr(module._hf_hook, "execution_device") + and module._hf_hook.execution_device is not None + ): + return torch.device(module._hf_hook.execution_device) + return self.device + + @torch.no_grad() + @replace_example_docstring(EXAMPLE_DOC_STRING) + def __call__( + self, + image_embeds: Union[torch.FloatTensor, List[torch.FloatTensor]], + negative_image_embeds: Union[torch.FloatTensor, List[torch.FloatTensor]], + height: int = 512, + width: int = 512, + num_inference_steps: int = 100, + guidance_scale: float = 4.0, + num_images_per_prompt: int = 1, + generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None, + latents: Optional[torch.FloatTensor] = None, + output_type: Optional[str] = "pil", + return_dict: bool = True, + ): + """ + Function invoked when calling the pipeline for generation. + Args: + image_embeds (`torch.FloatTensor` or `List[torch.FloatTensor]`): + The clip image embeddings for text prompt, that will be used to condition the image generation. + negative_image_embeds (`torch.FloatTensor` or `List[torch.FloatTensor]`): + The clip image embeddings for negative text prompt, will be used to condition the image generation. + height (`int`, *optional*, defaults to 512): + The height in pixels of the generated image. + width (`int`, *optional*, defaults to 512): + The width in pixels of the generated image. + num_inference_steps (`int`, *optional*, defaults to 100): + The number of denoising steps. More denoising steps usually lead to a higher quality image at the + expense of slower inference. + guidance_scale (`float`, *optional*, defaults to 4.0): + Guidance scale as defined in [Classifier-Free Diffusion Guidance](https://arxiv.org/abs/2207.12598). + `guidance_scale` is defined as `w` of equation 2. of [Imagen + Paper](https://arxiv.org/pdf/2205.11487.pdf). Guidance scale is enabled by setting `guidance_scale > + 1`. Higher guidance scale encourages to generate images that are closely linked to the text `prompt`, + usually at the expense of lower image quality. + num_images_per_prompt (`int`, *optional*, defaults to 1): + The number of images to generate per prompt. + generator (`torch.Generator` or `List[torch.Generator]`, *optional*): + One or a list of [torch generator(s)](https://pytorch.org/docs/stable/generated/torch.Generator.html) + to make generation deterministic. + latents (`torch.FloatTensor`, *optional*): + Pre-generated noisy latents, sampled from a Gaussian distribution, to be used as inputs for image + generation. Can be used to tweak the same generation with different prompts. If not provided, a latents + tensor will ge generated by sampling using the supplied random `generator`. + output_type (`str`, *optional*, defaults to `"pil"`): + The output format of the generate image. Choose between: `"pil"` (`PIL.Image.Image`), `"np"` + (`np.array`) or `"pt"` (`torch.Tensor`). + return_dict (`bool`, *optional*, defaults to `True`): + Whether or not to return a [`~pipelines.ImagePipelineOutput`] instead of a plain tuple. 
+ Examples: + Returns: + [`~pipelines.ImagePipelineOutput`] or `tuple` + """ + device = self._execution_device + + do_classifier_free_guidance = guidance_scale > 1.0 + + if isinstance(image_embeds, list): + image_embeds = torch.cat(image_embeds, dim=0) + batch_size = image_embeds.shape[0] + if isinstance(negative_image_embeds, list): + negative_image_embeds = torch.cat(negative_image_embeds, dim=0) + + if do_classifier_free_guidance: + image_embeds = image_embeds.repeat_interleave(num_images_per_prompt, dim=0) + negative_image_embeds = negative_image_embeds.repeat_interleave(num_images_per_prompt, dim=0) + + image_embeds = torch.cat([negative_image_embeds, image_embeds], dim=0).to( + dtype=self.unet.dtype, device=device + ) + + self.scheduler.set_timesteps(num_inference_steps, device=device) + timesteps_tensor = self.scheduler.timesteps + + num_channels_latents = self.unet.config.in_channels + + height, width = get_new_h_w(height, width, self.vae_scale_factor) + + # create initial latent + latents = self.prepare_latents( + (batch_size, num_channels_latents, height, width), + image_embeds.dtype, + device, + generator, + latents, + self.scheduler, + ) + + for i, t in enumerate(self.progress_bar(timesteps_tensor)): + # expand the latents if we are doing classifier free guidance + latent_model_input = torch.cat([latents] * 2) if do_classifier_free_guidance else latents + + added_cond_kwargs = {"image_embeds": image_embeds} + noise_pred = self.unet( + sample=latent_model_input, + timestep=t, + encoder_hidden_states=None, + added_cond_kwargs=added_cond_kwargs, + return_dict=False, + )[0] + + if do_classifier_free_guidance: + noise_pred, variance_pred = noise_pred.split(latents.shape[1], dim=1) + noise_pred_uncond, noise_pred_text = noise_pred.chunk(2) + _, variance_pred_text = variance_pred.chunk(2) + noise_pred = noise_pred_uncond + guidance_scale * (noise_pred_text - noise_pred_uncond) + noise_pred = torch.cat([noise_pred, variance_pred_text], dim=1) + + if not ( + hasattr(self.scheduler.config, "variance_type") + and self.scheduler.config.variance_type in ["learned", "learned_range"] + ): + noise_pred, _ = noise_pred.split(latents.shape[1], dim=1) + + # compute the previous noisy sample x_t -> x_t-1 + latents = self.scheduler.step( + noise_pred, + t, + latents, + # YiYi notes: only reason this pipeline can't work with unclip scheduler is that can't pass down this argument + # need to use DDPM scheduler instead + # prev_timestep=prev_timestep, + generator=generator, + )[0] + # post-processing + image = self.vae.decode(latents, force_not_quantize=True)["sample"] + + if output_type not in ["pt", "np", "pil"]: + raise ValueError(f"Only the output types `pt`, `pil` and `np` are supported not output_type={output_type}") + + if output_type in ["np", "pil"]: + image = image * 0.5 + 0.5 + image = image.clamp(0, 1) + image = image.cpu().permute(0, 2, 3, 1).float().numpy() + + if output_type == "pil": + image = self.numpy_to_pil(image) + + if not return_dict: + return (image,) + + return ImagePipelineOutput(images=image) \ No newline at end of file diff --git a/src/diffusers/pipelines/kandinsky2_2/pipeline_kandinsky2_2_decoder_controlnet.py b/src/diffusers/pipelines/kandinsky2_2/pipeline_kandinsky2_2_decoder_controlnet.py new file mode 100644 index 000000000000..eb20356626c1 --- /dev/null +++ b/src/diffusers/pipelines/kandinsky2_2/pipeline_kandinsky2_2_decoder_controlnet.py @@ -0,0 +1,308 @@ +# Copyright 2023 The HuggingFace Team. All rights reserved. 
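A note on the guidance arithmetic inside the denoising loop of the decoder pipeline above: because the UNet predicts a learned variance alongside the noise, the prediction is first split along the channel dimension, only the noise half is recombined with `guidance_scale`, and the positive-branch variance is then re-attached so a `learned_range` DDPM scheduler can use it. The sketch below replays that arithmetic on random tensors; the 4 latent channels and 96x96 latent size are illustrative assumptions, not checkpoint values.

```py
import torch

guidance_scale = 4.0
latent_channels = 4          # assumed MoVQ latent channel count
batch = 2                    # [negative, positive] halves under classifier-free guidance

# The UNet output stacks noise and learned variance along the channel dim.
noise_pred = torch.randn(batch, 2 * latent_channels, 96, 96)

noise, variance = noise_pred.split(latent_channels, dim=1)
noise_uncond, noise_text = noise.chunk(2)
_, variance_text = variance.chunk(2)

# Guidance is applied to the noise half only.
guided = noise_uncond + guidance_scale * (noise_text - noise_uncond)
# Re-attach the positive-branch variance for the scheduler step.
model_output = torch.cat([guided, variance_text], dim=1)
print(model_output.shape)  # torch.Size([1, 8, 96, 96])
```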
+# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from typing import List, Optional, Union + +import torch + +from ...models import UNet2DConditionModel, VQModel +from ...pipelines import DiffusionPipeline +from ...pipelines.pipeline_utils import ImagePipelineOutput +from ...schedulers import DDPMScheduler +from ...utils import ( + is_accelerate_available, + is_accelerate_version, + logging, + randn_tensor, + replace_example_docstring, +) + + +logger = logging.get_logger(__name__) # pylint: disable=invalid-name + + +def get_new_h_w(h, w, scale_factor=8): + new_h = h // scale_factor**2 + if h % scale_factor**2 != 0: + new_h += 1 + new_w = w // scale_factor**2 + if w % scale_factor**2 != 0: + new_w += 1 + return new_h * scale_factor, new_w * scale_factor + + +class Kandinsky2_2_DecoderControlnetPipeline(DiffusionPipeline): + """ + Pipeline for text-to-image generation using Kandinsky + This model inherits from [`DiffusionPipeline`]. Check the superclass documentation for the generic methods the + library implements for all the pipelines (such as downloading or saving, running on a particular device, etc.) + Args: + text_encoder ([`MultilingualCLIP`]): + Frozen text-encoder. + tokenizer ([`XLMRobertaTokenizer`]): + Tokenizer of class + scheduler ([`DDIMScheduler`]): + A scheduler to be used in combination with `unet` to generate image latents. + unet ([`UNet2DConditionModel`]): + Conditional U-Net architecture to denoise the image embedding. + movq ([`VQModel`]): + MoVQ Decoder to generate the image from the latents. + """ + + def __init__( + self, + unet: UNet2DConditionModel, + scheduler: DDPMScheduler, + vae: VQModel, + ): + super().__init__() + + self.register_modules( + unet=unet, + scheduler=scheduler, + vae=vae, + ) + self.vae_scale_factor = 2 ** (len(self.vae.config.block_out_channels) - 1) + + def prepare_latents(self, shape, dtype, device, generator, latents, scheduler): + if latents is None: + latents = randn_tensor(shape, generator=generator, device=device, dtype=dtype) + else: + if latents.shape != shape: + raise ValueError(f"Unexpected latents shape, got {latents.shape}, expected {shape}") + latents = latents.to(device) + + latents = latents * scheduler.init_noise_sigma + return latents + + + def enable_sequential_cpu_offload(self, gpu_id=0): + r""" + Offloads all models to CPU using accelerate, significantly reducing memory usage. When called, the pipeline's + models have their state dicts saved to CPU and then are moved to a `torch.device('meta') and loaded to GPU only + when their specific submodule has its `forward` method called. 
+ """ + if is_accelerate_available(): + from accelerate import cpu_offload + else: + raise ImportError("Please install accelerate via `pip install accelerate`") + + device = torch.device(f"cuda:{gpu_id}") + + models = [ + self.unet, + self.vae, + ] + for cpu_offloaded_model in models: + if cpu_offloaded_model is not None: + cpu_offload(cpu_offloaded_model, device) + + def enable_model_cpu_offload(self, gpu_id=0): + r""" + Offloads all models to CPU using accelerate, reducing memory usage with a low impact on performance. Compared + to `enable_sequential_cpu_offload`, this method moves one whole model at a time to the GPU when its `forward` + method is called, and the model remains in GPU until the next model runs. Memory savings are lower than with + `enable_sequential_cpu_offload`, but performance is much better due to the iterative execution of the `unet`. + """ + if is_accelerate_available() and is_accelerate_version(">=", "0.17.0.dev0"): + from accelerate import cpu_offload_with_hook + else: + raise ImportError("`enable_model_cpu_offload` requires `accelerate v0.17.0` or higher.") + + device = torch.device(f"cuda:{gpu_id}") + + if self.device.type != "cpu": + self.to("cpu", silence_dtype_warnings=True) + torch.cuda.empty_cache() # otherwise we don't see the memory savings (but they probably exist) + + hook = None + for cpu_offloaded_model in [self.unet, self.vae]: + _, hook = cpu_offload_with_hook(cpu_offloaded_model, device, prev_module_hook=hook) + + if self.safety_checker is not None: + _, hook = cpu_offload_with_hook(self.safety_checker, device, prev_module_hook=hook) + + # We'll offload the last model manually. + self.final_offload_hook = hook + + @property + # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline._execution_device + def _execution_device(self): + r""" + Returns the device on which the pipeline's models will be executed. After calling + `pipeline.enable_sequential_cpu_offload()` the execution device can only be inferred from Accelerate's module + hooks. + """ + if not hasattr(self.unet, "_hf_hook"): + return self.device + for module in self.unet.modules(): + if ( + hasattr(module, "_hf_hook") + and hasattr(module._hf_hook, "execution_device") + and module._hf_hook.execution_device is not None + ): + return torch.device(module._hf_hook.execution_device) + return self.device + + @torch.no_grad() + def __call__( + self, + image_embeds: Union[torch.FloatTensor, List[torch.FloatTensor]], + negative_image_embeds: Union[torch.FloatTensor, List[torch.FloatTensor]], + hint: torch.FloatTensor, + height: int = 512, + width: int = 512, + num_inference_steps: int = 100, + guidance_scale: float = 4.0, + num_images_per_prompt: int = 1, + generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None, + latents: Optional[torch.FloatTensor] = None, + output_type: Optional[str] = "pil", + return_dict: bool = True, + ): + """ + Function invoked when calling the pipeline for generation. + Args: + prompt (`str` or `List[str]`): + The prompt or prompts to guide the image generation. + hint (`torch.FloatTensor`): + The controlnet condition. + image_embeds (`torch.FloatTensor` or `List[torch.FloatTensor]`): + The clip image embeddings for text prompt, that will be used to condition the image generation. + negative_image_embeds (`torch.FloatTensor` or `List[torch.FloatTensor]`): + The clip image embeddings for negative text prompt, will be used to condition the image generation. 
+ negative_prompt (`str` or `List[str]`, *optional*): + The prompt or prompts not to guide the image generation. Ignored when not using guidance (i.e., ignored + if `guidance_scale` is less than `1`). + height (`int`, *optional*, defaults to 512): + The height in pixels of the generated image. + width (`int`, *optional*, defaults to 512): + The width in pixels of the generated image. + num_inference_steps (`int`, *optional*, defaults to 100): + The number of denoising steps. More denoising steps usually lead to a higher quality image at the + expense of slower inference. + guidance_scale (`float`, *optional*, defaults to 4.0): + Guidance scale as defined in [Classifier-Free Diffusion Guidance](https://arxiv.org/abs/2207.12598). + `guidance_scale` is defined as `w` of equation 2. of [Imagen + Paper](https://arxiv.org/pdf/2205.11487.pdf). Guidance scale is enabled by setting `guidance_scale > + 1`. Higher guidance scale encourages to generate images that are closely linked to the text `prompt`, + usually at the expense of lower image quality. + num_images_per_prompt (`int`, *optional*, defaults to 1): + The number of images to generate per prompt. + generator (`torch.Generator` or `List[torch.Generator]`, *optional*): + One or a list of [torch generator(s)](https://pytorch.org/docs/stable/generated/torch.Generator.html) + to make generation deterministic. + latents (`torch.FloatTensor`, *optional*): + Pre-generated noisy latents, sampled from a Gaussian distribution, to be used as inputs for image + generation. Can be used to tweak the same generation with different prompts. If not provided, a latents + tensor will ge generated by sampling using the supplied random `generator`. + output_type (`str`, *optional*, defaults to `"pil"`): + The output format of the generate image. Choose between: `"pil"` (`PIL.Image.Image`), `"np"` + (`np.array`) or `"pt"` (`torch.Tensor`). + return_dict (`bool`, *optional*, defaults to `True`): + Whether or not to return a [`~pipelines.ImagePipelineOutput`] instead of a plain tuple. 
+ Examples: + Returns: + [`~pipelines.ImagePipelineOutput`] or `tuple` + """ + device = self._execution_device + + do_classifier_free_guidance = guidance_scale > 1.0 + + if isinstance(image_embeds, list): + image_embeds = torch.cat(image_embeds, dim=0) + batch_size = image_embeds.shape[0] + if isinstance(negative_image_embeds, list): + negative_image_embeds = torch.cat(negative_image_embeds, dim=0) + + if do_classifier_free_guidance: + image_embeds = image_embeds.repeat_interleave(num_images_per_prompt, dim=0) + negative_image_embeds = negative_image_embeds.repeat_interleave(num_images_per_prompt, dim=0) + + image_embeds = torch.cat([negative_image_embeds, image_embeds], dim=0).to( + dtype=self.unet.dtype, device=device + ) + + self.scheduler.set_timesteps(num_inference_steps, device=device) + timesteps_tensor = self.scheduler.timesteps + + num_channels_latents = self.vae.config.latent_channels + + height, width = get_new_h_w(height, width, self.vae_scale_factor) + + # create initial latent + latents = self.prepare_latents( + (batch_size, num_channels_latents, height, width), + image_embeds.dtype, + device, + generator, + latents, + self.scheduler, + ) + + for i, t in enumerate(self.progress_bar(timesteps_tensor)): + # expand the latents if we are doing classifier free guidance + latent_model_input = torch.cat([latents] * 2) if do_classifier_free_guidance else latents + + added_cond_kwargs = {"image_embeds": image_embeds, + "hint": hint} + noise_pred = self.unet( + sample=latent_model_input, + timestep=t, + encoder_hidden_states=None, + added_cond_kwargs=added_cond_kwargs, + return_dict=False, + )[0] + + if do_classifier_free_guidance: + noise_pred, variance_pred = noise_pred.split(latents.shape[1], dim=1) + noise_pred_uncond, noise_pred_text = noise_pred.chunk(2) + _, variance_pred_text = variance_pred.chunk(2) + noise_pred = noise_pred_uncond + guidance_scale * (noise_pred_text - noise_pred_uncond) + noise_pred = torch.cat([noise_pred, variance_pred_text], dim=1) + + if not ( + hasattr(self.scheduler.config, "variance_type") + and self.scheduler.config.variance_type in ["learned", "learned_range"] + ): + noise_pred, _ = noise_pred.split(latents.shape[1], dim=1) + + # compute the previous noisy sample x_t -> x_t-1 + latents = self.scheduler.step( + noise_pred, + t, + latents, + # YiYi notes: only reason this pipeline can't work with unclip scheduler is that can't pass down this argument + # need to use DDPM scheduler instead + # prev_timestep=prev_timestep, + generator=generator, + )[0] + # post-processing + image = self.vae.decode(latents, force_not_quantize=True)["sample"] + + if output_type not in ["pt", "np", "pil"]: + raise ValueError(f"Only the output types `pt`, `pil` and `np` are supported not output_type={output_type}") + + if output_type in ["np", "pil"]: + image = image * 0.5 + 0.5 + image = image.clamp(0, 1) + image = image.cpu().permute(0, 2, 3, 1).float().numpy() + + if output_type == "pil": + image = self.numpy_to_pil(image) + + if not return_dict: + return (image,) + + return ImagePipelineOutput(images=image) \ No newline at end of file diff --git a/src/diffusers/pipelines/kandinsky2_2/pipeline_kandinsky2_2_decoder_controlnet_img2img.py b/src/diffusers/pipelines/kandinsky2_2/pipeline_kandinsky2_2_decoder_controlnet_img2img.py new file mode 100644 index 000000000000..18fbed543fb3 --- /dev/null +++ b/src/diffusers/pipelines/kandinsky2_2/pipeline_kandinsky2_2_decoder_controlnet_img2img.py @@ -0,0 +1,385 @@ +# Copyright 2023 The HuggingFace Team. All rights reserved. 
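As a quick check of the `get_new_h_w` helper that each decoder pipeline above defines: it converts a requested pixel size into the latent spatial size, rounding the pixel size up to the next multiple of `scale_factor**2` (64 for the default `scale_factor=8`). A small worked example:

```py
def get_new_h_w(h, w, scale_factor=8):
    # Copy of the helper duplicated in the decoder pipelines above: returns the
    # latent height/width, with the pixel size rounded up to a multiple of scale_factor**2.
    new_h = h // scale_factor**2
    if h % scale_factor**2 != 0:
        new_h += 1
    new_w = w // scale_factor**2
    if w % scale_factor**2 != 0:
        new_w += 1
    return new_h * scale_factor, new_w * scale_factor


print(get_new_h_w(768, 768))  # (96, 96) -> 768x768 pixels maps to 96x96 latents
print(get_new_h_w(500, 500))  # (64, 64) -> 500 pixels is rounded up to 512
```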
+# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from typing import List, Optional, Union +import PIL +from PIL import Image +import numpy as np +import torch + +from ...models import UNet2DConditionModel, VQModel +from ...pipelines import DiffusionPipeline +from ...pipelines.pipeline_utils import ImagePipelineOutput +from ...schedulers import DDPMScheduler +from ...utils import ( + is_accelerate_available, + is_accelerate_version, + logging, + randn_tensor, + replace_example_docstring, +) + + +logger = logging.get_logger(__name__) # pylint: disable=invalid-name + + +def get_new_h_w(h, w, scale_factor=8): + new_h = h // scale_factor**2 + if h % scale_factor**2 != 0: + new_h += 1 + new_w = w // scale_factor**2 + if w % scale_factor**2 != 0: + new_w += 1 + return new_h * scale_factor, new_w * scale_factor + + +def prepare_image(pil_image, w=512, h=512): + pil_image = pil_image.resize((w, h), resample=Image.BICUBIC, reducing_gap=1) + arr = np.array(pil_image.convert("RGB")) + arr = arr.astype(np.float32) / 127.5 - 1 + arr = np.transpose(arr, [2, 0, 1]) + image = torch.from_numpy(arr).unsqueeze(0) + return image + + +class Kandinsky2_2_DecoderControlnetImg2ImgPipeline(DiffusionPipeline): + """ + Pipeline for text-to-image generation using Kandinsky + This model inherits from [`DiffusionPipeline`]. Check the superclass documentation for the generic methods the + library implements for all the pipelines (such as downloading or saving, running on a particular device, etc.) + Args: + text_encoder ([`MultilingualCLIP`]): + Frozen text-encoder. + tokenizer ([`XLMRobertaTokenizer`]): + Tokenizer of class + scheduler ([`DDIMScheduler`]): + A scheduler to be used in combination with `unet` to generate image latents. + unet ([`UNet2DConditionModel`]): + Conditional U-Net architecture to denoise the image embedding. + movq ([`VQModel`]): + MoVQ Decoder to generate the image from the latents. 
+ """ + + def __init__( + self, + unet: UNet2DConditionModel, + scheduler: DDPMScheduler, + vae: VQModel, + ): + super().__init__() + + self.register_modules( + unet=unet, + scheduler=scheduler, + vae=vae, + ) + self.vae_scale_factor = 2 ** (len(self.vae.config.block_out_channels) - 1) + + def get_timesteps(self, num_inference_steps, strength, device): + # get the original timestep using init_timestep + init_timestep = min(int(num_inference_steps * strength), num_inference_steps) + + t_start = max(num_inference_steps - init_timestep, 0) + timesteps = self.scheduler.timesteps[t_start:] + + return timesteps, num_inference_steps - t_start + + def prepare_latents(self, image, timestep, batch_size, num_images_per_prompt, dtype, device, generator=None): + if not isinstance(image, (torch.Tensor, PIL.Image.Image, list)): + raise ValueError( + f"`image` has to be of type `torch.Tensor`, `PIL.Image.Image` or list but is {type(image)}" + ) + + image = image.to(device=device, dtype=dtype) + + batch_size = batch_size * num_images_per_prompt + + if image.shape[1] == 4: + init_latents = image + + else: + if isinstance(generator, list) and len(generator) != batch_size: + raise ValueError( + f"You have passed a list of generators of length {len(generator)}, but requested an effective batch" + f" size of {batch_size}. Make sure the batch size matches the length of the generators." + ) + + elif isinstance(generator, list): + init_latents = [ + self.vae.encode(image[i : i + 1]).latent_dist.sample(generator[i]) for i in range(batch_size) + ] + init_latents = torch.cat(init_latents, dim=0) + else: + init_latents = self.vae.encode(image).latent_dist.sample(generator) + + init_latents = self.vae.config.scaling_factor * init_latents + + if batch_size > init_latents.shape[0] and batch_size % init_latents.shape[0] == 0: + # expand init_latents for batch_size + deprecation_message = ( + f"You have passed {batch_size} text prompts (`prompt`), but only {init_latents.shape[0]} initial" + " images (`image`). Initial images are now duplicating to match the number of text prompts. Note" + " that this behavior is deprecated and will be removed in a version 1.0.0. Please make sure to update" + " your script to pass as many initial images as text prompts to suppress this warning." + ) + additional_image_per_prompt = batch_size // init_latents.shape[0] + init_latents = torch.cat([init_latents] * additional_image_per_prompt, dim=0) + elif batch_size > init_latents.shape[0] and batch_size % init_latents.shape[0] != 0: + raise ValueError( + f"Cannot duplicate `image` of batch size {init_latents.shape[0]} to {batch_size} text prompts." + ) + else: + init_latents = torch.cat([init_latents], dim=0) + + shape = init_latents.shape + noise = randn_tensor(shape, generator=generator, device=device, dtype=dtype) + + # get latents + try: + init_latents = self.scheduler.add_noise(init_latents, noise, timestep) + except: + pass + latents = init_latents + + return latents + + + def enable_sequential_cpu_offload(self, gpu_id=0): + r""" + Offloads all models to CPU using accelerate, significantly reducing memory usage. When called, the pipeline's + models have their state dicts saved to CPU and then are moved to a `torch.device('meta') and loaded to GPU only + when their specific submodule has its `forward` method called. 
+ """ + if is_accelerate_available(): + from accelerate import cpu_offload + else: + raise ImportError("Please install accelerate via `pip install accelerate`") + + device = torch.device(f"cuda:{gpu_id}") + + models = [ + self.unet, + self.vae, + ] + for cpu_offloaded_model in models: + if cpu_offloaded_model is not None: + cpu_offload(cpu_offloaded_model, device) + + def enable_model_cpu_offload(self, gpu_id=0): + r""" + Offloads all models to CPU using accelerate, reducing memory usage with a low impact on performance. Compared + to `enable_sequential_cpu_offload`, this method moves one whole model at a time to the GPU when its `forward` + method is called, and the model remains in GPU until the next model runs. Memory savings are lower than with + `enable_sequential_cpu_offload`, but performance is much better due to the iterative execution of the `unet`. + """ + if is_accelerate_available() and is_accelerate_version(">=", "0.17.0.dev0"): + from accelerate import cpu_offload_with_hook + else: + raise ImportError("`enable_model_cpu_offload` requires `accelerate v0.17.0` or higher.") + + device = torch.device(f"cuda:{gpu_id}") + + if self.device.type != "cpu": + self.to("cpu", silence_dtype_warnings=True) + torch.cuda.empty_cache() # otherwise we don't see the memory savings (but they probably exist) + + hook = None + for cpu_offloaded_model in [self.unet, self.vae]: + _, hook = cpu_offload_with_hook(cpu_offloaded_model, device, prev_module_hook=hook) + + if self.safety_checker is not None: + _, hook = cpu_offload_with_hook(self.safety_checker, device, prev_module_hook=hook) + + # We'll offload the last model manually. + self.final_offload_hook = hook + + @property + # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline._execution_device + def _execution_device(self): + r""" + Returns the device on which the pipeline's models will be executed. After calling + `pipeline.enable_sequential_cpu_offload()` the execution device can only be inferred from Accelerate's module + hooks. + """ + if not hasattr(self.unet, "_hf_hook"): + return self.device + for module in self.unet.modules(): + if ( + hasattr(module, "_hf_hook") + and hasattr(module._hf_hook, "execution_device") + and module._hf_hook.execution_device is not None + ): + return torch.device(module._hf_hook.execution_device) + return self.device + + @torch.no_grad() + def __call__( + self, + image_embeds: Union[torch.FloatTensor, List[torch.FloatTensor]], + image: Union[torch.FloatTensor, PIL.Image.Image, List[torch.FloatTensor], List[PIL.Image.Image]], + negative_image_embeds: Union[torch.FloatTensor, List[torch.FloatTensor]], + hint: torch.FloatTensor, + height: int = 512, + width: int = 512, + num_inference_steps: int = 100, + guidance_scale: float = 4.0, + strength: float = 0.3, + num_images_per_prompt: int = 1, + generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None, + output_type: Optional[str] = "pil", + return_dict: bool = True, + ): + """ + Function invoked when calling the pipeline for generation. + Args: + image_embeds (`torch.FloatTensor` or `List[torch.FloatTensor]`): + The clip image embeddings for text prompt, that will be used to condition the image generation. + image (`torch.FloatTensor`, `PIL.Image.Image`, `np.ndarray`, `List[torch.FloatTensor]`, `List[PIL.Image.Image]`, or `List[np.ndarray]`): + `Image`, or tensor representing an image batch, that will be used as the starting point for the + process. 
Can also accpet image latents as `image`, if passing latents directly, it will not be encoded + again. + strength (`float`, *optional*, defaults to 0.8): + Conceptually, indicates how much to transform the reference `image`. Must be between 0 and 1. `image` + will be used as a starting point, adding more noise to it the larger the `strength`. The number of + denoising steps depends on the amount of noise initially added. When `strength` is 1, added noise will + be maximum and the denoising process will run for the full number of iterations specified in + `num_inference_steps`. A value of 1, therefore, essentially ignores `image`. + hint (`torch.FloatTensor`): + The controlnet condition. + negative_image_embeds (`torch.FloatTensor` or `List[torch.FloatTensor]`): + The clip image embeddings for negative text prompt, will be used to condition the image generation. + height (`int`, *optional*, defaults to 512): + The height in pixels of the generated image. + width (`int`, *optional*, defaults to 512): + The width in pixels of the generated image. + num_inference_steps (`int`, *optional*, defaults to 100): + The number of denoising steps. More denoising steps usually lead to a higher quality image at the + expense of slower inference. + guidance_scale (`float`, *optional*, defaults to 4.0): + Guidance scale as defined in [Classifier-Free Diffusion Guidance](https://arxiv.org/abs/2207.12598). + `guidance_scale` is defined as `w` of equation 2. of [Imagen + Paper](https://arxiv.org/pdf/2205.11487.pdf). Guidance scale is enabled by setting `guidance_scale > + 1`. Higher guidance scale encourages to generate images that are closely linked to the text `prompt`, + usually at the expense of lower image quality. + num_images_per_prompt (`int`, *optional*, defaults to 1): + The number of images to generate per prompt. + generator (`torch.Generator` or `List[torch.Generator]`, *optional*): + One or a list of [torch generator(s)](https://pytorch.org/docs/stable/generated/torch.Generator.html) + to make generation deterministic. + output_type (`str`, *optional*, defaults to `"pil"`): + The output format of the generate image. Choose between: `"pil"` (`PIL.Image.Image`), `"np"` + (`np.array`) or `"pt"` (`torch.Tensor`). + return_dict (`bool`, *optional*, defaults to `True`): + Whether or not to return a [`~pipelines.ImagePipelineOutput`] instead of a plain tuple. + Examples: + Returns: + [`~pipelines.ImagePipelineOutput`] or `tuple` + """ + device = self._execution_device + + do_classifier_free_guidance = guidance_scale > 1.0 + + if isinstance(image_embeds, list): + image_embeds = torch.cat(image_embeds, dim=0) + batch_size = image_embeds.shape[0] + if isinstance(negative_image_embeds, list): + negative_image_embeds = torch.cat(negative_image_embeds, dim=0) + + if do_classifier_free_guidance: + image_embeds = image_embeds.repeat_interleave(num_images_per_prompt, dim=0) + negative_image_embeds = negative_image_embeds.repeat_interleave(num_images_per_prompt, dim=0) + + image_embeds = torch.cat([negative_image_embeds, image_embeds], dim=0).to( + dtype=self.unet.dtype, device=device + ) + + if not isinstance(image, list): + image = [image] + if not all(isinstance(i, (PIL.Image.Image, torch.Tensor)) for i in image): + raise ValueError( + f"Input is in incorrect format: {[type(i) for i in image]}. 
Currently, we only support PIL image and pytorch tensor" + ) + + num_channels_latents = self.vae.config.latent_channels + + image = torch.cat([prepare_image(i, width, height) for i in image], dim=0) + image = image.to(dtype=image_embeds.dtype, device=device) + + latents = self.vae.encode(image)["latents"] + latents = latents.repeat_interleave(num_images_per_prompt, dim=0) + self.scheduler.set_timesteps(num_inference_steps, device=device) + timesteps, num_inference_steps = self.get_timesteps(num_inference_steps, strength, device) + latent_timestep = timesteps[:1].repeat(batch_size * num_images_per_prompt) + height, width = get_new_h_w(height, width, self.vae_scale_factor) + latents = self.prepare_latents( + latents, latent_timestep, batch_size, num_images_per_prompt, image_embeds.dtype, device, generator + ) + for i, t in enumerate(self.progress_bar(timesteps)): + # expand the latents if we are doing classifier free guidance + latent_model_input = torch.cat([latents] * 2) if do_classifier_free_guidance else latents + + added_cond_kwargs = {"image_embeds": image_embeds, 'hint': hint} + noise_pred = self.unet( + sample=latent_model_input, + timestep=t, + encoder_hidden_states=None, + added_cond_kwargs=added_cond_kwargs, + return_dict=False, + )[0] + + if do_classifier_free_guidance: + noise_pred, variance_pred = noise_pred.split(latents.shape[1], dim=1) + noise_pred_uncond, noise_pred_text = noise_pred.chunk(2) + _, variance_pred_text = variance_pred.chunk(2) + noise_pred = noise_pred_uncond + guidance_scale * (noise_pred_text - noise_pred_uncond) + noise_pred = torch.cat([noise_pred, variance_pred_text], dim=1) + + if not ( + hasattr(self.scheduler.config, "variance_type") + and self.scheduler.config.variance_type in ["learned", "learned_range"] + ): + noise_pred, _ = noise_pred.split(latents.shape[1], dim=1) + + # compute the previous noisy sample x_t -> x_t-1 + try: + latents = self.scheduler.step( + noise_pred, + t, + latents, + generator=generator, + )[0] + except: + latents = self.scheduler.step( + noise_pred, + t, + latents, + )[0] + # post-processing + image = self.vae.decode(latents, force_not_quantize=True)["sample"] + + if output_type not in ["pt", "np", "pil"]: + raise ValueError(f"Only the output types `pt`, `pil` and `np` are supported not output_type={output_type}") + + if output_type in ["np", "pil"]: + image = image * 0.5 + 0.5 + image = image.clamp(0, 1) + image = image.cpu().permute(0, 2, 3, 1).float().numpy() + + if output_type == "pil": + image = self.numpy_to_pil(image) + + if not return_dict: + return (image,) + + return ImagePipelineOutput(images=image) \ No newline at end of file diff --git a/src/diffusers/pipelines/kandinsky2_2/pipeline_kandinsky2_2_decoder_img2img.py b/src/diffusers/pipelines/kandinsky2_2/pipeline_kandinsky2_2_decoder_img2img.py new file mode 100644 index 000000000000..023ce445a863 --- /dev/null +++ b/src/diffusers/pipelines/kandinsky2_2/pipeline_kandinsky2_2_decoder_img2img.py @@ -0,0 +1,382 @@ +# Copyright 2023 The HuggingFace Team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
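The img2img variants above control how much of the source image survives through `strength` and `get_timesteps`: `strength` decides how many of the scheduled denoising steps are actually executed, starting from a partially noised encoding of the input. A small sketch of that bookkeeping, where the descending integers stand in for `scheduler.timesteps`:

```py
def get_timesteps_sketch(num_inference_steps, strength, scheduler_timesteps):
    # Mirrors the get_timesteps method of the img2img pipelines above.
    init_timestep = min(int(num_inference_steps * strength), num_inference_steps)
    t_start = max(num_inference_steps - init_timestep, 0)
    return scheduler_timesteps[t_start:], num_inference_steps - t_start


# With 100 scheduled steps and the default strength of 0.3, only the last 30
# timesteps are denoised, so the output stays close to the input image.
timesteps = list(range(99, -1, -1))  # stand-in for scheduler.timesteps
kept, n = get_timesteps_sketch(100, 0.3, timesteps)
print(n, kept[0], kept[-1])  # 30 29 0
```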
+# See the License for the specific language governing permissions and +# limitations under the License. + +from typing import List, Optional, Union +import PIL +from PIL import Image +import numpy as np +import torch + +from ...models import UNet2DConditionModel, VQModel +from ...pipelines import DiffusionPipeline +from ...pipelines.pipeline_utils import ImagePipelineOutput +from ...schedulers import DDPMScheduler +from ...utils import ( + is_accelerate_available, + is_accelerate_version, + logging, + randn_tensor, + replace_example_docstring, +) + + +logger = logging.get_logger(__name__) # pylint: disable=invalid-name + + +def get_new_h_w(h, w, scale_factor=8): + new_h = h // scale_factor**2 + if h % scale_factor**2 != 0: + new_h += 1 + new_w = w // scale_factor**2 + if w % scale_factor**2 != 0: + new_w += 1 + return new_h * scale_factor, new_w * scale_factor + + +def prepare_image(pil_image, w=512, h=512): + pil_image = pil_image.resize((w, h), resample=Image.BICUBIC, reducing_gap=1) + arr = np.array(pil_image.convert("RGB")) + arr = arr.astype(np.float32) / 127.5 - 1 + arr = np.transpose(arr, [2, 0, 1]) + image = torch.from_numpy(arr).unsqueeze(0) + return image + + +class Kandinsky2_2_DecoderImg2ImgPipeline(DiffusionPipeline): + """ + Pipeline for text-to-image generation using Kandinsky + This model inherits from [`DiffusionPipeline`]. Check the superclass documentation for the generic methods the + library implements for all the pipelines (such as downloading or saving, running on a particular device, etc.) + Args: + text_encoder ([`MultilingualCLIP`]): + Frozen text-encoder. + tokenizer ([`XLMRobertaTokenizer`]): + Tokenizer of class + scheduler ([`DDIMScheduler`]): + A scheduler to be used in combination with `unet` to generate image latents. + unet ([`UNet2DConditionModel`]): + Conditional U-Net architecture to denoise the image embedding. + movq ([`VQModel`]): + MoVQ Decoder to generate the image from the latents. + """ + + def __init__( + self, + unet: UNet2DConditionModel, + scheduler: DDPMScheduler, + vae: VQModel, + ): + super().__init__() + + self.register_modules( + unet=unet, + scheduler=scheduler, + vae=vae, + ) + self.vae_scale_factor = 2 ** (len(self.vae.config.block_out_channels) - 1) + + def get_timesteps(self, num_inference_steps, strength, device): + # get the original timestep using init_timestep + init_timestep = min(int(num_inference_steps * strength), num_inference_steps) + + t_start = max(num_inference_steps - init_timestep, 0) + timesteps = self.scheduler.timesteps[t_start:] + + return timesteps, num_inference_steps - t_start + + def prepare_latents(self, image, timestep, batch_size, num_images_per_prompt, dtype, device, generator=None): + if not isinstance(image, (torch.Tensor, PIL.Image.Image, list)): + raise ValueError( + f"`image` has to be of type `torch.Tensor`, `PIL.Image.Image` or list but is {type(image)}" + ) + + image = image.to(device=device, dtype=dtype) + + batch_size = batch_size * num_images_per_prompt + + if image.shape[1] == 4: + init_latents = image + + else: + if isinstance(generator, list) and len(generator) != batch_size: + raise ValueError( + f"You have passed a list of generators of length {len(generator)}, but requested an effective batch" + f" size of {batch_size}. Make sure the batch size matches the length of the generators." 
+ ) + + elif isinstance(generator, list): + init_latents = [ + self.vae.encode(image[i : i + 1]).latent_dist.sample(generator[i]) for i in range(batch_size) + ] + init_latents = torch.cat(init_latents, dim=0) + else: + init_latents = self.vae.encode(image).latent_dist.sample(generator) + + init_latents = self.vae.config.scaling_factor * init_latents + + if batch_size > init_latents.shape[0] and batch_size % init_latents.shape[0] == 0: + # expand init_latents for batch_size + deprecation_message = ( + f"You have passed {batch_size} text prompts (`prompt`), but only {init_latents.shape[0]} initial" + " images (`image`). Initial images are now duplicating to match the number of text prompts. Note" + " that this behavior is deprecated and will be removed in a version 1.0.0. Please make sure to update" + " your script to pass as many initial images as text prompts to suppress this warning." + ) + additional_image_per_prompt = batch_size // init_latents.shape[0] + init_latents = torch.cat([init_latents] * additional_image_per_prompt, dim=0) + elif batch_size > init_latents.shape[0] and batch_size % init_latents.shape[0] != 0: + raise ValueError( + f"Cannot duplicate `image` of batch size {init_latents.shape[0]} to {batch_size} text prompts." + ) + else: + init_latents = torch.cat([init_latents], dim=0) + + shape = init_latents.shape + noise = randn_tensor(shape, generator=generator, device=device, dtype=dtype) + + # get latents + try: + init_latents = self.scheduler.add_noise(init_latents, noise, timestep) + except: + pass + latents = init_latents + + return latents + + + def enable_sequential_cpu_offload(self, gpu_id=0): + r""" + Offloads all models to CPU using accelerate, significantly reducing memory usage. When called, the pipeline's + models have their state dicts saved to CPU and then are moved to a `torch.device('meta') and loaded to GPU only + when their specific submodule has its `forward` method called. + """ + if is_accelerate_available(): + from accelerate import cpu_offload + else: + raise ImportError("Please install accelerate via `pip install accelerate`") + + device = torch.device(f"cuda:{gpu_id}") + + models = [ + self.unet, + self.vae, + ] + for cpu_offloaded_model in models: + if cpu_offloaded_model is not None: + cpu_offload(cpu_offloaded_model, device) + + def enable_model_cpu_offload(self, gpu_id=0): + r""" + Offloads all models to CPU using accelerate, reducing memory usage with a low impact on performance. Compared + to `enable_sequential_cpu_offload`, this method moves one whole model at a time to the GPU when its `forward` + method is called, and the model remains in GPU until the next model runs. Memory savings are lower than with + `enable_sequential_cpu_offload`, but performance is much better due to the iterative execution of the `unet`. 
+ """ + if is_accelerate_available() and is_accelerate_version(">=", "0.17.0.dev0"): + from accelerate import cpu_offload_with_hook + else: + raise ImportError("`enable_model_cpu_offload` requires `accelerate v0.17.0` or higher.") + + device = torch.device(f"cuda:{gpu_id}") + + if self.device.type != "cpu": + self.to("cpu", silence_dtype_warnings=True) + torch.cuda.empty_cache() # otherwise we don't see the memory savings (but they probably exist) + + hook = None + for cpu_offloaded_model in [self.unet, self.vae]: + _, hook = cpu_offload_with_hook(cpu_offloaded_model, device, prev_module_hook=hook) + + if self.safety_checker is not None: + _, hook = cpu_offload_with_hook(self.safety_checker, device, prev_module_hook=hook) + + # We'll offload the last model manually. + self.final_offload_hook = hook + + @property + # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline._execution_device + def _execution_device(self): + r""" + Returns the device on which the pipeline's models will be executed. After calling + `pipeline.enable_sequential_cpu_offload()` the execution device can only be inferred from Accelerate's module + hooks. + """ + if not hasattr(self.unet, "_hf_hook"): + return self.device + for module in self.unet.modules(): + if ( + hasattr(module, "_hf_hook") + and hasattr(module._hf_hook, "execution_device") + and module._hf_hook.execution_device is not None + ): + return torch.device(module._hf_hook.execution_device) + return self.device + + @torch.no_grad() + def __call__( + self, + image_embeds: Union[torch.FloatTensor, List[torch.FloatTensor]], + image: Union[torch.FloatTensor, PIL.Image.Image, List[torch.FloatTensor], List[PIL.Image.Image]], + negative_image_embeds: Union[torch.FloatTensor, List[torch.FloatTensor]], + height: int = 512, + width: int = 512, + num_inference_steps: int = 100, + guidance_scale: float = 4.0, + strength: float = 0.3, + num_images_per_prompt: int = 1, + generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None, + output_type: Optional[str] = "pil", + return_dict: bool = True, + ): + """ + Function invoked when calling the pipeline for generation. + Args: + image_embeds (`torch.FloatTensor` or `List[torch.FloatTensor]`): + The clip image embeddings for text prompt, that will be used to condition the image generation. + image (`torch.FloatTensor`, `PIL.Image.Image`, `np.ndarray`, `List[torch.FloatTensor]`, `List[PIL.Image.Image]`, or `List[np.ndarray]`): + `Image`, or tensor representing an image batch, that will be used as the starting point for the + process. Can also accpet image latents as `image`, if passing latents directly, it will not be encoded + again. + strength (`float`, *optional*, defaults to 0.8): + Conceptually, indicates how much to transform the reference `image`. Must be between 0 and 1. `image` + will be used as a starting point, adding more noise to it the larger the `strength`. The number of + denoising steps depends on the amount of noise initially added. When `strength` is 1, added noise will + be maximum and the denoising process will run for the full number of iterations specified in + `num_inference_steps`. A value of 1, therefore, essentially ignores `image`. + negative_image_embeds (`torch.FloatTensor` or `List[torch.FloatTensor]`): + The clip image embeddings for negative text prompt, will be used to condition the image generation. + height (`int`, *optional*, defaults to 512): + The height in pixels of the generated image. 
+ width (`int`, *optional*, defaults to 512): + The width in pixels of the generated image. + num_inference_steps (`int`, *optional*, defaults to 100): + The number of denoising steps. More denoising steps usually lead to a higher quality image at the + expense of slower inference. + guidance_scale (`float`, *optional*, defaults to 4.0): + Guidance scale as defined in [Classifier-Free Diffusion Guidance](https://arxiv.org/abs/2207.12598). + `guidance_scale` is defined as `w` of equation 2. of [Imagen + Paper](https://arxiv.org/pdf/2205.11487.pdf). Guidance scale is enabled by setting `guidance_scale > + 1`. Higher guidance scale encourages to generate images that are closely linked to the text `prompt`, + usually at the expense of lower image quality. + num_images_per_prompt (`int`, *optional*, defaults to 1): + The number of images to generate per prompt. + generator (`torch.Generator` or `List[torch.Generator]`, *optional*): + One or a list of [torch generator(s)](https://pytorch.org/docs/stable/generated/torch.Generator.html) + to make generation deterministic. + output_type (`str`, *optional*, defaults to `"pil"`): + The output format of the generate image. Choose between: `"pil"` (`PIL.Image.Image`), `"np"` + (`np.array`) or `"pt"` (`torch.Tensor`). + return_dict (`bool`, *optional*, defaults to `True`): + Whether or not to return a [`~pipelines.ImagePipelineOutput`] instead of a plain tuple. + Examples: + Returns: + [`~pipelines.ImagePipelineOutput`] or `tuple` + """ + device = self._execution_device + + do_classifier_free_guidance = guidance_scale > 1.0 + + if isinstance(image_embeds, list): + image_embeds = torch.cat(image_embeds, dim=0) + batch_size = image_embeds.shape[0] + if isinstance(negative_image_embeds, list): + negative_image_embeds = torch.cat(negative_image_embeds, dim=0) + + if do_classifier_free_guidance: + image_embeds = image_embeds.repeat_interleave(num_images_per_prompt, dim=0) + negative_image_embeds = negative_image_embeds.repeat_interleave(num_images_per_prompt, dim=0) + + image_embeds = torch.cat([negative_image_embeds, image_embeds], dim=0).to( + dtype=self.unet.dtype, device=device + ) + + if not isinstance(image, list): + image = [image] + if not all(isinstance(i, (PIL.Image.Image, torch.Tensor)) for i in image): + raise ValueError( + f"Input is in incorrect format: {[type(i) for i in image]}. 
Currently, we only support PIL image and pytorch tensor" + ) + + num_channels_latents = self.unet.config.in_channels + + image = torch.cat([prepare_image(i, width, height) for i in image], dim=0) + image = image.to(dtype=image_embeds.dtype, device=device) + + latents = self.vae.encode(image)["latents"] + latents = latents.repeat_interleave(num_images_per_prompt, dim=0) + self.scheduler.set_timesteps(num_inference_steps, device=device) + timesteps, num_inference_steps = self.get_timesteps(num_inference_steps, strength, device) + latent_timestep = timesteps[:1].repeat(batch_size * num_images_per_prompt) + height, width = get_new_h_w(height, width, self.vae_scale_factor) + latents = self.prepare_latents( + latents, latent_timestep, batch_size, num_images_per_prompt, image_embeds.dtype, device, generator + ) + for i, t in enumerate(self.progress_bar(timesteps)): + # expand the latents if we are doing classifier free guidance + latent_model_input = torch.cat([latents] * 2) if do_classifier_free_guidance else latents + + added_cond_kwargs = {"image_embeds": image_embeds} + noise_pred = self.unet( + sample=latent_model_input, + timestep=t, + encoder_hidden_states=None, + added_cond_kwargs=added_cond_kwargs, + return_dict=False, + )[0] + + if do_classifier_free_guidance: + noise_pred, variance_pred = noise_pred.split(latents.shape[1], dim=1) + noise_pred_uncond, noise_pred_text = noise_pred.chunk(2) + _, variance_pred_text = variance_pred.chunk(2) + noise_pred = noise_pred_uncond + guidance_scale * (noise_pred_text - noise_pred_uncond) + noise_pred = torch.cat([noise_pred, variance_pred_text], dim=1) + + if not ( + hasattr(self.scheduler.config, "variance_type") + and self.scheduler.config.variance_type in ["learned", "learned_range"] + ): + noise_pred, _ = noise_pred.split(latents.shape[1], dim=1) + + # compute the previous noisy sample x_t -> x_t-1 + try: + latents = self.scheduler.step( + noise_pred, + t, + latents, + generator=generator, + )[0] + except: + latents = self.scheduler.step( + noise_pred, + t, + latents, + )[0] + # post-processing + image = self.vae.decode(latents, force_not_quantize=True)["sample"] + + if output_type not in ["pt", "np", "pil"]: + raise ValueError(f"Only the output types `pt`, `pil` and `np` are supported not output_type={output_type}") + + if output_type in ["np", "pil"]: + image = image * 0.5 + 0.5 + image = image.clamp(0, 1) + image = image.cpu().permute(0, 2, 3, 1).float().numpy() + + if output_type == "pil": + image = self.numpy_to_pil(image) + + if not return_dict: + return (image,) + + return ImagePipelineOutput(images=image) \ No newline at end of file diff --git a/src/diffusers/pipelines/kandinsky2_2/pipeline_kandinsky2_2_decoder_inpainting.py b/src/diffusers/pipelines/kandinsky2_2/pipeline_kandinsky2_2_decoder_inpainting.py new file mode 100644 index 000000000000..c62ce5d922fb --- /dev/null +++ b/src/diffusers/pipelines/kandinsky2_2/pipeline_kandinsky2_2_decoder_inpainting.py @@ -0,0 +1,487 @@ +# Copyright 2023 The HuggingFace Team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+# See the License for the specific language governing permissions and +# limitations under the License. + +from typing import List, Optional, Union + +import torch + +from ...models import UNet2DConditionModel, VQModel +from ...pipelines import DiffusionPipeline +from ...pipelines.pipeline_utils import ImagePipelineOutput +from ...schedulers import DDPMScheduler +from ...utils import ( + is_accelerate_available, + is_accelerate_version, + logging, + randn_tensor, + replace_example_docstring, +) +import numpy as np +import PIL +from PIL import Image +import torch.nn.functional as F +from copy import deepcopy +logger = logging.get_logger(__name__) # pylint: disable=invalid-name + + +def get_new_h_w(h, w, scale_factor=8): + new_h = h // scale_factor**2 + if h % scale_factor**2 != 0: + new_h += 1 + new_w = w // scale_factor**2 + if w % scale_factor**2 != 0: + new_w += 1 + return new_h * scale_factor, new_w * scale_factor + +def prepare_mask(masks): + prepared_masks = [] + for mask in masks: + old_mask = deepcopy(mask) + for i in range(mask.shape[1]): + for j in range(mask.shape[2]): + if old_mask[0][i][j] == 1: + continue + if i != 0: + mask[:, i - 1, j] = 0 + if j != 0: + mask[:, i, j - 1] = 0 + if i != 0 and j != 0: + mask[:, i - 1, j - 1] = 0 + if i != mask.shape[1] - 1: + mask[:, i + 1, j] = 0 + if j != mask.shape[2] - 1: + mask[:, i, j + 1] = 0 + if i != mask.shape[1] - 1 and j != mask.shape[2] - 1: + mask[:, i + 1, j + 1] = 0 + prepared_masks.append(mask) + return torch.stack(prepared_masks, dim=0) + +def prepare_mask_and_masked_image(image, mask, height, width): + r""" + Prepares a pair (mask, image) to be consumed by the Kandinsky inpaint pipeline. This means that those inputs will + be converted to ``torch.Tensor`` with shapes ``batch x channels x height x width`` where ``channels`` is ``3`` for + the ``image`` and ``1`` for the ``mask``. + + The ``image`` will be converted to ``torch.float32`` and normalized to be in ``[-1, 1]``. The ``mask`` will be + binarized (``mask > 0.5``) and cast to ``torch.float32`` too. + + Args: + image (Union[np.array, PIL.Image, torch.Tensor]): The image to inpaint. + It can be a ``PIL.Image``, or a ``height x width x 3`` ``np.array`` or a ``channels x height x width`` + ``torch.Tensor`` or a ``batch x channels x height x width`` ``torch.Tensor``. + mask (_type_): The mask to apply to the image, i.e. regions to inpaint. + It can be a ``PIL.Image``, or a ``height x width`` ``np.array`` or a ``1 x height x width`` + ``torch.Tensor`` or a ``batch x 1 x height x width`` ``torch.Tensor``. + height (`int`, *optional*, defaults to 512): + The height in pixels of the generated image. + width (`int`, *optional*, defaults to 512): + The width in pixels of the generated image. + + + Raises: + ValueError: ``torch.Tensor`` images should be in the ``[-1, 1]`` range. ValueError: ``torch.Tensor`` mask + should be in the ``[0, 1]`` range. ValueError: ``mask`` and ``image`` should have the same spatial dimensions. + TypeError: ``mask`` is a ``torch.Tensor`` but ``image`` is not + (ot the other way around). + + Returns: + tuple[torch.Tensor]: The pair (mask, image) as ``torch.Tensor`` with 4 + dimensions: ``batch x channels x height x width``. 
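    A small, self-contained sketch of the shapes this helper produces for a PIL image plus a NumPy mask
    (the toy inputs below are made up purely for illustration):

    ```py
    >>> import numpy as np
    >>> import PIL.Image

    >>> init_image = PIL.Image.fromarray(np.zeros((64, 64, 3), dtype=np.uint8))
    >>> mask = np.zeros((64, 64), dtype=np.float32)
    >>> mask[:, 32:] = 1.0  # values >= 0.5 are binarized to 1, the rest to 0

    >>> mask_t, image_t = prepare_mask_and_masked_image(init_image, mask, height=64, width=64)
    >>> image_t.shape, image_t.dtype
    (torch.Size([1, 3, 64, 64]), torch.float32)
    >>> mask_t.shape, float(mask_t.min()), float(mask_t.max())
    (torch.Size([1, 1, 64, 64]), 0.0, 1.0)
    ```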
+ """ + + if image is None: + raise ValueError("`image` input cannot be undefined.") + + if mask is None: + raise ValueError("`mask_image` input cannot be undefined.") + + if isinstance(image, torch.Tensor): + if not isinstance(mask, torch.Tensor): + raise TypeError(f"`image` is a torch.Tensor but `mask` (type: {type(mask)} is not") + + # Batch single image + if image.ndim == 3: + assert image.shape[0] == 3, "Image outside a batch should be of shape (3, H, W)" + image = image.unsqueeze(0) + + # Batch and add channel dim for single mask + if mask.ndim == 2: + mask = mask.unsqueeze(0).unsqueeze(0) + + # Batch single mask or add channel dim + if mask.ndim == 3: + # Single batched mask, no channel dim or single mask not batched but channel dim + if mask.shape[0] == 1: + mask = mask.unsqueeze(0) + + # Batched masks no channel dim + else: + mask = mask.unsqueeze(1) + + assert image.ndim == 4 and mask.ndim == 4, "Image and Mask must have 4 dimensions" + assert image.shape[-2:] == mask.shape[-2:], "Image and Mask must have the same spatial dimensions" + assert image.shape[0] == mask.shape[0], "Image and Mask must have the same batch size" + + # Check image is in [-1, 1] + if image.min() < -1 or image.max() > 1: + raise ValueError("Image should be in [-1, 1] range") + + # Check mask is in [0, 1] + if mask.min() < 0 or mask.max() > 1: + raise ValueError("Mask should be in [0, 1] range") + + # Binarize mask + mask[mask < 0.5] = 0 + mask[mask >= 0.5] = 1 + + # Image as float32 + image = image.to(dtype=torch.float32) + elif isinstance(mask, torch.Tensor): + raise TypeError(f"`mask` is a torch.Tensor but `image` (type: {type(image)} is not") + else: + # preprocess image + if isinstance(image, (PIL.Image.Image, np.ndarray)): + image = [image] + + if isinstance(image, list) and isinstance(image[0], PIL.Image.Image): + # resize all images w.r.t passed height an width + image = [i.resize((width, height), resample=Image.BICUBIC, reducing_gap=1) for i in image] + image = [np.array(i.convert("RGB"))[None, :] for i in image] + image = np.concatenate(image, axis=0) + elif isinstance(image, list) and isinstance(image[0], np.ndarray): + image = np.concatenate([i[None, :] for i in image], axis=0) + + image = image.transpose(0, 3, 1, 2) + image = torch.from_numpy(image).to(dtype=torch.float32) / 127.5 - 1.0 + + # preprocess mask + if isinstance(mask, (PIL.Image.Image, np.ndarray)): + mask = [mask] + + if isinstance(mask, list) and isinstance(mask[0], PIL.Image.Image): + mask = [i.resize((width, height), resample=PIL.Image.LANCZOS) for i in mask] + mask = np.concatenate([np.array(m.convert("L"))[None, None, :] for m in mask], axis=0) + mask = mask.astype(np.float32) / 255.0 + elif isinstance(mask, list) and isinstance(mask[0], np.ndarray): + mask = np.concatenate([m[None, None, :] for m in mask], axis=0) + + mask[mask < 0.5] = 0 + mask[mask >= 0.5] = 1 + mask = torch.from_numpy(mask) + + return mask, image + +class Kandinsky2_2_DecoderInpaintPipeline(DiffusionPipeline): + """ + Pipeline for text-to-image generation using Kandinsky + This model inherits from [`DiffusionPipeline`]. Check the superclass documentation for the generic methods the + library implements for all the pipelines (such as downloading or saving, running on a particular device, etc.) + Args: + text_encoder ([`MultilingualCLIP`]): + Frozen text-encoder. + tokenizer ([`XLMRobertaTokenizer`]): + Tokenizer of class + scheduler ([`DDIMScheduler`]): + A scheduler to be used in combination with `unet` to generate image latents. 
+ unet ([`UNet2DConditionModel`]): + Conditional U-Net architecture to denoise the image embedding. + movq ([`VQModel`]): + MoVQ Decoder to generate the image from the latents. + """ + + def __init__( + self, + unet: UNet2DConditionModel, + scheduler: DDPMScheduler, + vae: VQModel, + ): + super().__init__() + + self.register_modules( + unet=unet, + scheduler=scheduler, + vae=vae, + ) + self.vae_scale_factor = 2 ** (len(self.vae.config.block_out_channels) - 1) + + def prepare_latents(self, shape, dtype, device, generator, latents, scheduler): + if latents is None: + latents = randn_tensor(shape, generator=generator, device=device, dtype=dtype) + else: + if latents.shape != shape: + raise ValueError(f"Unexpected latents shape, got {latents.shape}, expected {shape}") + latents = latents.to(device) + + latents = latents * scheduler.init_noise_sigma + return latents + + + def enable_sequential_cpu_offload(self, gpu_id=0): + r""" + Offloads all models to CPU using accelerate, significantly reducing memory usage. When called, the pipeline's + models have their state dicts saved to CPU and then are moved to a `torch.device('meta') and loaded to GPU only + when their specific submodule has its `forward` method called. + """ + if is_accelerate_available(): + from accelerate import cpu_offload + else: + raise ImportError("Please install accelerate via `pip install accelerate`") + + device = torch.device(f"cuda:{gpu_id}") + + models = [ + self.unet, + self.vae, + ] + for cpu_offloaded_model in models: + if cpu_offloaded_model is not None: + cpu_offload(cpu_offloaded_model, device) + + def enable_model_cpu_offload(self, gpu_id=0): + r""" + Offloads all models to CPU using accelerate, reducing memory usage with a low impact on performance. Compared + to `enable_sequential_cpu_offload`, this method moves one whole model at a time to the GPU when its `forward` + method is called, and the model remains in GPU until the next model runs. Memory savings are lower than with + `enable_sequential_cpu_offload`, but performance is much better due to the iterative execution of the `unet`. + """ + if is_accelerate_available() and is_accelerate_version(">=", "0.17.0.dev0"): + from accelerate import cpu_offload_with_hook + else: + raise ImportError("`enable_model_cpu_offload` requires `accelerate v0.17.0` or higher.") + + device = torch.device(f"cuda:{gpu_id}") + + if self.device.type != "cpu": + self.to("cpu", silence_dtype_warnings=True) + torch.cuda.empty_cache() # otherwise we don't see the memory savings (but they probably exist) + + hook = None + for cpu_offloaded_model in [self.unet, self.vae]: + _, hook = cpu_offload_with_hook(cpu_offloaded_model, device, prev_module_hook=hook) + + if self.safety_checker is not None: + _, hook = cpu_offload_with_hook(self.safety_checker, device, prev_module_hook=hook) + + # We'll offload the last model manually. + self.final_offload_hook = hook + + @property + # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline._execution_device + def _execution_device(self): + r""" + Returns the device on which the pipeline's models will be executed. After calling + `pipeline.enable_sequential_cpu_offload()` the execution device can only be inferred from Accelerate's module + hooks. 
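        A short sketch of how the property behaves once offloading is enabled (checkpoint name reused
        from elsewhere in this PR; the exact `repr` may differ depending on the accelerate version):

        ```py
        >>> import torch
        >>> from diffusers import DiffusionPipeline

        >>> pipe = DiffusionPipeline.from_pretrained(
        ...     "kandinsky-community/kandinsky-2-2-decoder", torch_dtype=torch.float16
        ... )
        >>> pipe.enable_sequential_cpu_offload(gpu_id=0)
        >>> pipe._execution_device  # read back from the accelerate hook on the unet
        device(type='cuda', index=0)
        ```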
+ """ + if not hasattr(self.unet, "_hf_hook"): + return self.device + for module in self.unet.modules(): + if ( + hasattr(module, "_hf_hook") + and hasattr(module._hf_hook, "execution_device") + and module._hf_hook.execution_device is not None + ): + return torch.device(module._hf_hook.execution_device) + return self.device + + @torch.no_grad() + def __call__( + self, + image_embeds: Union[torch.FloatTensor, List[torch.FloatTensor]], + image: Union[torch.FloatTensor, PIL.Image.Image], + mask_image: Union[torch.FloatTensor, PIL.Image.Image, np.ndarray], + negative_image_embeds: Union[torch.FloatTensor, List[torch.FloatTensor]], + height: int = 512, + width: int = 512, + num_inference_steps: int = 100, + guidance_scale: float = 4.0, + num_images_per_prompt: int = 1, + generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None, + latents: Optional[torch.FloatTensor] = None, + output_type: Optional[str] = "pil", + return_dict: bool = True, + ): + """ + Function invoked when calling the pipeline for generation. + Args: + image_embeds (`torch.FloatTensor` or `List[torch.FloatTensor]`): + The clip image embeddings for text prompt, that will be used to condition the image generation. + image (`PIL.Image.Image`): + `Image`, or tensor representing an image batch which will be inpainted, *i.e.* parts of the image will + be masked out with `mask_image` and repainted according to `prompt`. + mask_image (`np.array`): + Tensor representing an image batch, to mask `image`. Black pixels in the mask will be + repainted, while white pixels will be preserved. If `mask_image` is a PIL image, it will be converted + to a single channel (luminance) before use. If it's a tensor, it should contain one color channel (L) + instead of 3, so the expected shape would be `(B, H, W, 1)`. + negative_image_embeds (`torch.FloatTensor` or `List[torch.FloatTensor]`): + The clip image embeddings for negative text prompt, will be used to condition the image generation. + height (`int`, *optional*, defaults to 512): + The height in pixels of the generated image. + width (`int`, *optional*, defaults to 512): + The width in pixels of the generated image. + num_inference_steps (`int`, *optional*, defaults to 100): + The number of denoising steps. More denoising steps usually lead to a higher quality image at the + expense of slower inference. + guidance_scale (`float`, *optional*, defaults to 4.0): + Guidance scale as defined in [Classifier-Free Diffusion Guidance](https://arxiv.org/abs/2207.12598). + `guidance_scale` is defined as `w` of equation 2. of [Imagen + Paper](https://arxiv.org/pdf/2205.11487.pdf). Guidance scale is enabled by setting `guidance_scale > + 1`. Higher guidance scale encourages to generate images that are closely linked to the text `prompt`, + usually at the expense of lower image quality. + num_images_per_prompt (`int`, *optional*, defaults to 1): + The number of images to generate per prompt. + generator (`torch.Generator` or `List[torch.Generator]`, *optional*): + One or a list of [torch generator(s)](https://pytorch.org/docs/stable/generated/torch.Generator.html) + to make generation deterministic. + latents (`torch.FloatTensor`, *optional*): + Pre-generated noisy latents, sampled from a Gaussian distribution, to be used as inputs for image + generation. Can be used to tweak the same generation with different prompts. If not provided, a latents + tensor will ge generated by sampling using the supplied random `generator`. 
+ output_type (`str`, *optional*, defaults to `"pil"`): + The output format of the generate image. Choose between: `"pil"` (`PIL.Image.Image`), `"np"` + (`np.array`) or `"pt"` (`torch.Tensor`). + return_dict (`bool`, *optional*, defaults to `True`): + Whether or not to return a [`~pipelines.ImagePipelineOutput`] instead of a plain tuple. + Examples: + Returns: + [`~pipelines.ImagePipelineOutput`] or `tuple` + """ + device = self._execution_device + + do_classifier_free_guidance = guidance_scale > 1.0 + + if isinstance(image_embeds, list): + image_embeds = torch.cat(image_embeds, dim=0) + batch_size = image_embeds.shape[0] + if isinstance(negative_image_embeds, list): + negative_image_embeds = torch.cat(negative_image_embeds, dim=0) + + if do_classifier_free_guidance: + image_embeds = image_embeds.repeat_interleave(num_images_per_prompt, dim=0) + negative_image_embeds = negative_image_embeds.repeat_interleave(num_images_per_prompt, dim=0) + + image_embeds = torch.cat([negative_image_embeds, image_embeds], dim=0).to( + dtype=self.unet.dtype, device=device + ) + + self.scheduler.set_timesteps(num_inference_steps, device=device) + timesteps_tensor = self.scheduler.timesteps + + # preprocess image and mask + mask_image, image = prepare_mask_and_masked_image(image, mask_image, height, width) + + image = image.to(dtype=image_embeds.dtype, device=device) + image = self.vae.encode(image)["latents"] + + mask_image = mask_image.to(dtype=image_embeds.dtype, device=device) + + image_shape = tuple(image.shape[-2:]) + mask_image = F.interpolate( + mask_image, + image_shape, + mode="nearest", + ) + mask_image = prepare_mask(mask_image) + masked_image = image * mask_image + + mask_image = mask_image.repeat_interleave(num_images_per_prompt, dim=0) + masked_image = masked_image.repeat_interleave(num_images_per_prompt, dim=0) + if do_classifier_free_guidance: + mask_image = mask_image.repeat(2, 1, 1, 1) + masked_image = masked_image.repeat(2, 1, 1, 1) + + num_channels_latents = self.vae.config.latent_channels + + height, width = get_new_h_w(height, width, self.vae_scale_factor) + + # create initial latent + latents = self.prepare_latents( + (batch_size, num_channels_latents, height, width), + image_embeds.dtype, + device, + generator, + latents, + self.scheduler, + ) + noise = torch.clone(latents) + for i, t in enumerate(self.progress_bar(timesteps_tensor)): + # expand the latents if we are doing classifier free guidance + latent_model_input = torch.cat([latents] * 2) if do_classifier_free_guidance else latents + latent_model_input = torch.cat([latent_model_input, masked_image, mask_image], dim=1) + + added_cond_kwargs = {"image_embeds": image_embeds} + noise_pred = self.unet( + sample=latent_model_input, + timestep=t, + encoder_hidden_states=None, + added_cond_kwargs=added_cond_kwargs, + return_dict=False, + )[0] + + if do_classifier_free_guidance: + noise_pred, variance_pred = noise_pred.split(latents.shape[1], dim=1) + noise_pred_uncond, noise_pred_text = noise_pred.chunk(2) + _, variance_pred_text = variance_pred.chunk(2) + noise_pred = noise_pred_uncond + guidance_scale * (noise_pred_text - noise_pred_uncond) + noise_pred = torch.cat([noise_pred, variance_pred_text], dim=1) + + if not ( + hasattr(self.scheduler.config, "variance_type") + and self.scheduler.config.variance_type in ["learned", "learned_range"] + ): + noise_pred, _ = noise_pred.split(latents.shape[1], dim=1) + + # compute the previous noisy sample x_t -> x_t-1 + latents = self.scheduler.step( + noise_pred, + t, + latents, + # YiYi notes: 
only reason this pipeline can't work with unclip scheduler is that can't pass down this argument + # need to use DDPM scheduler instead + # prev_timestep=prev_timestep, + generator=generator, + orig_sample=masked_image, + mask=mask_image + )[0] + init_latents_proper = image[:1] + init_mask = mask_image[:1] + + if i < len(timesteps_tensor) - 1: + noise_timestep = timesteps_tensor[i + 1] + init_latents_proper = self.scheduler.add_noise( + init_latents_proper, noise, torch.tensor([noise_timestep]) + ) + + latents = init_mask * init_latents_proper + (1 - init_mask) * latents + # post-processing + print(mask_image) + latents = mask_image[:1] * image[:1] + (1 - mask_image[:1]) * latents + image = self.vae.decode(latents, force_not_quantize=True)["sample"] + + if output_type not in ["pt", "np", "pil"]: + raise ValueError(f"Only the output types `pt`, `pil` and `np` are supported not output_type={output_type}") + + if output_type in ["np", "pil"]: + image = image * 0.5 + 0.5 + image = image.clamp(0, 1) + image = image.cpu().permute(0, 2, 3, 1).float().numpy() + + if output_type == "pil": + image = self.numpy_to_pil(image) + + if not return_dict: + return (image,) + + return ImagePipelineOutput(images=image) \ No newline at end of file diff --git a/src/diffusers/pipelines/kandinsky2_2/pipeline_kandinsky2_2_prior.py b/src/diffusers/pipelines/kandinsky2_2/pipeline_kandinsky2_2_prior.py new file mode 100644 index 000000000000..4684d14bf8d3 --- /dev/null +++ b/src/diffusers/pipelines/kandinsky2_2/pipeline_kandinsky2_2_prior.py @@ -0,0 +1,542 @@ +from dataclasses import dataclass +from typing import List, Optional, Union + +import numpy as np +import PIL +import torch +from transformers import CLIPImageProcessor, CLIPTextModelWithProjection, CLIPTokenizer, CLIPVisionModelWithProjection + +from ...models import PriorTransformer +from ...pipelines import DiffusionPipeline +from ...schedulers import UnCLIPScheduler +from ...utils import ( + BaseOutput, + is_accelerate_available, + logging, + randn_tensor, + replace_example_docstring, +) + + +logger = logging.get_logger(__name__) # pylint: disable=invalid-name + +EXAMPLE_DOC_STRING = """ + Examples: + ```py + >>> from diffusers import Kandinsky2_2_DecoderPipeline, Kandinsky2_2PriorPipeline + >>> import torch + >>> pipe_prior = Kandinsky2_2PriorPipeline.from_pretrained("kandinsky-community/kandinsky-2-2-prior") + >>> pipe_prior.to("cuda") + >>> prompt = "red cat, 4k photo" + >>> out = pipe_prior(prompt) + >>> image_emb = out.image_embeds + >>> zero_image_emb = out.negative_image_embeds + >>> pipe = Kandinsky2_2_DecoderPipeline.from_pretrained("kandinsky-community/kandinsky-2-2-decoder") + >>> pipe.to("cuda") + >>> image = pipe( + ... image_embeds=image_emb, + ... negative_image_embeds=zero_image_emb, + ... height=768, + ... width=768, + ... num_inference_steps=50, + ... ).images + >>> image[0].save("cat.png") + ``` +""" + +EXAMPLE_INTERPOLATE_DOC_STRING = """ + Examples: + ```py + >>> from diffusers import Kandinsky2_2PriorPipeline, Kandinsky2_2_DecoderPipeline + >>> from diffusers.utils import load_image + >>> import PIL + >>> import torch + >>> from torchvision import transforms + >>> pipe_prior = Kandinsky2_2PriorPipeline.from_pretrained( + ... "kandinsky-community/kandinsky-2-2-prior", torch_dtype=torch.float16 + ... ) + >>> pipe_prior.to("cuda") + >>> img1 = load_image( + ... "https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main" + ... "/kandinsky/cat.png" + ... ) + >>> img2 = load_image( + ... 
"https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main" + ... "/kandinsky/starry_night.jpeg" + ... ) + >>> images_texts = ["a cat", img1, img2] + >>> weights = [0.3, 0.3, 0.4] + >>> out = pipe_prior.interpolate(images_texts, weights) + >>> pipe = Kandinsky2_2_DecoderPipeline.from_pretrained("kandinsky-community/kandinsky-2-2-decoder", torch_dtype=torch.float16) + >>> pipe.to("cuda") + >>> image = pipe( + ... image_embeds=out.image_embeds, + ... negative_image_embeds=out.negative_image_embeds, + ... height=768, + ... width=768, + ... num_inference_steps=50, + ... ).images[0] + >>> image.save("starry_cat.png") + ``` +""" + + +@dataclass +class KandinskyPriorPipelineOutput(BaseOutput): + """ + Output class for KandinskyPriorPipeline. + Args: + image_embeds (`torch.FloatTensor`) + clip image embeddings for text prompt + negative_image_embeds (`List[PIL.Image.Image]` or `np.ndarray`) + clip image embeddings for unconditional tokens + """ + + image_embeds: Union[torch.FloatTensor, np.ndarray] + negative_image_embeds: Union[torch.FloatTensor, np.ndarray] + + +class Kandinsky2_2PriorPipeline(DiffusionPipeline): + """ + Pipeline for generating image prior for Kandinsky + This model inherits from [`DiffusionPipeline`]. Check the superclass documentation for the generic methods the + library implements for all the pipelines (such as downloading or saving, running on a particular device, etc.) + Args: + prior ([`PriorTransformer`]): + The canonincal unCLIP prior to approximate the image embedding from the text embedding. + image_encoder ([`CLIPVisionModelWithProjection`]): + Frozen image-encoder. + text_encoder ([`CLIPTextModelWithProjection`]): + Frozen text-encoder. + tokenizer (`CLIPTokenizer`): + Tokenizer of class + [CLIPTokenizer](https://huggingface.co/docs/transformers/v4.21.0/en/model_doc/clip#transformers.CLIPTokenizer). + scheduler ([`UnCLIPScheduler`]): + A scheduler to be used in combination with `prior` to generate image embedding. + image_processor ([`CLIPImageProcessor`]): + A image_processor to be used to preprocess image from clip. + """ + + def __init__( + self, + prior: PriorTransformer, + image_encoder: CLIPVisionModelWithProjection, + text_encoder: CLIPTextModelWithProjection, + tokenizer: CLIPTokenizer, + scheduler: UnCLIPScheduler, + image_processor: CLIPImageProcessor, + ): + super().__init__() + + self.register_modules( + prior=prior, + text_encoder=text_encoder, + tokenizer=tokenizer, + scheduler=scheduler, + image_encoder=image_encoder, + image_processor=image_processor, + ) + + @torch.no_grad() + @replace_example_docstring(EXAMPLE_INTERPOLATE_DOC_STRING) + def interpolate( + self, + images_and_prompts: List[Union[str, PIL.Image.Image, torch.FloatTensor]], + weights: List[float], + num_images_per_prompt: int = 1, + num_inference_steps: int = 25, + generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None, + latents: Optional[torch.FloatTensor] = None, + negative_prior_prompt: Optional[str] = None, + negative_prompt: Union[str] = "", + guidance_scale: float = 4.0, + device=None, + ): + """ + Function invoked when using the prior pipeline for interpolation. + Args: + images_and_prompts (`List[Union[str, PIL.Image.Image, torch.FloatTensor]]`): + list of prompts and images to guide the image generation. + weights: (`List[float]`): + list of weights for each condition in `images_and_prompts` + num_images_per_prompt (`int`, *optional*, defaults to 1): + The number of images to generate per prompt. 
+ num_inference_steps (`int`, *optional*, defaults to 100): + The number of denoising steps. More denoising steps usually lead to a higher quality image at the + expense of slower inference. + generator (`torch.Generator` or `List[torch.Generator]`, *optional*): + One or a list of [torch generator(s)](https://pytorch.org/docs/stable/generated/torch.Generator.html) + to make generation deterministic. + latents (`torch.FloatTensor`, *optional*): + Pre-generated noisy latents, sampled from a Gaussian distribution, to be used as inputs for image + generation. Can be used to tweak the same generation with different prompts. If not provided, a latents + tensor will ge generated by sampling using the supplied random `generator`. + negative_prior_prompt (`str`, *optional*): + The prompt not to guide the prior diffusion process. Ignored when not using guidance (i.e., ignored if + `guidance_scale` is less than `1`). + negative_prompt (`str` or `List[str]`, *optional*): + The prompt not to guide the image generation. Ignored when not using guidance (i.e., ignored if + `guidance_scale` is less than `1`). + guidance_scale (`float`, *optional*, defaults to 4.0): + Guidance scale as defined in [Classifier-Free Diffusion Guidance](https://arxiv.org/abs/2207.12598). + `guidance_scale` is defined as `w` of equation 2. of [Imagen + Paper](https://arxiv.org/pdf/2205.11487.pdf). Guidance scale is enabled by setting `guidance_scale > + 1`. Higher guidance scale encourages to generate images that are closely linked to the text `prompt`, + usually at the expense of lower image quality. + Examples: + Returns: + [`KandinskyPriorPipelineOutput`] or `tuple` + """ + + device = device or self.device + + if len(images_and_prompts) != len(weights): + raise ValueError( + f"`images_and_prompts` contains {len(images_and_prompts)} items and `weights` contains {len(weights)} items - they should be lists of same length" + ) + + image_embeddings = [] + for cond, weight in zip(images_and_prompts, weights): + if isinstance(cond, str): + image_emb = self( + cond, + num_inference_steps=num_inference_steps, + num_images_per_prompt=num_images_per_prompt, + generator=generator, + latents=latents, + negative_prompt=negative_prior_prompt, + guidance_scale=guidance_scale, + ).image_embeds.unsqueeze(0) + + elif isinstance(cond, (PIL.Image.Image, torch.Tensor)): + if isinstance(cond, PIL.Image.Image): + cond = ( + self.image_processor(cond, return_tensors="pt") + .pixel_values[0] + .unsqueeze(0) + .to(dtype=self.image_encoder.dtype, device=device) + ) + + image_emb = self.image_encoder(cond)["image_embeds"].repeat(num_images_per_prompt, 1).unsqueeze(0) + + else: + raise ValueError( + f"`images_and_prompts` can only contains elements to be of type `str`, `PIL.Image.Image` or `torch.Tensor` but is {type(cond)}" + ) + + image_embeddings.append(image_emb * weight) + + image_emb = torch.cat(image_embeddings).sum(dim=0) + + out_zero = self( + negative_prompt, + num_inference_steps=num_inference_steps, + num_images_per_prompt=num_images_per_prompt, + generator=generator, + latents=latents, + negative_prompt=negative_prior_prompt, + guidance_scale=guidance_scale, + ) + zero_image_emb = out_zero.negative_image_embeds if negative_prompt == "" else out_zero.image_embeds + + return KandinskyPriorPipelineOutput(image_embeds=image_emb, negative_image_embeds=zero_image_emb) + + def prepare_latents(self, shape, dtype, device, generator, latents, scheduler): + if latents is None: + latents = randn_tensor(shape, generator=generator, device=device, 
dtype=dtype) + else: + if latents.shape != shape: + raise ValueError(f"Unexpected latents shape, got {latents.shape}, expected {shape}") + latents = latents.to(device) + + latents = latents * scheduler.init_noise_sigma + return latents + + def get_zero_embed(self, batch_size=1, device=None): + device = device or self.device + zero_img = torch.zeros(1, 3, self.image_encoder.config.image_size, self.image_encoder.config.image_size).to( + device=device, dtype=self.image_encoder.dtype + ) + zero_image_emb = self.image_encoder(zero_img)["image_embeds"] + zero_image_emb = zero_image_emb.repeat(batch_size, 1) + return zero_image_emb + + def enable_sequential_cpu_offload(self, gpu_id=0): + r""" + Offloads all models to CPU using accelerate, significantly reducing memory usage. When called, the pipeline's + models have their state dicts saved to CPU and then are moved to a `torch.device('meta') and loaded to GPU only + when their specific submodule has its `forward` method called. + """ + if is_accelerate_available(): + from accelerate import cpu_offload + else: + raise ImportError("Please install accelerate via `pip install accelerate`") + + device = torch.device(f"cuda:{gpu_id}") + + models = [ + self.image_encoder, + self.text_encoder, + ] + for cpu_offloaded_model in models: + if cpu_offloaded_model is not None: + cpu_offload(cpu_offloaded_model, device) + + @property + def _execution_device(self): + r""" + Returns the device on which the pipeline's models will be executed. After calling + `pipeline.enable_sequential_cpu_offload()` the execution device can only be inferred from Accelerate's module + hooks. + """ + if self.device != torch.device("meta") or not hasattr(self.text_encoder, "_hf_hook"): + return self.device + for module in self.text_encoder.modules(): + if ( + hasattr(module, "_hf_hook") + and hasattr(module._hf_hook, "execution_device") + and module._hf_hook.execution_device is not None + ): + return torch.device(module._hf_hook.execution_device) + return self.device + + def _encode_prompt( + self, + prompt, + device, + num_images_per_prompt, + do_classifier_free_guidance, + negative_prompt=None, + ): + batch_size = len(prompt) if isinstance(prompt, list) else 1 + # get prompt text embeddings + text_inputs = self.tokenizer( + prompt, + padding="max_length", + max_length=self.tokenizer.model_max_length, + truncation=True, + return_tensors="pt", + ) + text_input_ids = text_inputs.input_ids + text_mask = text_inputs.attention_mask.bool().to(device) + + untruncated_ids = self.tokenizer(prompt, padding="longest", return_tensors="pt").input_ids + + if untruncated_ids.shape[-1] >= text_input_ids.shape[-1] and not torch.equal(text_input_ids, untruncated_ids): + removed_text = self.tokenizer.batch_decode(untruncated_ids[:, self.tokenizer.model_max_length - 1 : -1]) + logger.warning( + "The following part of your input was truncated because CLIP can only handle sequences up to" + f" {self.tokenizer.model_max_length} tokens: {removed_text}" + ) + text_input_ids = text_input_ids[:, : self.tokenizer.model_max_length] + + text_encoder_output = self.text_encoder(text_input_ids.to(device)) + + prompt_embeds = text_encoder_output.text_embeds + text_encoder_hidden_states = text_encoder_output.last_hidden_state + + prompt_embeds = prompt_embeds.repeat_interleave(num_images_per_prompt, dim=0) + text_encoder_hidden_states = text_encoder_hidden_states.repeat_interleave(num_images_per_prompt, dim=0) + text_mask = text_mask.repeat_interleave(num_images_per_prompt, dim=0) + + if do_classifier_free_guidance: 
+ uncond_tokens: List[str] + if negative_prompt is None: + uncond_tokens = [""] * batch_size + elif type(prompt) is not type(negative_prompt): + raise TypeError( + f"`negative_prompt` should be the same type to `prompt`, but got {type(negative_prompt)} !=" + f" {type(prompt)}." + ) + elif isinstance(negative_prompt, str): + uncond_tokens = [negative_prompt] + elif batch_size != len(negative_prompt): + raise ValueError( + f"`negative_prompt`: {negative_prompt} has batch size {len(negative_prompt)}, but `prompt`:" + f" {prompt} has batch size {batch_size}. Please make sure that passed `negative_prompt` matches" + " the batch size of `prompt`." + ) + else: + uncond_tokens = negative_prompt + + uncond_input = self.tokenizer( + uncond_tokens, + padding="max_length", + max_length=self.tokenizer.model_max_length, + truncation=True, + return_tensors="pt", + ) + uncond_text_mask = uncond_input.attention_mask.bool().to(device) + negative_prompt_embeds_text_encoder_output = self.text_encoder(uncond_input.input_ids.to(device)) + + negative_prompt_embeds = negative_prompt_embeds_text_encoder_output.text_embeds + uncond_text_encoder_hidden_states = negative_prompt_embeds_text_encoder_output.last_hidden_state + + # duplicate unconditional embeddings for each generation per prompt, using mps friendly method + + seq_len = negative_prompt_embeds.shape[1] + negative_prompt_embeds = negative_prompt_embeds.repeat(1, num_images_per_prompt) + negative_prompt_embeds = negative_prompt_embeds.view(batch_size * num_images_per_prompt, seq_len) + + seq_len = uncond_text_encoder_hidden_states.shape[1] + uncond_text_encoder_hidden_states = uncond_text_encoder_hidden_states.repeat(1, num_images_per_prompt, 1) + uncond_text_encoder_hidden_states = uncond_text_encoder_hidden_states.view( + batch_size * num_images_per_prompt, seq_len, -1 + ) + uncond_text_mask = uncond_text_mask.repeat_interleave(num_images_per_prompt, dim=0) + + # done duplicates + + # For classifier free guidance, we need to do two forward passes. + # Here we concatenate the unconditional and text embeddings into a single batch + # to avoid doing two forward passes + prompt_embeds = torch.cat([negative_prompt_embeds, prompt_embeds]) + text_encoder_hidden_states = torch.cat([uncond_text_encoder_hidden_states, text_encoder_hidden_states]) + + text_mask = torch.cat([uncond_text_mask, text_mask]) + + return prompt_embeds, text_encoder_hidden_states, text_mask + + @torch.no_grad() + @replace_example_docstring(EXAMPLE_DOC_STRING) + def __call__( + self, + prompt: Union[str, List[str]], + negative_prompt: Optional[Union[str, List[str]]] = None, + num_images_per_prompt: int = 1, + num_inference_steps: int = 25, + generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None, + latents: Optional[torch.FloatTensor] = None, + guidance_scale: float = 4.0, + output_type: Optional[str] = "pt", # pt only + return_dict: bool = True, + ): + """ + Function invoked when calling the pipeline for generation. + Args: + prompt (`str` or `List[str]`): + The prompt or prompts to guide the image generation. + negative_prompt (`str` or `List[str]`, *optional*): + The prompt or prompts not to guide the image generation. Ignored when not using guidance (i.e., ignored + if `guidance_scale` is less than `1`). + num_images_per_prompt (`int`, *optional*, defaults to 1): + The number of images to generate per prompt. + num_inference_steps (`int`, *optional*, defaults to 100): + The number of denoising steps. 
More denoising steps usually lead to a higher quality image at the + expense of slower inference. + generator (`torch.Generator` or `List[torch.Generator]`, *optional*): + One or a list of [torch generator(s)](https://pytorch.org/docs/stable/generated/torch.Generator.html) + to make generation deterministic. + latents (`torch.FloatTensor`, *optional*): + Pre-generated noisy latents, sampled from a Gaussian distribution, to be used as inputs for image + generation. Can be used to tweak the same generation with different prompts. If not provided, a latents + tensor will ge generated by sampling using the supplied random `generator`. + guidance_scale (`float`, *optional*, defaults to 4.0): + Guidance scale as defined in [Classifier-Free Diffusion Guidance](https://arxiv.org/abs/2207.12598). + `guidance_scale` is defined as `w` of equation 2. of [Imagen + Paper](https://arxiv.org/pdf/2205.11487.pdf). Guidance scale is enabled by setting `guidance_scale > + 1`. Higher guidance scale encourages to generate images that are closely linked to the text `prompt`, + usually at the expense of lower image quality. + output_type (`str`, *optional*, defaults to `"pt"`): + The output format of the generate image. Choose between: `"np"` (`np.array`) or `"pt"` + (`torch.Tensor`). + return_dict (`bool`, *optional*, defaults to `True`): + Whether or not to return a [`~pipelines.ImagePipelineOutput`] instead of a plain tuple. + Examples: + Returns: + [`KandinskyPriorPipelineOutput`] or `tuple` + """ + + if isinstance(prompt, str): + prompt = [prompt] + elif not isinstance(prompt, list): + raise ValueError(f"`prompt` has to be of type `str` or `list` but is {type(prompt)}") + + if isinstance(negative_prompt, str): + negative_prompt = [negative_prompt] + elif not isinstance(negative_prompt, list) and negative_prompt is not None: + raise ValueError(f"`negative_prompt` has to be of type `str` or `list` but is {type(negative_prompt)}") + + # if the negative prompt is defined we double the batch size to + # directly retrieve the negative prompt embedding + if negative_prompt is not None: + prompt = prompt + negative_prompt + negative_prompt = 2 * negative_prompt + + device = self._execution_device + + batch_size = len(prompt) + batch_size = batch_size * num_images_per_prompt + + do_classifier_free_guidance = guidance_scale > 1.0 + prompt_embeds, text_encoder_hidden_states, text_mask = self._encode_prompt( + prompt, device, num_images_per_prompt, do_classifier_free_guidance, negative_prompt + ) + + # prior + self.scheduler.set_timesteps(num_inference_steps, device=device) + prior_timesteps_tensor = self.scheduler.timesteps + + embedding_dim = self.prior.config.embedding_dim + + latents = self.prepare_latents( + (batch_size, embedding_dim), + prompt_embeds.dtype, + device, + generator, + latents, + self.scheduler, + ) + + for i, t in enumerate(self.progress_bar(prior_timesteps_tensor)): + # expand the latents if we are doing classifier free guidance + latent_model_input = torch.cat([latents] * 2) if do_classifier_free_guidance else latents + + predicted_image_embedding = self.prior( + latent_model_input, + timestep=t, + proj_embedding=prompt_embeds, + encoder_hidden_states=text_encoder_hidden_states, + attention_mask=text_mask, + ).predicted_image_embedding + + if do_classifier_free_guidance: + predicted_image_embedding_uncond, predicted_image_embedding_text = predicted_image_embedding.chunk(2) + predicted_image_embedding = predicted_image_embedding_uncond + guidance_scale * ( + predicted_image_embedding_text - 
predicted_image_embedding_uncond + ) + + if i + 1 == prior_timesteps_tensor.shape[0]: + prev_timestep = None + else: + prev_timestep = prior_timesteps_tensor[i + 1] + + latents = self.scheduler.step( + predicted_image_embedding, + timestep=t, + sample=latents, + generator=generator, + prev_timestep=prev_timestep, + ).prev_sample + + latents = self.prior.post_process_latents(latents) + + image_embeddings = latents + + # if negative prompt has been defined, we retrieve split the image embedding into two + if negative_prompt is None: + zero_embeds = self.get_zero_embed(latents.shape[0], device=latents.device) + else: + image_embeddings, zero_embeds = image_embeddings.chunk(2) + + if output_type not in ["pt", "np"]: + raise ValueError(f"Only the output types `pt` and `np` are supported not output_type={output_type}") + + if output_type == "np": + image_embeddings = image_embeddings.cpu().numpy() + zero_embeds = zero_embeds.cpu().numpy() + + if not return_dict: + return (image_embeddings, zero_embeds) + + return KandinskyPriorPipelineOutput(image_embeds=image_embeddings, negative_image_embeds=zero_embeds) \ No newline at end of file diff --git a/src/diffusers/pipelines/kandinsky2_2/pipeline_kandinsky2_2_prior_emb2emb.py b/src/diffusers/pipelines/kandinsky2_2/pipeline_kandinsky2_2_prior_emb2emb.py new file mode 100644 index 000000000000..83a7343f4106 --- /dev/null +++ b/src/diffusers/pipelines/kandinsky2_2/pipeline_kandinsky2_2_prior_emb2emb.py @@ -0,0 +1,496 @@ +from dataclasses import dataclass +from typing import List, Optional, Union + +import numpy as np +import PIL +import torch +from transformers import CLIPImageProcessor, CLIPTextModelWithProjection, CLIPTokenizer, CLIPVisionModelWithProjection + +from ...models import PriorTransformer +from ...pipelines import DiffusionPipeline +from ...schedulers import UnCLIPScheduler +from ...utils import ( + BaseOutput, + is_accelerate_available, + logging, + randn_tensor, + replace_example_docstring, +) + + +logger = logging.get_logger(__name__) # pylint: disable=invalid-name +@dataclass +class KandinskyPriorPipelineOutput(BaseOutput): + """ + Output class for KandinskyPriorPipeline. + Args: + image_embeds (`torch.FloatTensor`) + clip image embeddings for text prompt + negative_image_embeds (`List[PIL.Image.Image]` or `np.ndarray`) + clip image embeddings for unconditional tokens + """ + + image_embeds: Union[torch.FloatTensor, np.ndarray] + negative_image_embeds: Union[torch.FloatTensor, np.ndarray] + + +class Kandinsky2_2PriorEmb2EmbPipeline(DiffusionPipeline): + """ + Pipeline for generating image prior for Kandinsky + This model inherits from [`DiffusionPipeline`]. Check the superclass documentation for the generic methods the + library implements for all the pipelines (such as downloading or saving, running on a particular device, etc.) + Args: + prior ([`PriorTransformer`]): + The canonincal unCLIP prior to approximate the image embedding from the text embedding. + image_encoder ([`CLIPVisionModelWithProjection`]): + Frozen image-encoder. + text_encoder ([`CLIPTextModelWithProjection`]): + Frozen text-encoder. + tokenizer (`CLIPTokenizer`): + Tokenizer of class + [CLIPTokenizer](https://huggingface.co/docs/transformers/v4.21.0/en/model_doc/clip#transformers.CLIPTokenizer). + scheduler ([`UnCLIPScheduler`]): + A scheduler to be used in combination with `prior` to generate image embedding. 
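    The class has no example docstring yet, so here is a rough usage sketch; the import path and
    checkpoint name are assumed to mirror the text-to-image prior pipeline above:

    ```py
    >>> import torch
    >>> from diffusers import Kandinsky2_2PriorEmb2EmbPipeline  # import path assumed

    >>> pipe_prior = Kandinsky2_2PriorEmb2EmbPipeline.from_pretrained(
    ...     "kandinsky-community/kandinsky-2-2-prior", torch_dtype=torch.float16
    ... )
    >>> pipe_prior.to("cuda")

    >>> # start from an existing CLIP image embedding; here simply the pipeline's "zero" embedding
    >>> emb = pipe_prior.get_zero_embed(batch_size=1, device="cuda")

    >>> out = pipe_prior("red cat, 4k photo", emb=emb, strength=0.3, num_inference_steps=25)
    >>> image_emb = out.image_embeds  # pass to the Kandinsky 2.2 decoder pipeline
    ```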
+ """ + + def __init__( + self, + prior: PriorTransformer, + image_encoder: CLIPVisionModelWithProjection, + text_encoder: CLIPTextModelWithProjection, + tokenizer: CLIPTokenizer, + scheduler: UnCLIPScheduler, + image_processor: CLIPImageProcessor, + ): + super().__init__() + + self.register_modules( + prior=prior, + text_encoder=text_encoder, + tokenizer=tokenizer, + scheduler=scheduler, + image_encoder=image_encoder, + image_processor=image_processor, + ) + + def get_timesteps(self, num_inference_steps, strength, device): + # get the original timestep using init_timestep + init_timestep = min(int(num_inference_steps * strength), num_inference_steps) + + t_start = max(num_inference_steps - init_timestep, 0) + timesteps = self.scheduler.timesteps[t_start:] + + return timesteps, num_inference_steps - t_start + + @torch.no_grad() + def interpolate( + self, + images_and_prompts: List[Union[str, PIL.Image.Image, torch.FloatTensor]], + weights: List[float], + num_images_per_prompt: int = 1, + num_inference_steps: int = 25, + generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None, + latents: Optional[torch.FloatTensor] = None, + negative_prior_prompt: Optional[str] = None, + negative_prompt: Union[str] = "", + guidance_scale: float = 4.0, + device=None, + ): + """ + Function invoked when using the prior pipeline for interpolation. + Args: + images_and_prompts (`List[Union[str, PIL.Image.Image, torch.FloatTensor]]`): + list of prompts and images to guide the image generation. + weights: (`List[float]`): + list of weights for each condition in `images_and_prompts` + num_images_per_prompt (`int`, *optional*, defaults to 1): + The number of images to generate per prompt. + num_inference_steps (`int`, *optional*, defaults to 100): + The number of denoising steps. More denoising steps usually lead to a higher quality image at the + expense of slower inference. + generator (`torch.Generator` or `List[torch.Generator]`, *optional*): + One or a list of [torch generator(s)](https://pytorch.org/docs/stable/generated/torch.Generator.html) + to make generation deterministic. + latents (`torch.FloatTensor`, *optional*): + Pre-generated noisy latents, sampled from a Gaussian distribution, to be used as inputs for image + generation. Can be used to tweak the same generation with different prompts. If not provided, a latents + tensor will ge generated by sampling using the supplied random `generator`. + negative_prior_prompt (`str`, *optional*): + The prompt not to guide the prior diffusion process. Ignored when not using guidance (i.e., ignored if + `guidance_scale` is less than `1`). + negative_prompt (`str` or `List[str]`, *optional*): + The prompt not to guide the image generation. Ignored when not using guidance (i.e., ignored if + `guidance_scale` is less than `1`). + guidance_scale (`float`, *optional*, defaults to 4.0): + Guidance scale as defined in [Classifier-Free Diffusion Guidance](https://arxiv.org/abs/2207.12598). + `guidance_scale` is defined as `w` of equation 2. of [Imagen + Paper](https://arxiv.org/pdf/2205.11487.pdf). Guidance scale is enabled by setting `guidance_scale > + 1`. Higher guidance scale encourages to generate images that are closely linked to the text `prompt`, + usually at the expense of lower image quality. 
+ Examples: + Returns: + [`KandinskyPriorPipelineOutput`] or `tuple` + """ + + device = device or self.device + + if len(images_and_prompts) != len(weights): + raise ValueError( + f"`images_and_prompts` contains {len(images_and_prompts)} items and `weights` contains {len(weights)} items - they should be lists of same length" + ) + + image_embeddings = [] + for cond, weight in zip(images_and_prompts, weights): + if isinstance(cond, str): + image_emb = self( + cond, + num_inference_steps=num_inference_steps, + num_images_per_prompt=num_images_per_prompt, + generator=generator, + latents=latents, + negative_prompt=negative_prior_prompt, + guidance_scale=guidance_scale, + ).image_embeds.unsqueeze(0) + + elif isinstance(cond, (PIL.Image.Image, torch.Tensor)): + if isinstance(cond, PIL.Image.Image): + cond = ( + self.image_processor(cond, return_tensors="pt") + .pixel_values[0] + .unsqueeze(0) + .to(dtype=self.image_encoder.dtype, device=device) + ) + + image_emb = self.image_encoder(cond)["image_embeds"].repeat(num_images_per_prompt, 1).unsqueeze(0) + + else: + raise ValueError( + f"`images_and_prompts` can only contains elements to be of type `str`, `PIL.Image.Image` or `torch.Tensor` but is {type(cond)}" + ) + + image_embeddings.append(image_emb * weight) + + image_emb = torch.cat(image_embeddings).sum(dim=0) + + + return KandinskyPriorPipelineOutput(image_embeds=image_emb, negative_image_embeds=torch.randn_like(image_emb)) + + def prepare_latents(self, emb, timestep, batch_size, num_images_per_prompt, dtype, device, generator=None): + + emb = emb.to(device=device, dtype=dtype) + + batch_size = batch_size * num_images_per_prompt + + init_latents = emb + + if batch_size > init_latents.shape[0] and batch_size % init_latents.shape[0] == 0: + additional_image_per_prompt = batch_size // init_latents.shape[0] + init_latents = torch.cat([init_latents] * additional_image_per_prompt, dim=0) + elif batch_size > init_latents.shape[0] and batch_size % init_latents.shape[0] != 0: + raise ValueError( + f"Cannot duplicate `image` of batch size {init_latents.shape[0]} to {batch_size} text prompts." + ) + else: + init_latents = torch.cat([init_latents], dim=0) + + shape = init_latents.shape + noise = randn_tensor(shape, generator=generator, device=device, dtype=dtype) + + # get latents + try: + init_latents = self.scheduler.add_noise(init_latents, noise, timestep) + except: + print('error') + latents = init_latents + + return latents + def get_zero_embed(self, batch_size=1, device=None): + device = device or self.device + zero_img = torch.zeros(1, 3, self.image_encoder.config.image_size, self.image_encoder.config.image_size).to( + device=device, dtype=self.image_encoder.dtype + ) + zero_image_emb = self.image_encoder(zero_img)["image_embeds"] + zero_image_emb = zero_image_emb.repeat(batch_size, 1) + return zero_image_emb + + def enable_sequential_cpu_offload(self, gpu_id=0): + r""" + Offloads all models to CPU using accelerate, significantly reducing memory usage. When called, the pipeline's + models have their state dicts saved to CPU and then are moved to a `torch.device('meta') and loaded to GPU only + when their specific submodule has its `forward` method called. 
+ """ + if is_accelerate_available(): + from accelerate import cpu_offload + else: + raise ImportError("Please install accelerate via `pip install accelerate`") + + device = torch.device(f"cuda:{gpu_id}") + + models = [ + self.image_encoder, + self.text_encoder, + ] + for cpu_offloaded_model in models: + if cpu_offloaded_model is not None: + cpu_offload(cpu_offloaded_model, device) + + @property + def _execution_device(self): + r""" + Returns the device on which the pipeline's models will be executed. After calling + `pipeline.enable_sequential_cpu_offload()` the execution device can only be inferred from Accelerate's module + hooks. + """ + if self.device != torch.device("meta") or not hasattr(self.text_encoder, "_hf_hook"): + return self.device + for module in self.text_encoder.modules(): + if ( + hasattr(module, "_hf_hook") + and hasattr(module._hf_hook, "execution_device") + and module._hf_hook.execution_device is not None + ): + return torch.device(module._hf_hook.execution_device) + return self.device + + def _encode_prompt( + self, + prompt, + device, + num_images_per_prompt, + do_classifier_free_guidance, + negative_prompt=None, + ): + batch_size = len(prompt) if isinstance(prompt, list) else 1 + # get prompt text embeddings + text_inputs = self.tokenizer( + prompt, + padding="max_length", + max_length=self.tokenizer.model_max_length, + truncation=True, + return_tensors="pt", + ) + text_input_ids = text_inputs.input_ids + text_mask = text_inputs.attention_mask.bool().to(device) + + untruncated_ids = self.tokenizer(prompt, padding="longest", return_tensors="pt").input_ids + + if untruncated_ids.shape[-1] >= text_input_ids.shape[-1] and not torch.equal(text_input_ids, untruncated_ids): + removed_text = self.tokenizer.batch_decode(untruncated_ids[:, self.tokenizer.model_max_length - 1 : -1]) + logger.warning( + "The following part of your input was truncated because CLIP can only handle sequences up to" + f" {self.tokenizer.model_max_length} tokens: {removed_text}" + ) + text_input_ids = text_input_ids[:, : self.tokenizer.model_max_length] + + text_encoder_output = self.text_encoder(text_input_ids.to(device)) + + prompt_embeds = text_encoder_output.text_embeds + text_encoder_hidden_states = text_encoder_output.last_hidden_state + + prompt_embeds = prompt_embeds.repeat_interleave(num_images_per_prompt, dim=0) + text_encoder_hidden_states = text_encoder_hidden_states.repeat_interleave(num_images_per_prompt, dim=0) + text_mask = text_mask.repeat_interleave(num_images_per_prompt, dim=0) + + if do_classifier_free_guidance: + uncond_tokens: List[str] + if negative_prompt is None: + uncond_tokens = [""] * batch_size + elif type(prompt) is not type(negative_prompt): + raise TypeError( + f"`negative_prompt` should be the same type to `prompt`, but got {type(negative_prompt)} !=" + f" {type(prompt)}." + ) + elif isinstance(negative_prompt, str): + uncond_tokens = [negative_prompt] + elif batch_size != len(negative_prompt): + raise ValueError( + f"`negative_prompt`: {negative_prompt} has batch size {len(negative_prompt)}, but `prompt`:" + f" {prompt} has batch size {batch_size}. Please make sure that passed `negative_prompt` matches" + " the batch size of `prompt`." 
+                )
+            else:
+                uncond_tokens = negative_prompt
+
+            uncond_input = self.tokenizer(
+                uncond_tokens,
+                padding="max_length",
+                max_length=self.tokenizer.model_max_length,
+                truncation=True,
+                return_tensors="pt",
+            )
+            uncond_text_mask = uncond_input.attention_mask.bool().to(device)
+            negative_prompt_embeds_text_encoder_output = self.text_encoder(uncond_input.input_ids.to(device))
+
+            negative_prompt_embeds = negative_prompt_embeds_text_encoder_output.text_embeds
+            uncond_text_encoder_hidden_states = negative_prompt_embeds_text_encoder_output.last_hidden_state
+
+            # duplicate unconditional embeddings for each generation per prompt, using mps friendly method
+
+            seq_len = negative_prompt_embeds.shape[1]
+            negative_prompt_embeds = negative_prompt_embeds.repeat(1, num_images_per_prompt)
+            negative_prompt_embeds = negative_prompt_embeds.view(batch_size * num_images_per_prompt, seq_len)
+
+            seq_len = uncond_text_encoder_hidden_states.shape[1]
+            uncond_text_encoder_hidden_states = uncond_text_encoder_hidden_states.repeat(1, num_images_per_prompt, 1)
+            uncond_text_encoder_hidden_states = uncond_text_encoder_hidden_states.view(
+                batch_size * num_images_per_prompt, seq_len, -1
+            )
+            uncond_text_mask = uncond_text_mask.repeat_interleave(num_images_per_prompt, dim=0)
+
+            # done duplicates
+
+            # For classifier free guidance, we need to do two forward passes.
+            # Here we concatenate the unconditional and text embeddings into a single batch
+            # to avoid doing two forward passes
+            prompt_embeds = torch.cat([negative_prompt_embeds, prompt_embeds])
+            text_encoder_hidden_states = torch.cat([uncond_text_encoder_hidden_states, text_encoder_hidden_states])
+
+            text_mask = torch.cat([uncond_text_mask, text_mask])
+
+        return prompt_embeds, text_encoder_hidden_states, text_mask
+
+    @torch.no_grad()
+    def __call__(
+        self,
+        prompt: Union[str, List[str]],
+        emb: torch.FloatTensor,
+        strength: float = 0.3,
+        negative_prompt: Optional[Union[str, List[str]]] = None,
+        num_images_per_prompt: int = 1,
+        num_inference_steps: int = 25,
+        generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None,
+        latents: Optional[torch.FloatTensor] = None,
+        guidance_scale: float = 4.0,
+        output_type: Optional[str] = "pt",  # pt only
+        return_dict: bool = True,
+    ):
+        """
+        Function invoked when calling the pipeline for generation.
+        Args:
+            prompt (`str` or `List[str]`):
+                The prompt or prompts to guide the image generation.
+            strength (`float`, *optional*, defaults to 0.3):
+                Conceptually, indicates how much to transform the reference `emb`. Must be between 0 and 1. `emb`
+                will be used as a starting point, adding more noise to it the larger the `strength`. The number of
+                denoising steps depends on the amount of noise initially added.
+            emb (`torch.FloatTensor`):
+                The image embedding.
+            negative_prompt (`str` or `List[str]`, *optional*):
+                The prompt or prompts not to guide the image generation. Ignored when not using guidance (i.e.,
+                ignored if `guidance_scale` is less than `1`).
+            num_images_per_prompt (`int`, *optional*, defaults to 1):
+                The number of images to generate per prompt.
+            num_inference_steps (`int`, *optional*, defaults to 25):
+                The number of denoising steps. More denoising steps usually lead to a higher quality image at the
+                expense of slower inference.
+            generator (`torch.Generator` or `List[torch.Generator]`, *optional*):
+                One or a list of [torch generator(s)](https://pytorch.org/docs/stable/generated/torch.Generator.html)
+                to make generation deterministic.
+            latents (`torch.FloatTensor`, *optional*):
+                Pre-generated noisy latents, sampled from a Gaussian distribution, to be used as inputs for image
+                generation. Can be used to tweak the same generation with different prompts. If not provided, a
+                latents tensor will be generated by sampling using the supplied random `generator`.
+            guidance_scale (`float`, *optional*, defaults to 4.0):
+                Guidance scale as defined in [Classifier-Free Diffusion Guidance](https://arxiv.org/abs/2207.12598).
+                `guidance_scale` is defined as `w` of equation 2. of [Imagen
+                Paper](https://arxiv.org/pdf/2205.11487.pdf). Guidance scale is enabled by setting `guidance_scale >
+                1`. Higher guidance scale encourages generating images that are closely linked to the text `prompt`,
+                usually at the expense of lower image quality.
+            output_type (`str`, *optional*, defaults to `"pt"`):
+                The output format of the generated image. Choose between: `"np"` (`np.array`) or `"pt"`
+                (`torch.Tensor`).
+            return_dict (`bool`, *optional*, defaults to `True`):
+                Whether or not to return a [`~pipelines.ImagePipelineOutput`] instead of a plain tuple.
+        Examples:
+        Returns:
+            [`KandinskyPriorPipelineOutput`] or `tuple`
+        """
+
+        if isinstance(prompt, str):
+            prompt = [prompt]
+        elif not isinstance(prompt, list):
+            raise ValueError(f"`prompt` has to be of type `str` or `list` but is {type(prompt)}")
+
+        if isinstance(negative_prompt, str):
+            negative_prompt = [negative_prompt]
+        elif not isinstance(negative_prompt, list) and negative_prompt is not None:
+            raise ValueError(f"`negative_prompt` has to be of type `str` or `list` but is {type(negative_prompt)}")
+
+        # if the negative prompt is defined we double the batch size to
+        # directly retrieve the negative prompt embedding
+        if negative_prompt is not None:
+            prompt = prompt + negative_prompt
+            negative_prompt = 2 * negative_prompt
+
+        device = self._execution_device
+
+        batch_size = len(prompt)
+        batch_size = batch_size * num_images_per_prompt
+
+        do_classifier_free_guidance = guidance_scale > 1.0
+        prompt_embeds, text_encoder_hidden_states, text_mask = self._encode_prompt(
+            prompt, device, num_images_per_prompt, do_classifier_free_guidance, negative_prompt
+        )
+
+        # prior
+        self.scheduler.set_timesteps(num_inference_steps, device=device)
+
+        embedding_dim = self.prior.config.embedding_dim
+        latents = emb.repeat_interleave(num_images_per_prompt, dim=0)
+        timesteps, num_inference_steps = self.get_timesteps(num_inference_steps, strength, device)
+        latent_timestep = timesteps[:1].repeat(batch_size * num_images_per_prompt)
+        latents = self.prepare_latents(
+            latents,
+            latent_timestep,
+            batch_size // num_images_per_prompt,
+            num_images_per_prompt,
+            prompt_embeds.dtype,
+            device,
+            generator,
+        )
+
+        for i, t in enumerate(self.progress_bar(timesteps)):
+            # expand the latents if we are doing classifier free guidance
+            latent_model_input = torch.cat([latents] * 2) if do_classifier_free_guidance else latents
+
+            predicted_image_embedding = self.prior(
+                latent_model_input,
+                timestep=t,
+                proj_embedding=prompt_embeds,
+                encoder_hidden_states=text_encoder_hidden_states,
+                attention_mask=text_mask,
+            ).predicted_image_embedding
+
+            if do_classifier_free_guidance:
+                predicted_image_embedding_uncond, predicted_image_embedding_text = predicted_image_embedding.chunk(2)
+                predicted_image_embedding = predicted_image_embedding_uncond + guidance_scale * (
+                    predicted_image_embedding_text - predicted_image_embedding_uncond
+                )
+
+            if i + 1 == timesteps.shape[0]:
+                prev_timestep = None
+            else:
+                prev_timestep = timesteps[i + 1]
+
+            latents = self.scheduler.step(
+                predicted_image_embedding,
+                timestep=t,
+                sample=latents,
+                generator=generator,
+                prev_timestep=prev_timestep,
+            ).prev_sample
+
+        latents = self.prior.post_process_latents(latents)
+
+        image_embeddings = latents
+
+        # if a negative prompt has been defined, we split the image embedding into two
+        if negative_prompt is None:
+            zero_embeds = self.get_zero_embed(latents.shape[0], device=latents.device)
+        else:
+            image_embeddings, zero_embeds = image_embeddings.chunk(2)
+
+        if output_type not in ["pt", "np"]:
+            raise ValueError(f"Only the output types `pt` and `np` are supported not output_type={output_type}")
+
+        if output_type == "np":
+            image_embeddings = image_embeddings.cpu().numpy()
+            zero_embeds = zero_embeds.cpu().numpy()
+
+        if not return_dict:
+            return (image_embeddings, zero_embeds)
+
+        return KandinskyPriorPipelineOutput(image_embeds=image_embeddings, negative_image_embeds=zero_embeds)
\ No newline at end of file

From c082fdd55262e1d30363b4e869aa6c5cab20e013 Mon Sep 17 00:00:00 2001
From: Shahmatov Arseniy <62886550+cene555@users.noreply.github.com>
Date: Thu, 29 Jun 2023 23:31:10 +0300
Subject: [PATCH 02/51] fix init kandinsky2_2

---
 src/diffusers/__init__.py           | 7 +++++++
 src/diffusers/pipelines/__init__.py | 9 +++++++++
 2 files changed, 16 insertions(+)

diff --git a/src/diffusers/__init__.py b/src/diffusers/__init__.py
index 02907075345e..505af53e21d8 100644
--- a/src/diffusers/__init__.py
+++ b/src/diffusers/__init__.py
@@ -136,6 +136,13 @@
         KandinskyInpaintPipeline,
         KandinskyPipeline,
         KandinskyPriorPipeline,
+        Kandinsky2_2_DecoderControlnetImg2ImgPipeline,
+        Kandinsky2_2_DecoderControlnetPipeline,
+        Kandinsky2_2_DecoderImg2ImgPipeline,
+        Kandinsky2_2_DecoderPipeline,
+        Kandinsky2_2PriorEmb2EmbPipeline,
+        Kandinsky2_2PriorPipeline,
+        Kandinsky2_2_DecoderInpaintPipeline,
         LDMTextToImagePipeline,
         PaintByExamplePipeline,
         SemanticStableDiffusionPipeline,
diff --git a/src/diffusers/pipelines/__init__.py b/src/diffusers/pipelines/__init__.py
index b1650240848a..be71f53e8415 100644
--- a/src/diffusers/pipelines/__init__.py
+++ b/src/diffusers/pipelines/__init__.py
@@ -63,6 +63,15 @@
         KandinskyPipeline,
         KandinskyPriorPipeline,
     )
+    from .kandinsky2_2 import (
+        Kandinsky2_2_DecoderControlnetImg2ImgPipeline,
+        Kandinsky2_2_DecoderControlnetPipeline,
+        Kandinsky2_2_DecoderImg2ImgPipeline,
+        Kandinsky2_2_DecoderPipeline,
+        Kandinsky2_2PriorEmb2EmbPipeline,
+        Kandinsky2_2PriorPipeline,
+        Kandinsky2_2_DecoderInpaintPipeline
+    )
     from .latent_diffusion import LDMTextToImagePipeline
     from .paint_by_example import PaintByExamplePipeline
     from .semantic_stable_diffusion import SemanticStableDiffusionPipeline

From 392cff0860ccbe3bfb65549c42d58841646cc048 Mon Sep 17 00:00:00 2001
From: Shahmatov Arseniy <62886550+cene555@users.noreply.github.com>
Date: Thu, 29 Jun 2023 23:38:39 +0300
Subject: [PATCH 03/51] kandinsky2_2 fix inpainting

---
 .../pipeline_kandinsky2_2_decoder_inpainting.py | 6 ------
 1 file changed, 6 deletions(-)

diff --git a/src/diffusers/pipelines/kandinsky2_2/pipeline_kandinsky2_2_decoder_inpainting.py b/src/diffusers/pipelines/kandinsky2_2/pipeline_kandinsky2_2_decoder_inpainting.py
index c62ce5d922fb..5f388e13603b 100644
--- a/src/diffusers/pipelines/kandinsky2_2/pipeline_kandinsky2_2_decoder_inpainting.py
+++ b/src/diffusers/pipelines/kandinsky2_2/pipeline_kandinsky2_2_decoder_inpainting.py
@@ -448,12 +448,7 @@ def __call__(
                 noise_pred,
                 t,
                 latents,
-                # YiYi notes: only reason this pipeline
can't work with unclip scheduler is that can't pass down this argument - # need to use DDPM scheduler instead - # prev_timestep=prev_timestep, generator=generator, - orig_sample=masked_image, - mask=mask_image )[0] init_latents_proper = image[:1] init_mask = mask_image[:1] @@ -466,7 +461,6 @@ def __call__( latents = init_mask * init_latents_proper + (1 - init_mask) * latents # post-processing - print(mask_image) latents = mask_image[:1] * image[:1] + (1 - mask_image[:1]) * latents image = self.vae.decode(latents, force_not_quantize=True)["sample"] From 529632236c3d41f9b19ce8417834231f29323dc2 Mon Sep 17 00:00:00 2001 From: yiyixuxu Date: Mon, 3 Jul 2023 22:30:34 +0000 Subject: [PATCH 04/51] rename pipelines: remove decoder + 2_2 -> V22 --- src/diffusers/__init__.py | 14 +++++++------- src/diffusers/pipelines/__init__.py | 14 +++++++------- src/diffusers/pipelines/kandinsky2_2/__init__.py | 14 +++++++------- .../kandinsky2_2/pipeline_kandinsky2_2_decoder.py | 8 ++++---- .../pipeline_kandinsky2_2_decoder_controlnet.py | 2 +- ...line_kandinsky2_2_decoder_controlnet_img2img.py | 2 +- .../pipeline_kandinsky2_2_decoder_img2img.py | 2 +- .../pipeline_kandinsky2_2_decoder_inpainting.py | 2 +- .../kandinsky2_2/pipeline_kandinsky2_2_prior.py | 14 +++++++------- .../pipeline_kandinsky2_2_prior_emb2emb.py | 2 +- 10 files changed, 37 insertions(+), 37 deletions(-) diff --git a/src/diffusers/__init__.py b/src/diffusers/__init__.py index 505af53e21d8..912d73f5b524 100644 --- a/src/diffusers/__init__.py +++ b/src/diffusers/__init__.py @@ -136,13 +136,13 @@ KandinskyInpaintPipeline, KandinskyPipeline, KandinskyPriorPipeline, - Kandinsky2_2_DecoderControlnetImg2ImgPipeline, - Kandinsky2_2_DecoderControlnetPipeline, - Kandinsky2_2_DecoderImg2ImgPipeline, - Kandinsky2_2_DecoderPipeline, - Kandinsky2_2PriorEmb2EmbPipeline, - Kandinsky2_2PriorPipeline, - Kandinsky2_2_DecoderInpaintPipeline, + KandinskyV22ControlnetImg2ImgPipeline, + KandinskyV22ControlnetPipeline, + KandinskyV22Img2ImgPipelinee, + KandinskyV22Pipeline, + KandinskyV22PriorEmb2EmbPipeline, + KandinskyV22PriorPipeline, + KandinskyV22InpaintPipeline, LDMTextToImagePipeline, PaintByExamplePipeline, SemanticStableDiffusionPipeline, diff --git a/src/diffusers/pipelines/__init__.py b/src/diffusers/pipelines/__init__.py index be71f53e8415..eb06a6c8a6a5 100644 --- a/src/diffusers/pipelines/__init__.py +++ b/src/diffusers/pipelines/__init__.py @@ -64,13 +64,13 @@ KandinskyPriorPipeline, ) from .kandinsky2_2 import ( - Kandinsky2_2_DecoderControlnetImg2ImgPipeline, - Kandinsky2_2_DecoderControlnetPipeline, - Kandinsky2_2_DecoderImg2ImgPipeline, - Kandinsky2_2_DecoderPipeline, - Kandinsky2_2PriorEmb2EmbPipeline, - Kandinsky2_2PriorPipeline, - Kandinsky2_2_DecoderInpaintPipeline + KandinskyV22ControlnetImg2ImgPipeline, + KandinskyV22ControlnetPipeline, + KandinskyV22Img2ImgPipeline, + KandinskyV22Pipeline, + KandinskyV22PriorEmb2EmbPipeline, + KandinskyV22PriorPipeline, + KandinskyV22InpaintPipeline, ) from .latent_diffusion import LDMTextToImagePipeline from .paint_by_example import PaintByExamplePipeline diff --git a/src/diffusers/pipelines/kandinsky2_2/__init__.py b/src/diffusers/pipelines/kandinsky2_2/__init__.py index 3874862b3ecd..27899e94a410 100644 --- a/src/diffusers/pipelines/kandinsky2_2/__init__.py +++ b/src/diffusers/pipelines/kandinsky2_2/__init__.py @@ -1,8 +1,8 @@ -from .pipeline_kandinsky2_2_decoder_controlnet_img2img import Kandinsky2_2_DecoderControlnetImg2ImgPipeline -from .pipeline_kandinsky2_2_decoder_controlnet import 
Kandinsky2_2_DecoderControlnetPipeline -from .pipeline_kandinsky2_2_decoder_img2img import Kandinsky2_2_DecoderImg2ImgPipeline -from .pipeline_kandinsky2_2_decoder_inpainting import Kandinsky2_2_DecoderInpaintPipeline -from .pipeline_kandinsky2_2_decoder import Kandinsky2_2_DecoderPipeline -from .pipeline_kandinsky2_2_prior_emb2emb import Kandinsky2_2PriorEmb2EmbPipeline -from .pipeline_kandinsky2_2_prior import Kandinsky2_2PriorPipeline +from .pipeline_kandinsky2_2_decoder_controlnet_img2img import KandinskyV22ControlnetImg2ImgPipeline +from .pipeline_kandinsky2_2_decoder_controlnet import KandinskyV22ControlnetPipeline +from .pipeline_kandinsky2_2_decoder_img2img import KandinskyV22Img2ImgPipeline +from .pipeline_kandinsky2_2_decoder_inpainting import KandinskyV22InpaintPipeline +from .pipeline_kandinsky2_2_decoder import KandinskyV22Pipeline +from .pipeline_kandinsky2_2_prior_emb2emb import KandinskyV22PriorEmb2EmbPipeline +from .pipeline_kandinsky2_2_prior import KandinskyV22PriorPipeline diff --git a/src/diffusers/pipelines/kandinsky2_2/pipeline_kandinsky2_2_decoder.py b/src/diffusers/pipelines/kandinsky2_2/pipeline_kandinsky2_2_decoder.py index 08dffdb4e51e..b08f0814dd65 100644 --- a/src/diffusers/pipelines/kandinsky2_2/pipeline_kandinsky2_2_decoder.py +++ b/src/diffusers/pipelines/kandinsky2_2/pipeline_kandinsky2_2_decoder.py @@ -34,15 +34,15 @@ EXAMPLE_DOC_STRING = """ Examples: ```py - >>> from diffusers import Kandinsky2_2_DecoderPipeline, Kandinsky2_2PriorPipeline + >>> from diffusers import KandinskyV22Pipeline, KandinskyV22PriorPipeline >>> import torch - >>> pipe_prior = Kandinsky2_2PriorPipeline.from_pretrained("kandinsky-community/kandinsky-2-2-prior") + >>> pipe_prior = KandinskyV22PriorPipeline.from_pretrained("kandinsky-community/kandinsky-2-2-prior") >>> pipe_prior.to("cuda") >>> prompt = "red cat, 4k photo" >>> out = pipe_prior(prompt) >>> image_emb = out.image_embeds >>> zero_image_emb = out.negative_image_embeds - >>> pipe = Kandinsky2_2_DecoderPipeline.from_pretrained("kandinsky-community/kandinsky-2-2-decoder") + >>> pipe = KandinskyV22Pipeline.from_pretrained("kandinsky-community/kandinsky-2-2-decoder") >>> pipe.to("cuda") >>> image = pipe( ... image_embeds=image_emb, @@ -66,7 +66,7 @@ def get_new_h_w(h, w, scale_factor=8): return new_h * scale_factor, new_w * scale_factor -class Kandinsky2_2_DecoderPipeline(DiffusionPipeline): +class KandinskyV22Pipeline(DiffusionPipeline): """ Pipeline for text-to-image generation using Kandinsky This model inherits from [`DiffusionPipeline`]. Check the superclass documentation for the generic methods the diff --git a/src/diffusers/pipelines/kandinsky2_2/pipeline_kandinsky2_2_decoder_controlnet.py b/src/diffusers/pipelines/kandinsky2_2/pipeline_kandinsky2_2_decoder_controlnet.py index eb20356626c1..2a0fc598ad2a 100644 --- a/src/diffusers/pipelines/kandinsky2_2/pipeline_kandinsky2_2_decoder_controlnet.py +++ b/src/diffusers/pipelines/kandinsky2_2/pipeline_kandinsky2_2_decoder_controlnet.py @@ -42,7 +42,7 @@ def get_new_h_w(h, w, scale_factor=8): return new_h * scale_factor, new_w * scale_factor -class Kandinsky2_2_DecoderControlnetPipeline(DiffusionPipeline): +class KandinskyV22ControlnetPipeline(DiffusionPipeline): """ Pipeline for text-to-image generation using Kandinsky This model inherits from [`DiffusionPipeline`]. 
Check the superclass documentation for the generic methods the diff --git a/src/diffusers/pipelines/kandinsky2_2/pipeline_kandinsky2_2_decoder_controlnet_img2img.py b/src/diffusers/pipelines/kandinsky2_2/pipeline_kandinsky2_2_decoder_controlnet_img2img.py index 18fbed543fb3..a32d50f82b8e 100644 --- a/src/diffusers/pipelines/kandinsky2_2/pipeline_kandinsky2_2_decoder_controlnet_img2img.py +++ b/src/diffusers/pipelines/kandinsky2_2/pipeline_kandinsky2_2_decoder_controlnet_img2img.py @@ -53,7 +53,7 @@ def prepare_image(pil_image, w=512, h=512): return image -class Kandinsky2_2_DecoderControlnetImg2ImgPipeline(DiffusionPipeline): +class KandinskyV22ControlnetImg2ImgPipeline(DiffusionPipeline): """ Pipeline for text-to-image generation using Kandinsky This model inherits from [`DiffusionPipeline`]. Check the superclass documentation for the generic methods the diff --git a/src/diffusers/pipelines/kandinsky2_2/pipeline_kandinsky2_2_decoder_img2img.py b/src/diffusers/pipelines/kandinsky2_2/pipeline_kandinsky2_2_decoder_img2img.py index 023ce445a863..d6e8df490903 100644 --- a/src/diffusers/pipelines/kandinsky2_2/pipeline_kandinsky2_2_decoder_img2img.py +++ b/src/diffusers/pipelines/kandinsky2_2/pipeline_kandinsky2_2_decoder_img2img.py @@ -53,7 +53,7 @@ def prepare_image(pil_image, w=512, h=512): return image -class Kandinsky2_2_DecoderImg2ImgPipeline(DiffusionPipeline): +class KandinskyV22Img2ImgPipeline(DiffusionPipeline): """ Pipeline for text-to-image generation using Kandinsky This model inherits from [`DiffusionPipeline`]. Check the superclass documentation for the generic methods the diff --git a/src/diffusers/pipelines/kandinsky2_2/pipeline_kandinsky2_2_decoder_inpainting.py b/src/diffusers/pipelines/kandinsky2_2/pipeline_kandinsky2_2_decoder_inpainting.py index 5f388e13603b..0e6eb765144b 100644 --- a/src/diffusers/pipelines/kandinsky2_2/pipeline_kandinsky2_2_decoder_inpainting.py +++ b/src/diffusers/pipelines/kandinsky2_2/pipeline_kandinsky2_2_decoder_inpainting.py @@ -182,7 +182,7 @@ def prepare_mask_and_masked_image(image, mask, height, width): return mask, image -class Kandinsky2_2_DecoderInpaintPipeline(DiffusionPipeline): +class KandinskyV22InpaintPipeline(DiffusionPipeline): """ Pipeline for text-to-image generation using Kandinsky This model inherits from [`DiffusionPipeline`]. 
Check the superclass documentation for the generic methods the diff --git a/src/diffusers/pipelines/kandinsky2_2/pipeline_kandinsky2_2_prior.py b/src/diffusers/pipelines/kandinsky2_2/pipeline_kandinsky2_2_prior.py index 4684d14bf8d3..24948291acea 100644 --- a/src/diffusers/pipelines/kandinsky2_2/pipeline_kandinsky2_2_prior.py +++ b/src/diffusers/pipelines/kandinsky2_2/pipeline_kandinsky2_2_prior.py @@ -23,15 +23,15 @@ EXAMPLE_DOC_STRING = """ Examples: ```py - >>> from diffusers import Kandinsky2_2_DecoderPipeline, Kandinsky2_2PriorPipeline + >>> from diffusers import KandinskyV22Pipeline, KandinskyV22PriorPipeline >>> import torch - >>> pipe_prior = Kandinsky2_2PriorPipeline.from_pretrained("kandinsky-community/kandinsky-2-2-prior") + >>> pipe_prior = KandinskyV22PriorPipeline.from_pretrained("kandinsky-community/kandinsky-2-2-prior") >>> pipe_prior.to("cuda") >>> prompt = "red cat, 4k photo" >>> out = pipe_prior(prompt) >>> image_emb = out.image_embeds >>> zero_image_emb = out.negative_image_embeds - >>> pipe = Kandinsky2_2_DecoderPipeline.from_pretrained("kandinsky-community/kandinsky-2-2-decoder") + >>> pipe = KandinskyV22Pipeline.from_pretrained("kandinsky-community/kandinsky-2-2-decoder") >>> pipe.to("cuda") >>> image = pipe( ... image_embeds=image_emb, @@ -47,12 +47,12 @@ EXAMPLE_INTERPOLATE_DOC_STRING = """ Examples: ```py - >>> from diffusers import Kandinsky2_2PriorPipeline, Kandinsky2_2_DecoderPipeline + >>> from diffusers import KandinskyV22PriorPipeline, KandinskyV22Pipeline >>> from diffusers.utils import load_image >>> import PIL >>> import torch >>> from torchvision import transforms - >>> pipe_prior = Kandinsky2_2PriorPipeline.from_pretrained( + >>> pipe_prior = KandinskyV22PriorPipeline.from_pretrained( ... "kandinsky-community/kandinsky-2-2-prior", torch_dtype=torch.float16 ... ) >>> pipe_prior.to("cuda") @@ -67,7 +67,7 @@ >>> images_texts = ["a cat", img1, img2] >>> weights = [0.3, 0.3, 0.4] >>> out = pipe_prior.interpolate(images_texts, weights) - >>> pipe = Kandinsky2_2_DecoderPipeline.from_pretrained("kandinsky-community/kandinsky-2-2-decoder", torch_dtype=torch.float16) + >>> pipe = KandinskyV22Pipeline.from_pretrained("kandinsky-community/kandinsky-2-2-decoder", torch_dtype=torch.float16) >>> pipe.to("cuda") >>> image = pipe( ... image_embeds=out.image_embeds, @@ -96,7 +96,7 @@ class KandinskyPriorPipelineOutput(BaseOutput): negative_image_embeds: Union[torch.FloatTensor, np.ndarray] -class Kandinsky2_2PriorPipeline(DiffusionPipeline): +class KandinskyV22PriorPipeline(DiffusionPipeline): """ Pipeline for generating image prior for Kandinsky This model inherits from [`DiffusionPipeline`]. Check the superclass documentation for the generic methods the diff --git a/src/diffusers/pipelines/kandinsky2_2/pipeline_kandinsky2_2_prior_emb2emb.py b/src/diffusers/pipelines/kandinsky2_2/pipeline_kandinsky2_2_prior_emb2emb.py index 83a7343f4106..b6b06b0e86ae 100644 --- a/src/diffusers/pipelines/kandinsky2_2/pipeline_kandinsky2_2_prior_emb2emb.py +++ b/src/diffusers/pipelines/kandinsky2_2/pipeline_kandinsky2_2_prior_emb2emb.py @@ -34,7 +34,7 @@ class KandinskyPriorPipelineOutput(BaseOutput): negative_image_embeds: Union[torch.FloatTensor, np.ndarray] -class Kandinsky2_2PriorEmb2EmbPipeline(DiffusionPipeline): +class KandinskyV22PriorEmb2EmbPipeline(DiffusionPipeline): """ Pipeline for generating image prior for Kandinsky This model inherits from [`DiffusionPipeline`]. 
Check the superclass documentation for the generic methods the From 8e6134d57fa35b80679580ee8b1c9a25aa143a6e Mon Sep 17 00:00:00 2001 From: Shahmatov Arseniy <62886550+cene555@users.noreply.github.com> Date: Tue, 4 Jul 2023 01:43:41 +0300 Subject: [PATCH 05/51] Update scheduling_unclip.py --- src/diffusers/schedulers/scheduling_unclip.py | 24 +++++++++++++++++++ 1 file changed, 24 insertions(+) diff --git a/src/diffusers/schedulers/scheduling_unclip.py b/src/diffusers/schedulers/scheduling_unclip.py index d44edcb1812a..1e362d5a5326 100644 --- a/src/diffusers/schedulers/scheduling_unclip.py +++ b/src/diffusers/schedulers/scheduling_unclip.py @@ -307,3 +307,27 @@ def step( return (pred_prev_sample,) return UnCLIPSchedulerOutput(prev_sample=pred_prev_sample, pred_original_sample=pred_original_sample) + + + def add_noise( + self, + original_samples: torch.FloatTensor, + noise: torch.FloatTensor, + timesteps: torch.IntTensor, + ) -> torch.FloatTensor: + # Make sure alphas_cumprod and timestep have same device and dtype as original_samples + alphas_cumprod = self.alphas_cumprod.to(device=original_samples.device, dtype=original_samples.dtype) + timesteps = timesteps.to(original_samples.device) + + sqrt_alpha_prod = alphas_cumprod[timesteps] ** 0.5 + sqrt_alpha_prod = sqrt_alpha_prod.flatten() + while len(sqrt_alpha_prod.shape) < len(original_samples.shape): + sqrt_alpha_prod = sqrt_alpha_prod.unsqueeze(-1) + + sqrt_one_minus_alpha_prod = (1 - alphas_cumprod[timesteps]) ** 0.5 + sqrt_one_minus_alpha_prod = sqrt_one_minus_alpha_prod.flatten() + while len(sqrt_one_minus_alpha_prod.shape) < len(original_samples.shape): + sqrt_one_minus_alpha_prod = sqrt_one_minus_alpha_prod.unsqueeze(-1) + + noisy_samples = sqrt_alpha_prod * original_samples + sqrt_one_minus_alpha_prod * noise + return noisy_samples From 5834a823cc04f0dcd758d663c50493a3a1995084 Mon Sep 17 00:00:00 2001 From: yiyixuxu Date: Mon, 3 Jul 2023 23:09:32 +0000 Subject: [PATCH 06/51] remove text_encoder and tokenizer arguments from doc string --- src/diffusers/pipelines/kandinsky2_2/__init__.py | 3 +-- .../kandinsky2_2/pipeline_kandinsky2_2_decoder_controlnet.py | 4 ---- .../pipeline_kandinsky2_2_decoder_controlnet_img2img.py | 4 ---- .../kandinsky2_2/pipeline_kandinsky2_2_decoder_img2img.py | 4 ---- .../kandinsky2_2/pipeline_kandinsky2_2_decoder_inpainting.py | 4 ---- 5 files changed, 1 insertion(+), 18 deletions(-) diff --git a/src/diffusers/pipelines/kandinsky2_2/__init__.py b/src/diffusers/pipelines/kandinsky2_2/__init__.py index 27899e94a410..55e88abdd2ff 100644 --- a/src/diffusers/pipelines/kandinsky2_2/__init__.py +++ b/src/diffusers/pipelines/kandinsky2_2/__init__.py @@ -4,5 +4,4 @@ from .pipeline_kandinsky2_2_decoder_inpainting import KandinskyV22InpaintPipeline from .pipeline_kandinsky2_2_decoder import KandinskyV22Pipeline from .pipeline_kandinsky2_2_prior_emb2emb import KandinskyV22PriorEmb2EmbPipeline -from .pipeline_kandinsky2_2_prior import KandinskyV22PriorPipeline - +from .pipeline_kandinsky2_2_prior import KandinskyV22PriorPipeline \ No newline at end of file diff --git a/src/diffusers/pipelines/kandinsky2_2/pipeline_kandinsky2_2_decoder_controlnet.py b/src/diffusers/pipelines/kandinsky2_2/pipeline_kandinsky2_2_decoder_controlnet.py index 2a0fc598ad2a..5a08f4d11ad6 100644 --- a/src/diffusers/pipelines/kandinsky2_2/pipeline_kandinsky2_2_decoder_controlnet.py +++ b/src/diffusers/pipelines/kandinsky2_2/pipeline_kandinsky2_2_decoder_controlnet.py @@ -48,10 +48,6 @@ class KandinskyV22ControlnetPipeline(DiffusionPipeline): This 
model inherits from [`DiffusionPipeline`]. Check the superclass documentation for the generic methods the library implements for all the pipelines (such as downloading or saving, running on a particular device, etc.) Args: - text_encoder ([`MultilingualCLIP`]): - Frozen text-encoder. - tokenizer ([`XLMRobertaTokenizer`]): - Tokenizer of class scheduler ([`DDIMScheduler`]): A scheduler to be used in combination with `unet` to generate image latents. unet ([`UNet2DConditionModel`]): diff --git a/src/diffusers/pipelines/kandinsky2_2/pipeline_kandinsky2_2_decoder_controlnet_img2img.py b/src/diffusers/pipelines/kandinsky2_2/pipeline_kandinsky2_2_decoder_controlnet_img2img.py index a32d50f82b8e..2f9d67e5ab2c 100644 --- a/src/diffusers/pipelines/kandinsky2_2/pipeline_kandinsky2_2_decoder_controlnet_img2img.py +++ b/src/diffusers/pipelines/kandinsky2_2/pipeline_kandinsky2_2_decoder_controlnet_img2img.py @@ -59,10 +59,6 @@ class KandinskyV22ControlnetImg2ImgPipeline(DiffusionPipeline): This model inherits from [`DiffusionPipeline`]. Check the superclass documentation for the generic methods the library implements for all the pipelines (such as downloading or saving, running on a particular device, etc.) Args: - text_encoder ([`MultilingualCLIP`]): - Frozen text-encoder. - tokenizer ([`XLMRobertaTokenizer`]): - Tokenizer of class scheduler ([`DDIMScheduler`]): A scheduler to be used in combination with `unet` to generate image latents. unet ([`UNet2DConditionModel`]): diff --git a/src/diffusers/pipelines/kandinsky2_2/pipeline_kandinsky2_2_decoder_img2img.py b/src/diffusers/pipelines/kandinsky2_2/pipeline_kandinsky2_2_decoder_img2img.py index d6e8df490903..8412bda89a54 100644 --- a/src/diffusers/pipelines/kandinsky2_2/pipeline_kandinsky2_2_decoder_img2img.py +++ b/src/diffusers/pipelines/kandinsky2_2/pipeline_kandinsky2_2_decoder_img2img.py @@ -59,10 +59,6 @@ class KandinskyV22Img2ImgPipeline(DiffusionPipeline): This model inherits from [`DiffusionPipeline`]. Check the superclass documentation for the generic methods the library implements for all the pipelines (such as downloading or saving, running on a particular device, etc.) Args: - text_encoder ([`MultilingualCLIP`]): - Frozen text-encoder. - tokenizer ([`XLMRobertaTokenizer`]): - Tokenizer of class scheduler ([`DDIMScheduler`]): A scheduler to be used in combination with `unet` to generate image latents. unet ([`UNet2DConditionModel`]): diff --git a/src/diffusers/pipelines/kandinsky2_2/pipeline_kandinsky2_2_decoder_inpainting.py b/src/diffusers/pipelines/kandinsky2_2/pipeline_kandinsky2_2_decoder_inpainting.py index 0e6eb765144b..6f4593f1b2ca 100644 --- a/src/diffusers/pipelines/kandinsky2_2/pipeline_kandinsky2_2_decoder_inpainting.py +++ b/src/diffusers/pipelines/kandinsky2_2/pipeline_kandinsky2_2_decoder_inpainting.py @@ -188,10 +188,6 @@ class KandinskyV22InpaintPipeline(DiffusionPipeline): This model inherits from [`DiffusionPipeline`]. Check the superclass documentation for the generic methods the library implements for all the pipelines (such as downloading or saving, running on a particular device, etc.) Args: - text_encoder ([`MultilingualCLIP`]): - Frozen text-encoder. - tokenizer ([`XLMRobertaTokenizer`]): - Tokenizer of class scheduler ([`DDIMScheduler`]): A scheduler to be used in combination with `unet` to generate image latents. 
unet ([`UNet2DConditionModel`]): From 62af41a0461c5acfcdacb308cd0966faa35c9c3b Mon Sep 17 00:00:00 2001 From: yiyixuxu Date: Tue, 4 Jul 2023 02:31:47 +0000 Subject: [PATCH 07/51] add test for text2img --- src/diffusers/__init__.py | 2 +- .../pipelines/kandinsky2_2/pipeline_kandinsky2_2_decoder.py | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/src/diffusers/__init__.py b/src/diffusers/__init__.py index 912d73f5b524..91996df6dac7 100644 --- a/src/diffusers/__init__.py +++ b/src/diffusers/__init__.py @@ -138,7 +138,7 @@ KandinskyPriorPipeline, KandinskyV22ControlnetImg2ImgPipeline, KandinskyV22ControlnetPipeline, - KandinskyV22Img2ImgPipelinee, + KandinskyV22Img2ImgPipeline, KandinskyV22Pipeline, KandinskyV22PriorEmb2EmbPipeline, KandinskyV22PriorPipeline, diff --git a/src/diffusers/pipelines/kandinsky2_2/pipeline_kandinsky2_2_decoder.py b/src/diffusers/pipelines/kandinsky2_2/pipeline_kandinsky2_2_decoder.py index b08f0814dd65..02961faf3c32 100644 --- a/src/diffusers/pipelines/kandinsky2_2/pipeline_kandinsky2_2_decoder.py +++ b/src/diffusers/pipelines/kandinsky2_2/pipeline_kandinsky2_2_decoder.py @@ -235,10 +235,10 @@ def __call__( if isinstance(image_embeds, list): image_embeds = torch.cat(image_embeds, dim=0) - batch_size = image_embeds.shape[0] + batch_size = image_embeds.shape[0] * num_images_per_prompt if isinstance(negative_image_embeds, list): negative_image_embeds = torch.cat(negative_image_embeds, dim=0) - + if do_classifier_free_guidance: image_embeds = image_embeds.repeat_interleave(num_images_per_prompt, dim=0) negative_image_embeds = negative_image_embeds.repeat_interleave(num_images_per_prompt, dim=0) From 0603e5a96f1e4f09a8388dd06d33cfa059ae4041 Mon Sep 17 00:00:00 2001 From: yiyixuxu Date: Tue, 4 Jul 2023 02:51:43 +0000 Subject: [PATCH 08/51] add tests for text2img & img2img --- tests/pipelines/kandinsky_v22/__init__.py | 0 .../pipelines/kandinsky_v22/test_kandinsky.py | 250 ++++++++++++++++ .../kandinsky_v22/test_kandinsky_img2img.py | 270 ++++++++++++++++++ 3 files changed, 520 insertions(+) create mode 100644 tests/pipelines/kandinsky_v22/__init__.py create mode 100644 tests/pipelines/kandinsky_v22/test_kandinsky.py create mode 100644 tests/pipelines/kandinsky_v22/test_kandinsky_img2img.py diff --git a/tests/pipelines/kandinsky_v22/__init__.py b/tests/pipelines/kandinsky_v22/__init__.py new file mode 100644 index 000000000000..e69de29bb2d1 diff --git a/tests/pipelines/kandinsky_v22/test_kandinsky.py b/tests/pipelines/kandinsky_v22/test_kandinsky.py new file mode 100644 index 000000000000..c644ee195b79 --- /dev/null +++ b/tests/pipelines/kandinsky_v22/test_kandinsky.py @@ -0,0 +1,250 @@ +# coding=utf-8 +# Copyright 2023 HuggingFace Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
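The fast tests that follow call the decoder with `guidance_scale: 4.0`. Throughout the pipelines in this series, guidance follows the pattern already visible in the prior's denoising loop: the latents are duplicated, the model runs once on the doubled batch, and the two halves of the prediction are recombined. The snippet below is a minimal, self-contained sketch of that recombination only; the tensor shapes and the stand-in model output are illustrative and not taken from the pipeline code.

```py
import torch

guidance_scale = 4.0
latents = torch.randn(1, 4, 8, 8)

# duplicate the latents so the unconditional and conditional branches share one forward pass
latent_model_input = torch.cat([latents] * 2)

# stand-in for a UNet/prior forward pass on the doubled batch
model_output = torch.randn_like(latent_model_input)

# split the prediction back into its unconditional and conditional halves and recombine
pred_uncond, pred_cond = model_output.chunk(2)
guided = pred_uncond + guidance_scale * (pred_cond - pred_uncond)
print(guided.shape)  # torch.Size([1, 4, 8, 8])
```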
+ +import gc +import random +import unittest + +import numpy as np +import torch + +from diffusers import DDIMScheduler, KandinskyV22Pipeline, KandinskyV22PriorPipeline, UNet2DConditionModel, VQModel +from diffusers.utils import floats_tensor, load_numpy, slow, torch_device +from diffusers.utils.testing_utils import enable_full_determinism, require_torch_gpu + +from ..test_pipelines_common import PipelineTesterMixin, assert_mean_pixel_difference + + +enable_full_determinism() + + +class KandinskyV22PipelineFastTests(PipelineTesterMixin, unittest.TestCase): + pipeline_class = KandinskyV22Pipeline + params = [ + "image_embeds", + "negative_image_embeds", + ] + batch_params = ["image_embeds", "negative_image_embeds"] + required_optional_params = [ + "generator", + "height", + "width", + "latents", + "guidance_scale", + "num_inference_steps", + "return_dict", + "guidance_scale", + "num_images_per_prompt", + "output_type", + "return_dict", + ] + test_xformers_attention = False + + @property + def text_embedder_hidden_size(self): + return 32 + + @property + def time_input_dim(self): + return 32 + + @property + def block_out_channels_0(self): + return self.time_input_dim + + @property + def time_embed_dim(self): + return self.time_input_dim * 4 + + @property + def cross_attention_dim(self): + return 100 + + @property + def dummy_unet(self): + torch.manual_seed(0) + + model_kwargs = { + "in_channels": 4, + # Out channels is double in channels because predicts mean and variance + "out_channels": 8, + "addition_embed_type": "image", + "down_block_types": ("ResnetDownsampleBlock2D", "SimpleCrossAttnDownBlock2D"), + "up_block_types": ("SimpleCrossAttnUpBlock2D", "ResnetUpsampleBlock2D"), + "mid_block_type": "UNetMidBlock2DSimpleCrossAttn", + "block_out_channels": (self.block_out_channels_0, self.block_out_channels_0 * 2), + "layers_per_block": 1, + "encoder_hid_dim": self.text_embedder_hidden_size, + "encoder_hid_dim_type": "image_proj", + "cross_attention_dim": self.cross_attention_dim, + "attention_head_dim": 4, + "resnet_time_scale_shift": "scale_shift", + "class_embed_type": None, + } + + model = UNet2DConditionModel(**model_kwargs) + return model + + @property + def dummy_movq_kwargs(self): + return { + "block_out_channels": [32, 64], + "down_block_types": ["DownEncoderBlock2D", "AttnDownEncoderBlock2D"], + "in_channels": 3, + "latent_channels": 4, + "layers_per_block": 1, + "norm_num_groups": 8, + "norm_type": "spatial", + "num_vq_embeddings": 12, + "out_channels": 3, + "up_block_types": [ + "AttnUpDecoderBlock2D", + "UpDecoderBlock2D", + ], + "vq_embed_dim": 4, + } + + @property + def dummy_movq(self): + torch.manual_seed(0) + model = VQModel(**self.dummy_movq_kwargs) + return model + + def get_dummy_components(self): + unet = self.dummy_unet + movq = self.dummy_movq + + scheduler = DDIMScheduler( + num_train_timesteps=1000, + beta_schedule="linear", + beta_start=0.00085, + beta_end=0.012, + clip_sample=False, + set_alpha_to_one=False, + steps_offset=1, + prediction_type="epsilon", + thresholding=False, + ) + + components = { + "unet": unet, + "scheduler": scheduler, + "vae": movq, + } + return components + + def get_dummy_inputs(self, device, seed=0): + image_embeds = floats_tensor((1, self.text_embedder_hidden_size), rng=random.Random(seed)).to(device) + negative_image_embeds = floats_tensor((1, self.text_embedder_hidden_size), rng=random.Random(seed + 1)).to(device) + if str(device).startswith("mps"): + generator = torch.manual_seed(seed) + else: + generator = 
torch.Generator(device=device).manual_seed(seed) + inputs = { + "image_embeds": image_embeds, + "negative_image_embeds": negative_image_embeds, + "generator": generator, + "height": 64, + "width": 64, + "guidance_scale": 4.0, + "num_inference_steps": 2, + "output_type": "np", + } + return inputs + + def test_kandinsky(self): + device = "cpu" + + components = self.get_dummy_components() + + pipe = self.pipeline_class(**components) + pipe = pipe.to(device) + + pipe.set_progress_bar_config(disable=None) + + output = pipe(**self.get_dummy_inputs(device)) + image = output.images + + image_from_tuple = pipe( + **self.get_dummy_inputs(device), + return_dict=False, + )[0] + + image_slice = image[0, -3:, -3:, -1] + image_from_tuple_slice = image_from_tuple[0, -3:, -3:, -1] + + assert image.shape == (1, 64, 64, 3) + + expected_slice = np.array( + [0.6237976, 1., 0.36441332, 1., 0.70639634, 0.29877186, 0.85652125, 0.5216843, 0.54454046] + ) + + assert ( + np.abs(image_slice.flatten() - expected_slice).max() < 1e-2 + ), f" expected_slice {expected_slice}, but got {image_slice.flatten()}" + + assert ( + np.abs(image_from_tuple_slice.flatten() - expected_slice).max() < 1e-2 + ), f" expected_slice {expected_slice}, but got {image_from_tuple_slice.flatten()}" + + +@slow +@require_torch_gpu +class KandinskyV22PipelineIntegrationTests(unittest.TestCase): + def tearDown(self): + # clean up the VRAM after each test + super().tearDown() + gc.collect() + torch.cuda.empty_cache() + + def test_kandinsky_text2img(self): + expected_image = load_numpy( + "https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main" + "/kandinskyv22/kandinskyv22_text2img_cat_fp16.npy" + ) + + pipe_prior = KandinskyV22PriorPipeline.from_pretrained( + "kandinsky-community/kandinsky-2-2-prior", torch_dtype=torch.float16 + ) + pipe_prior.to(torch_device) + + pipeline = KandinskyV22Pipeline.from_pretrained("kandinsky-community/kandinsky-2-2-decoder", torch_dtype=torch.float16) + pipeline = pipeline.to(torch_device) + pipeline.set_progress_bar_config(disable=None) + + prompt = "red cat, 4k photo" + + generator = torch.Generator(device="cuda").manual_seed(0) + image_emb, zero_image_emb = pipe_prior( + prompt, + generator=generator, + num_inference_steps=5, + negative_prompt="", + ).to_tuple() + + generator = torch.Generator(device="cuda").manual_seed(0) + output = pipeline( + image_embeds=image_emb, + negative_image_embeds=zero_image_emb, + generator=generator, + num_inference_steps=100, + output_type="np", + ) + + image = output.images[0] + + assert image.shape == (512, 512, 3) + + assert_mean_pixel_difference(image, expected_image) diff --git a/tests/pipelines/kandinsky_v22/test_kandinsky_img2img.py b/tests/pipelines/kandinsky_v22/test_kandinsky_img2img.py new file mode 100644 index 000000000000..8d52aab0bfa3 --- /dev/null +++ b/tests/pipelines/kandinsky_v22/test_kandinsky_img2img.py @@ -0,0 +1,270 @@ +# coding=utf-8 +# Copyright 2023 HuggingFace Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
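The img2img tests below pass `strength=0.2`, which controls how much of the denoising schedule is actually run. The KandinskyV22 img2img and prior emb2emb pipelines obtain this from a `get_timesteps` helper that is not shown in this excerpt, so the sketch below only restates the convention used by other diffusers img2img pipelines and should be read as an assumption about that helper, not its actual source.

```py
# Sketch of the usual strength -> timesteps truncation in diffusers img2img pipelines (assumed behavior).
def get_timesteps_sketch(scheduler_timesteps, num_inference_steps, strength):
    # keep only the last `strength` fraction of the schedule; strength=1.0 denoises from pure noise
    init_timestep = min(int(num_inference_steps * strength), num_inference_steps)
    t_start = max(num_inference_steps - init_timestep, 0)
    return scheduler_timesteps[t_start:], num_inference_steps - t_start

# with num_inference_steps=10 and strength=0.2 (as in the fast test below),
# only the final 2 of the 10 scheduled steps are actually run
timesteps = list(range(10))[::-1]  # stand-in for scheduler.timesteps
print(get_timesteps_sketch(timesteps, 10, 0.2))  # ([1, 0], 2)
```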
+ +import gc +import random +import unittest + +import numpy as np +import torch +from PIL import Image + +from diffusers import DDIMScheduler, KandinskyV22Img2ImgPipeline, KandinskyV22PriorPipeline, UNet2DConditionModel, VQModel +from diffusers.utils import floats_tensor, load_image, load_numpy, slow, torch_device +from diffusers.utils.testing_utils import enable_full_determinism, require_torch_gpu + +from ..test_pipelines_common import PipelineTesterMixin, assert_mean_pixel_difference + + +enable_full_determinism() + + +class KandinskyV22Img2ImgPipelineFastTests(PipelineTesterMixin, unittest.TestCase): + pipeline_class = KandinskyV22Img2ImgPipeline + params = ["image_embeds", "negative_image_embeds", "image"] + batch_params = [ + "image_embeds", + "negative_image_embeds", + "image", + ] + required_optional_params = [ + "generator", + "height", + "width", + "strength", + "guidance_scale", + "num_inference_steps", + "return_dict", + "guidance_scale", + "num_images_per_prompt", + "output_type", + "return_dict", + ] + test_xformers_attention = False + + @property + def text_embedder_hidden_size(self): + return 32 + + @property + def time_input_dim(self): + return 32 + + @property + def block_out_channels_0(self): + return self.time_input_dim + + @property + def time_embed_dim(self): + return self.time_input_dim * 4 + + @property + def cross_attention_dim(self): + return 100 + + @property + def dummy_unet(self): + torch.manual_seed(0) + + model_kwargs = { + "in_channels": 4, + # Out channels is double in channels because predicts mean and variance + "out_channels": 8, + "addition_embed_type": "image", + "down_block_types": ("ResnetDownsampleBlock2D", "SimpleCrossAttnDownBlock2D"), + "up_block_types": ("SimpleCrossAttnUpBlock2D", "ResnetUpsampleBlock2D"), + "mid_block_type": "UNetMidBlock2DSimpleCrossAttn", + "block_out_channels": (self.block_out_channels_0, self.block_out_channels_0 * 2), + "layers_per_block": 1, + "encoder_hid_dim": self.text_embedder_hidden_size, + "encoder_hid_dim_type": "image_proj", + "cross_attention_dim": self.cross_attention_dim, + "attention_head_dim": 4, + "resnet_time_scale_shift": "scale_shift", + "class_embed_type": None, + } + + model = UNet2DConditionModel(**model_kwargs) + return model + + @property + def dummy_movq_kwargs(self): + return { + "block_out_channels": [32, 64], + "down_block_types": ["DownEncoderBlock2D", "AttnDownEncoderBlock2D"], + "in_channels": 3, + "latent_channels": 4, + "layers_per_block": 1, + "norm_num_groups": 8, + "norm_type": "spatial", + "num_vq_embeddings": 12, + "out_channels": 3, + "up_block_types": [ + "AttnUpDecoderBlock2D", + "UpDecoderBlock2D", + ], + "vq_embed_dim": 4, + } + + @property + def dummy_movq(self): + torch.manual_seed(0) + model = VQModel(**self.dummy_movq_kwargs) + return model + + def get_dummy_components(self): + unet = self.dummy_unet + movq = self.dummy_movq + + ddim_config = { + "num_train_timesteps": 1000, + "beta_schedule": "linear", + "beta_start": 0.00085, + "beta_end": 0.012, + "clip_sample": False, + "set_alpha_to_one": False, + "steps_offset": 0, + "prediction_type": "epsilon", + "thresholding": False, + } + + scheduler = DDIMScheduler(**ddim_config) + + components = { + "unet": unet, + "scheduler": scheduler, + "vae": movq, + } + + return components + + def get_dummy_inputs(self, device, seed=0): + image_embeds = floats_tensor((1, self.text_embedder_hidden_size), rng=random.Random(seed)).to(device) + negative_image_embeds = floats_tensor((1, self.text_embedder_hidden_size), rng=random.Random(seed + 
1)).to(device) + # create init_image + image = floats_tensor((1, 3, 64, 64), rng=random.Random(seed)).to(device) + image = image.cpu().permute(0, 2, 3, 1)[0] + init_image = Image.fromarray(np.uint8(image)).convert("RGB").resize((256, 256)) + + if str(device).startswith("mps"): + generator = torch.manual_seed(seed) + else: + generator = torch.Generator(device=device).manual_seed(seed) + inputs = { + "image": init_image, + "image_embeds": image_embeds, + "negative_image_embeds": negative_image_embeds, + "generator": generator, + "height": 64, + "width": 64, + "num_inference_steps": 10, + "guidance_scale": 7.0, + "strength": 0.2, + "output_type": "np", + } + return inputs + + def test_kandinsky_img2img(self): + device = "cpu" + + components = self.get_dummy_components() + + pipe = self.pipeline_class(**components) + pipe = pipe.to(device) + + pipe.set_progress_bar_config(disable=None) + + output = pipe(**self.get_dummy_inputs(device)) + image = output.images + + image_from_tuple = pipe( + **self.get_dummy_inputs(device), + return_dict=False, + )[0] + + image_slice = image[0, -3:, -3:, -1] + image_from_tuple_slice = image_from_tuple[0, -3:, -3:, -1] + + assert image.shape == (1, 64, 64, 3) + + expected_slice = np.array( + [0.6199778, 0.63984406, 0.46145785, 0.62944984, 0.5622215, 0.47306132, 0.47441456, 0.4607606, 0.48719263] + ) + assert ( + np.abs(image_slice.flatten() - expected_slice).max() < 1e-2 + ), f" expected_slice {expected_slice}, but got {image_slice.flatten()}" + assert ( + np.abs(image_from_tuple_slice.flatten() - expected_slice).max() < 1e-2 + ), f" expected_slice {expected_slice}, but got {image_from_tuple_slice.flatten()}" + + +@slow +@require_torch_gpu +class KandinskyV22Img2ImgPipelineIntegrationTests(unittest.TestCase): + def tearDown(self): + # clean up the VRAM after each test + super().tearDown() + gc.collect() + torch.cuda.empty_cache() + + def test_kandinsky_img2img(self): + expected_image = load_numpy( + "https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main" + "/kandinskyv22/kandinskyv22_img2img_frog.npy" + ) + + init_image = load_image( + "https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main" "/kandinsky/cat.png" + ) + prompt = "A red cartoon frog, 4k" + + pipe_prior = KandinskyV22PriorPipeline.from_pretrained( + "kandinsky-community/kandinsky-2-2-prior", torch_dtype=torch.float16 + ) + pipe_prior.to(torch_device) + + pipeline = KandinskyV22Img2ImgPipeline.from_pretrained( + "kandinsky-community/kandinsky-2-2-decoder", torch_dtype=torch.float16 + ) + pipeline = pipeline.to(torch_device) + + pipeline.set_progress_bar_config(disable=None) + + generator = torch.Generator(device="cpu").manual_seed(0) + image_emb, zero_image_emb = pipe_prior( + prompt, + generator=generator, + num_inference_steps=5, + negative_prompt="", + ).to_tuple() + + output = pipeline( + image=init_image, + image_embeds=image_emb, + negative_image_embeds=zero_image_emb, + generator=generator, + num_inference_steps=100, + height=768, + width=768, + strength=0.2, + output_type="np", + ) + + image = output.images[0] + + assert image.shape == (768, 768, 3) + np.save("kandinskyv22_img2img_frog.npy", image) + + assert_mean_pixel_difference(image, expected_image) From 64b95f40f567a5a59051d4cc79ccaf7688bccf8b Mon Sep 17 00:00:00 2001 From: yiyixuxu Date: Tue, 4 Jul 2023 02:52:31 +0000 Subject: [PATCH 09/51] fix --- tests/pipelines/kandinsky_v22/test_kandinsky_img2img.py | 1 - 1 file changed, 1 deletion(-) diff --git 
a/tests/pipelines/kandinsky_v22/test_kandinsky_img2img.py b/tests/pipelines/kandinsky_v22/test_kandinsky_img2img.py index 8d52aab0bfa3..a8ef916a27f5 100644 --- a/tests/pipelines/kandinsky_v22/test_kandinsky_img2img.py +++ b/tests/pipelines/kandinsky_v22/test_kandinsky_img2img.py @@ -265,6 +265,5 @@ def test_kandinsky_img2img(self): image = output.images[0] assert image.shape == (768, 768, 3) - np.save("kandinskyv22_img2img_frog.npy", image) assert_mean_pixel_difference(image, expected_image) From 80df9c069bc5dede0a0a190c736bd4e226e463dc Mon Sep 17 00:00:00 2001 From: yiyixuxu Date: Tue, 4 Jul 2023 03:29:51 +0000 Subject: [PATCH 10/51] add test for inpaint --- ...ipeline_kandinsky2_2_decoder_inpainting.py | 2 +- .../kandinsky_v22/test_kandinsky_inpaint.py | 279 ++++++++++++++++++ 2 files changed, 280 insertions(+), 1 deletion(-) create mode 100644 tests/pipelines/kandinsky_v22/test_kandinsky_inpaint.py diff --git a/src/diffusers/pipelines/kandinsky2_2/pipeline_kandinsky2_2_decoder_inpainting.py b/src/diffusers/pipelines/kandinsky2_2/pipeline_kandinsky2_2_decoder_inpainting.py index 6f4593f1b2ca..03cf955e5054 100644 --- a/src/diffusers/pipelines/kandinsky2_2/pipeline_kandinsky2_2_decoder_inpainting.py +++ b/src/diffusers/pipelines/kandinsky2_2/pipeline_kandinsky2_2_decoder_inpainting.py @@ -360,7 +360,7 @@ def __call__( if isinstance(image_embeds, list): image_embeds = torch.cat(image_embeds, dim=0) - batch_size = image_embeds.shape[0] + batch_size = image_embeds.shape[0] * num_images_per_prompt if isinstance(negative_image_embeds, list): negative_image_embeds = torch.cat(negative_image_embeds, dim=0) diff --git a/tests/pipelines/kandinsky_v22/test_kandinsky_inpaint.py b/tests/pipelines/kandinsky_v22/test_kandinsky_inpaint.py new file mode 100644 index 000000000000..d3edb5d48c92 --- /dev/null +++ b/tests/pipelines/kandinsky_v22/test_kandinsky_inpaint.py @@ -0,0 +1,279 @@ +# coding=utf-8 +# Copyright 2023 HuggingFace Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
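The inpainting tests that follow build the mask as a float array of ones with a zeroed rectangle. At this point in the series the convention, visible in the latent update that patch 03 leaves in place (`latents = init_mask * init_latents_proper + (1 - init_mask) * latents`), is that ones preserve the original image and zeros mark the region to repaint. The snippet below is only an illustrative restatement of that convention, not code from the patch.

```py
import numpy as np

# 1.0 keeps the original image content, 0.0 marks the region the pipeline repaints
mask = np.ones((768, 768), dtype=np.float32)
mask[:250, 250:-250] = 0  # repaint a rectangle at the top of the image (e.g. to add "a hat")

kept_fraction = mask.mean()
print(f"{(1 - kept_fraction) * 100:.1f}% of the image will be repainted")
```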
+ +import gc +import random +import unittest + +import numpy as np +import torch +from PIL import Image + +from diffusers import DDIMScheduler, KandinskyV22InpaintPipeline, KandinskyV22PriorPipeline, UNet2DConditionModel, VQModel +from diffusers.utils import floats_tensor, load_image, load_numpy, slow, torch_device +from diffusers.utils.testing_utils import enable_full_determinism, require_torch_gpu + +from ..test_pipelines_common import PipelineTesterMixin, assert_mean_pixel_difference + + +enable_full_determinism() + + +class KandinskyV22InpaintPipelineFastTests(PipelineTesterMixin, unittest.TestCase): + pipeline_class = KandinskyV22InpaintPipeline + params = ["image_embeds", "negative_image_embeds", "image", "mask_image"] + batch_params = [ + "image_embeds", + "negative_image_embeds", + "image", + "mask_image", + ] + required_optional_params = [ + "generator", + "height", + "width", + "latents", + "guidance_scale", + "num_inference_steps", + "return_dict", + "guidance_scale", + "num_images_per_prompt", + "output_type", + "return_dict", + ] + test_xformers_attention = False + + @property + def text_embedder_hidden_size(self): + return 32 + + @property + def time_input_dim(self): + return 32 + + @property + def block_out_channels_0(self): + return self.time_input_dim + + @property + def time_embed_dim(self): + return self.time_input_dim * 4 + + @property + def cross_attention_dim(self): + return 100 + + @property + def dummy_unet(self): + torch.manual_seed(0) + + model_kwargs = { + "in_channels": 9, + # Out channels is double in channels because predicts mean and variance + "out_channels": 8, + "addition_embed_type": "image", + "down_block_types": ("ResnetDownsampleBlock2D", "SimpleCrossAttnDownBlock2D"), + "up_block_types": ("SimpleCrossAttnUpBlock2D", "ResnetUpsampleBlock2D"), + "mid_block_type": "UNetMidBlock2DSimpleCrossAttn", + "block_out_channels": (self.block_out_channels_0, self.block_out_channels_0 * 2), + "layers_per_block": 1, + "encoder_hid_dim": self.text_embedder_hidden_size, + "encoder_hid_dim_type": "image_proj", + "cross_attention_dim": self.cross_attention_dim, + "attention_head_dim": 4, + "resnet_time_scale_shift": "scale_shift", + "class_embed_type": None, + } + + model = UNet2DConditionModel(**model_kwargs) + return model + + @property + def dummy_movq_kwargs(self): + return { + "block_out_channels": [32, 64], + "down_block_types": ["DownEncoderBlock2D", "AttnDownEncoderBlock2D"], + "in_channels": 3, + "latent_channels": 4, + "layers_per_block": 1, + "norm_num_groups": 8, + "norm_type": "spatial", + "num_vq_embeddings": 12, + "out_channels": 3, + "up_block_types": [ + "AttnUpDecoderBlock2D", + "UpDecoderBlock2D", + ], + "vq_embed_dim": 4, + } + + @property + def dummy_movq(self): + torch.manual_seed(0) + model = VQModel(**self.dummy_movq_kwargs) + return model + + def get_dummy_components(self): + unet = self.dummy_unet + movq = self.dummy_movq + + scheduler = DDIMScheduler( + num_train_timesteps=1000, + beta_schedule="linear", + beta_start=0.00085, + beta_end=0.012, + clip_sample=False, + set_alpha_to_one=False, + steps_offset=1, + prediction_type="epsilon", + thresholding=False, + ) + + components = { + "unet": unet, + "scheduler": scheduler, + "vae": movq, + } + + return components + + def get_dummy_inputs(self, device, seed=0): + image_embeds = floats_tensor((1, self.text_embedder_hidden_size), rng=random.Random(seed)).to(device) + negative_image_embeds = floats_tensor((1, self.text_embedder_hidden_size), rng=random.Random(seed + 1)).to(device) + # create 
init_image + image = floats_tensor((1, 3, 64, 64), rng=random.Random(seed)).to(device) + image = image.cpu().permute(0, 2, 3, 1)[0] + init_image = Image.fromarray(np.uint8(image)).convert("RGB").resize((256, 256)) + # create mask + mask = np.ones((64, 64), dtype=np.float32) + mask[:32, :32] = 0 + + if str(device).startswith("mps"): + generator = torch.manual_seed(seed) + else: + generator = torch.Generator(device=device).manual_seed(seed) + inputs = { + "image": init_image, + "mask_image": mask, + "image_embeds": image_embeds, + "negative_image_embeds": negative_image_embeds, + "generator": generator, + "height": 64, + "width": 64, + "num_inference_steps": 2, + "guidance_scale": 4.0, + "output_type": "np", + } + return inputs + + def test_kandinsky_inpaint(self): + device = "cpu" + + components = self.get_dummy_components() + + pipe = self.pipeline_class(**components) + pipe = pipe.to(device) + + pipe.set_progress_bar_config(disable=None) + + output = pipe(**self.get_dummy_inputs(device)) + image = output.images + + image_from_tuple = pipe( + **self.get_dummy_inputs(device), + return_dict=False, + )[0] + + image_slice = image[0, -3:, -3:, -1] + image_from_tuple_slice = image_from_tuple[0, -3:, -3:, -1] + + print(f"image.shape {image.shape}") + + assert image.shape == (1, 64, 64, 3) + + expected_slice = np.array( + [0.50775903, 0.49527195, 0.48824543, 0.50192237, 0.48644906, 0.49373814, 0.4780598, 0.47234827, 0.48327848] + ) + + assert ( + np.abs(image_slice.flatten() - expected_slice).max() < 1e-2 + ), f" expected_slice {expected_slice}, but got {image_slice.flatten()}" + assert ( + np.abs(image_from_tuple_slice.flatten() - expected_slice).max() < 1e-2 + ), f" expected_slice {expected_slice}, but got {image_from_tuple_slice.flatten()}" + + def test_inference_batch_single_identical(self): + super().test_inference_batch_single_identical(expected_max_diff=3e-3) + + +@slow +@require_torch_gpu +class KandinskyV22InpaintPipelineIntegrationTests(unittest.TestCase): + def tearDown(self): + # clean up the VRAM after each test + super().tearDown() + gc.collect() + torch.cuda.empty_cache() + + def test_kandinsky_inpaint(self): + expected_image = load_numpy( + "https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main" + "/kandinskyv22/kandinskyv22_inpaint_cat_with_hat_fp16.npy" + ) + + init_image = load_image( + "https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main" "/kandinsky/cat.png" + ) + mask = np.ones((768, 768), dtype=np.float32) + mask[:250, 250:-250] = 0 + + prompt = "a hat" + + pipe_prior = KandinskyV22PriorPipeline.from_pretrained( + "kandinsky-community/kandinsky-2-2-prior", torch_dtype=torch.float16 + ) + pipe_prior.to(torch_device) + + pipeline = KandinskyV22InpaintPipeline.from_pretrained( + "kandinsky-community/kandinsky-2-2-decoder-inpaint", torch_dtype=torch.float16 + ) + pipeline = pipeline.to(torch_device) + pipeline.set_progress_bar_config(disable=None) + + generator = torch.Generator(device="cpu").manual_seed(0) + image_emb, zero_image_emb = pipe_prior( + prompt, + generator=generator, + num_inference_steps=5, + negative_prompt="", + ).to_tuple() + + output = pipeline( + image=init_image, + mask_image=mask, + image_embeds=image_emb, + negative_image_embeds=zero_image_emb, + generator=generator, + num_inference_steps=100, + height=768, + width=768, + output_type="np", + ) + + image = output.images[0] + + assert image.shape == (768, 768, 3) + + assert_mean_pixel_difference(image, expected_image) From 
82d76df2ece64b2222d348deea898aeaa49c8270 Mon Sep 17 00:00:00 2001 From: yiyixuxu Date: Tue, 4 Jul 2023 03:34:17 +0000 Subject: [PATCH 11/51] add prior tests --- .../kandinsky_v22/test_kandinsky_prior.py | 236 ++++++++++++++++++ 1 file changed, 236 insertions(+) create mode 100644 tests/pipelines/kandinsky_v22/test_kandinsky_prior.py diff --git a/tests/pipelines/kandinsky_v22/test_kandinsky_prior.py b/tests/pipelines/kandinsky_v22/test_kandinsky_prior.py new file mode 100644 index 000000000000..1b8cefa91f4e --- /dev/null +++ b/tests/pipelines/kandinsky_v22/test_kandinsky_prior.py @@ -0,0 +1,236 @@ +# coding=utf-8 +# Copyright 2023 HuggingFace Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import unittest + +import numpy as np +import torch +from torch import nn +from transformers import ( + CLIPImageProcessor, + CLIPTextConfig, + CLIPTextModelWithProjection, + CLIPTokenizer, + CLIPVisionConfig, + CLIPVisionModelWithProjection, +) + +from diffusers import KandinskyV22PriorPipeline, PriorTransformer, UnCLIPScheduler +from diffusers.utils import torch_device +from diffusers.utils.testing_utils import enable_full_determinism, skip_mps + +from ..test_pipelines_common import PipelineTesterMixin + + +enable_full_determinism() + + +class KandinskyV22PriorPipelineFastTests(PipelineTesterMixin, unittest.TestCase): + pipeline_class = KandinskyV22PriorPipeline + params = ["prompt"] + batch_params = ["prompt", "negative_prompt"] + required_optional_params = [ + "num_images_per_prompt", + "generator", + "num_inference_steps", + "latents", + "negative_prompt", + "guidance_scale", + "output_type", + "return_dict", + ] + test_xformers_attention = False + + @property + def text_embedder_hidden_size(self): + return 32 + + @property + def time_input_dim(self): + return 32 + + @property + def block_out_channels_0(self): + return self.time_input_dim + + @property + def time_embed_dim(self): + return self.time_input_dim * 4 + + @property + def cross_attention_dim(self): + return 100 + + @property + def dummy_tokenizer(self): + tokenizer = CLIPTokenizer.from_pretrained("hf-internal-testing/tiny-random-clip") + return tokenizer + + @property + def dummy_text_encoder(self): + torch.manual_seed(0) + config = CLIPTextConfig( + bos_token_id=0, + eos_token_id=2, + hidden_size=self.text_embedder_hidden_size, + projection_dim=self.text_embedder_hidden_size, + intermediate_size=37, + layer_norm_eps=1e-05, + num_attention_heads=4, + num_hidden_layers=5, + pad_token_id=1, + vocab_size=1000, + ) + return CLIPTextModelWithProjection(config) + + @property + def dummy_prior(self): + torch.manual_seed(0) + + model_kwargs = { + "num_attention_heads": 2, + "attention_head_dim": 12, + "embedding_dim": self.text_embedder_hidden_size, + "num_layers": 1, + } + + model = PriorTransformer(**model_kwargs) + # clip_std and clip_mean is initialized to be 0 so PriorTransformer.post_process_latents will always return 0 - set clip_std to be 1 so it won't return 0 + model.clip_std = nn.Parameter(torch.ones(model.clip_std.shape)) + return 
model + + @property + def dummy_image_encoder(self): + torch.manual_seed(0) + config = CLIPVisionConfig( + hidden_size=self.text_embedder_hidden_size, + image_size=224, + projection_dim=self.text_embedder_hidden_size, + intermediate_size=37, + num_attention_heads=4, + num_channels=3, + num_hidden_layers=5, + patch_size=14, + ) + + model = CLIPVisionModelWithProjection(config) + return model + + @property + def dummy_image_processor(self): + image_processor = CLIPImageProcessor( + crop_size=224, + do_center_crop=True, + do_normalize=True, + do_resize=True, + image_mean=[0.48145466, 0.4578275, 0.40821073], + image_std=[0.26862954, 0.26130258, 0.27577711], + resample=3, + size=224, + ) + + return image_processor + + def get_dummy_components(self): + prior = self.dummy_prior + image_encoder = self.dummy_image_encoder + text_encoder = self.dummy_text_encoder + tokenizer = self.dummy_tokenizer + image_processor = self.dummy_image_processor + + scheduler = UnCLIPScheduler( + variance_type="fixed_small_log", + prediction_type="sample", + num_train_timesteps=1000, + clip_sample=True, + clip_sample_range=10.0, + ) + + components = { + "prior": prior, + "image_encoder": image_encoder, + "text_encoder": text_encoder, + "tokenizer": tokenizer, + "scheduler": scheduler, + "image_processor": image_processor, + } + + return components + + def get_dummy_inputs(self, device, seed=0): + if str(device).startswith("mps"): + generator = torch.manual_seed(seed) + else: + generator = torch.Generator(device=device).manual_seed(seed) + inputs = { + "prompt": "horse", + "generator": generator, + "guidance_scale": 4.0, + "num_inference_steps": 2, + "output_type": "np", + } + return inputs + + def test_kandinsky_prior(self): + device = "cpu" + + components = self.get_dummy_components() + + pipe = self.pipeline_class(**components) + pipe = pipe.to(device) + + pipe.set_progress_bar_config(disable=None) + + output = pipe(**self.get_dummy_inputs(device)) + image = output.image_embeds + + image_from_tuple = pipe( + **self.get_dummy_inputs(device), + return_dict=False, + )[0] + + image_slice = image[0, -10:] + image_from_tuple_slice = image_from_tuple[0, -10:] + + assert image.shape == (1, 32) + + expected_slice = np.array( + [-0.0532, 1.7120, 0.3656, -1.0852, -0.8946, -1.1756, 0.4348, 0.2482, 0.5146, -0.1156] + ) + + assert np.abs(image_slice.flatten() - expected_slice).max() < 1e-2 + assert np.abs(image_from_tuple_slice.flatten() - expected_slice).max() < 1e-2 + + @skip_mps + def test_inference_batch_single_identical(self): + test_max_difference = torch_device == "cpu" + relax_max_difference = True + test_mean_pixel_difference = False + + self._test_inference_batch_single_identical( + test_max_difference=test_max_difference, + relax_max_difference=relax_max_difference, + test_mean_pixel_difference=test_mean_pixel_difference, + ) + + @skip_mps + def test_attention_slicing_forward_pass(self): + test_max_difference = torch_device == "cpu" + test_mean_pixel_difference = False + + self._test_attention_slicing_forward_pass( + test_max_difference=test_max_difference, + test_mean_pixel_difference=test_mean_pixel_difference, + ) From f72b53dbc095e2f03700f48ade7c07f68724b554 Mon Sep 17 00:00:00 2001 From: yiyixuxu Date: Tue, 4 Jul 2023 03:43:58 +0000 Subject: [PATCH 12/51] style --- src/diffusers/__init__.py | 2 +- src/diffusers/models/embeddings.py | 42 +++++++++++-------- src/diffusers/models/unet_2d_condition.py | 18 ++++---- src/diffusers/pipelines/__init__.py | 2 +- .../pipelines/kandinsky2_2/__init__.py | 6 +-- 
.../pipeline_kandinsky2_2_decoder.py | 9 ++-- ...ipeline_kandinsky2_2_decoder_controlnet.py | 11 ++--- ...kandinsky2_2_decoder_controlnet_img2img.py | 38 ++++------------- .../pipeline_kandinsky2_2_decoder_img2img.py | 36 ++++------------ ...ipeline_kandinsky2_2_decoder_inpainting.py | 27 ++++++------ .../pipeline_kandinsky2_2_prior.py | 2 +- .../pipeline_kandinsky2_2_prior_emb2emb.py | 25 ++++++----- .../pipelines/kandinsky_v22/test_kandinsky.py | 10 +++-- .../kandinsky_v22/test_kandinsky_img2img.py | 12 +++++- .../kandinsky_v22/test_kandinsky_inpaint.py | 12 +++++- 15 files changed, 116 insertions(+), 136 deletions(-) diff --git a/src/diffusers/__init__.py b/src/diffusers/__init__.py index 91996df6dac7..c355ef3c73c3 100644 --- a/src/diffusers/__init__.py +++ b/src/diffusers/__init__.py @@ -139,10 +139,10 @@ KandinskyV22ControlnetImg2ImgPipeline, KandinskyV22ControlnetPipeline, KandinskyV22Img2ImgPipeline, + KandinskyV22InpaintPipeline, KandinskyV22Pipeline, KandinskyV22PriorEmb2EmbPipeline, KandinskyV22PriorPipeline, - KandinskyV22InpaintPipeline, LDMTextToImagePipeline, PaintByExamplePipeline, SemanticStableDiffusionPipeline, diff --git a/src/diffusers/models/embeddings.py b/src/diffusers/models/embeddings.py index 348f37d9190f..a5a0c5549ee9 100644 --- a/src/diffusers/models/embeddings.py +++ b/src/diffusers/models/embeddings.py @@ -375,6 +375,7 @@ def forward(self, text_embeds: torch.FloatTensor, image_embeds: torch.FloatTenso return torch.cat([image_text_embeds, text_embeds], dim=1) + class ImageProjection(nn.Module): def __init__( self, @@ -387,6 +388,7 @@ def __init__( self.num_image_text_embeds = num_image_text_embeds self.image_embeds = nn.Linear(image_embed_dim, self.num_image_text_embeds * cross_attention_dim) self.norm = nn.LayerNorm(cross_attention_dim) + def forward(self, image_embeds: torch.FloatTensor): batch_size = image_embeds.shape[0] @@ -449,47 +451,51 @@ def forward(self, text_embeds: torch.FloatTensor, image_embeds: torch.FloatTenso return time_image_embeds + time_text_embeds + class ImageTimeEmbedding(nn.Module): def __init__(self, image_embed_dim: int = 768, time_embed_dim: int = 1536): super().__init__() self.image_proj = nn.Linear(image_embed_dim, time_embed_dim) self.image_norm = nn.LayerNorm(time_embed_dim) - def forward(self,image_embeds: torch.FloatTensor): + def forward(self, image_embeds: torch.FloatTensor): # image time_image_embeds = self.image_proj(image_embeds) time_image_embeds = self.image_norm(time_image_embeds) return time_image_embeds + class ImageHintTimeEmbedding(nn.Module): def __init__(self, image_embed_dim: int = 768, time_embed_dim: int = 1536): super().__init__() self.image_proj = nn.Linear(image_embed_dim, time_embed_dim) self.image_norm = nn.LayerNorm(time_embed_dim) self.input_hint_block = nn.Sequential( - nn.Conv2d(3, 16, 3, padding=1), - nn.SiLU(), - nn.Conv2d(16, 16, 3, padding=1), - nn.SiLU(), - nn.Conv2d(16, 32, 3, padding=1, stride=2), - nn.SiLU(), - nn.Conv2d(32, 32, 3, padding=1), - nn.SiLU(), - nn.Conv2d(32, 96, 3, padding=1, stride=2), - nn.SiLU(), - nn.Conv2d(96, 96, 3, padding=1), - nn.SiLU(), - nn.Conv2d(96, 256, 3, padding=1, stride=2), - nn.SiLU(), - nn.Conv2d(256, 4, 3, padding=1) - ) - def forward(self,image_embeds: torch.FloatTensor, hint: torch.FloatTensor): + nn.Conv2d(3, 16, 3, padding=1), + nn.SiLU(), + nn.Conv2d(16, 16, 3, padding=1), + nn.SiLU(), + nn.Conv2d(16, 32, 3, padding=1, stride=2), + nn.SiLU(), + nn.Conv2d(32, 32, 3, padding=1), + nn.SiLU(), + nn.Conv2d(32, 96, 3, padding=1, stride=2), + nn.SiLU(), + 
nn.Conv2d(96, 96, 3, padding=1), + nn.SiLU(), + nn.Conv2d(96, 256, 3, padding=1, stride=2), + nn.SiLU(), + nn.Conv2d(256, 4, 3, padding=1), + ) + + def forward(self, image_embeds: torch.FloatTensor, hint: torch.FloatTensor): # image time_image_embeds = self.image_proj(image_embeds) time_image_embeds = self.image_norm(time_image_embeds) hint = self.input_hint_block(hint) return time_image_embeds, hint + class AttentionPooling(nn.Module): # Copied from https://github.com/deep-floyd/IF/blob/2f91391f27dd3c468bf174be5805b4cc92980c0b/deepfloyd_if/model/nn.py#L54 diff --git a/src/diffusers/models/unet_2d_condition.py b/src/diffusers/models/unet_2d_condition.py index fce6307d9be9..be3d4da8b46b 100644 --- a/src/diffusers/models/unet_2d_condition.py +++ b/src/diffusers/models/unet_2d_condition.py @@ -25,11 +25,11 @@ from .attention_processor import AttentionProcessor, AttnProcessor from .embeddings import ( GaussianFourierProjection, - TextImageProjection, + ImageHintTimeEmbedding, ImageProjection, - TextImageTimeEmbedding, ImageTimeEmbedding, - ImageHintTimeEmbedding, + TextImageProjection, + TextImageTimeEmbedding, TextTimeEmbedding, TimestepEmbedding, Timesteps, @@ -358,15 +358,11 @@ def __init__( text_embed_dim=cross_attention_dim, image_embed_dim=cross_attention_dim, time_embed_dim=time_embed_dim ) elif addition_embed_type == "image": - #Kadinsky 2.2 - self.add_embedding = ImageTimeEmbedding( - image_embed_dim=encoder_hid_dim, time_embed_dim=time_embed_dim - ) + # Kadinsky 2.2 + self.add_embedding = ImageTimeEmbedding(image_embed_dim=encoder_hid_dim, time_embed_dim=time_embed_dim) elif addition_embed_type == "image_hint": - #Kadinsky 2.2 - self.add_embedding = ImageHintTimeEmbedding( - image_embed_dim=encoder_hid_dim, time_embed_dim=time_embed_dim - ) + # Kadinsky 2.2 + self.add_embedding = ImageHintTimeEmbedding(image_embed_dim=encoder_hid_dim, time_embed_dim=time_embed_dim) elif addition_embed_type is not None: raise ValueError(f"addition_embed_type: {addition_embed_type} must be None, 'text' or 'text_image'.") diff --git a/src/diffusers/pipelines/__init__.py b/src/diffusers/pipelines/__init__.py index eb06a6c8a6a5..06cd9b135424 100644 --- a/src/diffusers/pipelines/__init__.py +++ b/src/diffusers/pipelines/__init__.py @@ -67,10 +67,10 @@ KandinskyV22ControlnetImg2ImgPipeline, KandinskyV22ControlnetPipeline, KandinskyV22Img2ImgPipeline, + KandinskyV22InpaintPipeline, KandinskyV22Pipeline, KandinskyV22PriorEmb2EmbPipeline, KandinskyV22PriorPipeline, - KandinskyV22InpaintPipeline, ) from .latent_diffusion import LDMTextToImagePipeline from .paint_by_example import PaintByExamplePipeline diff --git a/src/diffusers/pipelines/kandinsky2_2/__init__.py b/src/diffusers/pipelines/kandinsky2_2/__init__.py index 55e88abdd2ff..c94725d4ae62 100644 --- a/src/diffusers/pipelines/kandinsky2_2/__init__.py +++ b/src/diffusers/pipelines/kandinsky2_2/__init__.py @@ -1,7 +1,7 @@ -from .pipeline_kandinsky2_2_decoder_controlnet_img2img import KandinskyV22ControlnetImg2ImgPipeline +from .pipeline_kandinsky2_2_decoder import KandinskyV22Pipeline from .pipeline_kandinsky2_2_decoder_controlnet import KandinskyV22ControlnetPipeline +from .pipeline_kandinsky2_2_decoder_controlnet_img2img import KandinskyV22ControlnetImg2ImgPipeline from .pipeline_kandinsky2_2_decoder_img2img import KandinskyV22Img2ImgPipeline from .pipeline_kandinsky2_2_decoder_inpainting import KandinskyV22InpaintPipeline -from .pipeline_kandinsky2_2_decoder import KandinskyV22Pipeline +from .pipeline_kandinsky2_2_prior import KandinskyV22PriorPipeline 
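# Usage sketch (illustrative only, not part of this module): the prior and decoder pipelines
# exported here are meant to be chained — the prior maps a text prompt to CLIP image embeddings,
# and a decoder maps those embeddings to pixels. The checkpoint ids below are the ones used in the
# integration tests added by this patch.
#
#     import torch
#     from diffusers import KandinskyV22PriorPipeline, KandinskyV22Pipeline
#
#     prior = KandinskyV22PriorPipeline.from_pretrained(
#         "kandinsky-community/kandinsky-2-2-prior", torch_dtype=torch.float16
#     ).to("cuda")
#     image_embeds, negative_image_embeds = prior("A portrait of a cat, 4k photo").to_tuple()
#
#     decoder = KandinskyV22Pipeline.from_pretrained(
#         "kandinsky-community/kandinsky-2-2-decoder", torch_dtype=torch.float16
#     ).to("cuda")
#     image = decoder(
#         image_embeds=image_embeds,
#         negative_image_embeds=negative_image_embeds,
#         height=768,
#         width=768,
#     ).images[0]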
from .pipeline_kandinsky2_2_prior_emb2emb import KandinskyV22PriorEmb2EmbPipeline -from .pipeline_kandinsky2_2_prior import KandinskyV22PriorPipeline \ No newline at end of file diff --git a/src/diffusers/pipelines/kandinsky2_2/pipeline_kandinsky2_2_decoder.py b/src/diffusers/pipelines/kandinsky2_2/pipeline_kandinsky2_2_decoder.py index 02961faf3c32..d7d50b76d9c7 100644 --- a/src/diffusers/pipelines/kandinsky2_2/pipeline_kandinsky2_2_decoder.py +++ b/src/diffusers/pipelines/kandinsky2_2/pipeline_kandinsky2_2_decoder.py @@ -106,7 +106,6 @@ def prepare_latents(self, shape, dtype, device, generator, latents, scheduler): latents = latents * scheduler.init_noise_sigma return latents - def enable_sequential_cpu_offload(self, gpu_id=0): r""" Offloads all models to CPU using accelerate, significantly reducing memory usage. When called, the pipeline's @@ -238,14 +237,12 @@ def __call__( batch_size = image_embeds.shape[0] * num_images_per_prompt if isinstance(negative_image_embeds, list): negative_image_embeds = torch.cat(negative_image_embeds, dim=0) - + if do_classifier_free_guidance: image_embeds = image_embeds.repeat_interleave(num_images_per_prompt, dim=0) negative_image_embeds = negative_image_embeds.repeat_interleave(num_images_per_prompt, dim=0) - image_embeds = torch.cat([negative_image_embeds, image_embeds], dim=0).to( - dtype=self.unet.dtype, device=device - ) + image_embeds = torch.cat([negative_image_embeds, image_embeds], dim=0).to(dtype=self.unet.dtype, device=device) self.scheduler.set_timesteps(num_inference_steps, device=device) timesteps_tensor = self.scheduler.timesteps @@ -317,4 +314,4 @@ def __call__( if not return_dict: return (image,) - return ImagePipelineOutput(images=image) \ No newline at end of file + return ImagePipelineOutput(images=image) diff --git a/src/diffusers/pipelines/kandinsky2_2/pipeline_kandinsky2_2_decoder_controlnet.py b/src/diffusers/pipelines/kandinsky2_2/pipeline_kandinsky2_2_decoder_controlnet.py index 5a08f4d11ad6..cbe616b8fb9d 100644 --- a/src/diffusers/pipelines/kandinsky2_2/pipeline_kandinsky2_2_decoder_controlnet.py +++ b/src/diffusers/pipelines/kandinsky2_2/pipeline_kandinsky2_2_decoder_controlnet.py @@ -25,7 +25,6 @@ is_accelerate_version, logging, randn_tensor, - replace_example_docstring, ) @@ -82,7 +81,6 @@ def prepare_latents(self, shape, dtype, device, generator, latents, scheduler): latents = latents * scheduler.init_noise_sigma return latents - def enable_sequential_cpu_offload(self, gpu_id=0): r""" Offloads all models to CPU using accelerate, significantly reducing memory usage. 
When called, the pipeline's @@ -226,9 +224,7 @@ def __call__( image_embeds = image_embeds.repeat_interleave(num_images_per_prompt, dim=0) negative_image_embeds = negative_image_embeds.repeat_interleave(num_images_per_prompt, dim=0) - image_embeds = torch.cat([negative_image_embeds, image_embeds], dim=0).to( - dtype=self.unet.dtype, device=device - ) + image_embeds = torch.cat([negative_image_embeds, image_embeds], dim=0).to(dtype=self.unet.dtype, device=device) self.scheduler.set_timesteps(num_inference_steps, device=device) timesteps_tensor = self.scheduler.timesteps @@ -251,8 +247,7 @@ def __call__( # expand the latents if we are doing classifier free guidance latent_model_input = torch.cat([latents] * 2) if do_classifier_free_guidance else latents - added_cond_kwargs = {"image_embeds": image_embeds, - "hint": hint} + added_cond_kwargs = {"image_embeds": image_embeds, "hint": hint} noise_pred = self.unet( sample=latent_model_input, timestep=t, @@ -301,4 +296,4 @@ def __call__( if not return_dict: return (image,) - return ImagePipelineOutput(images=image) \ No newline at end of file + return ImagePipelineOutput(images=image) diff --git a/src/diffusers/pipelines/kandinsky2_2/pipeline_kandinsky2_2_decoder_controlnet_img2img.py b/src/diffusers/pipelines/kandinsky2_2/pipeline_kandinsky2_2_decoder_controlnet_img2img.py index 2f9d67e5ab2c..e6d86de6f75f 100644 --- a/src/diffusers/pipelines/kandinsky2_2/pipeline_kandinsky2_2_decoder_controlnet_img2img.py +++ b/src/diffusers/pipelines/kandinsky2_2/pipeline_kandinsky2_2_decoder_controlnet_img2img.py @@ -13,10 +13,11 @@ # limitations under the License. from typing import List, Optional, Union -import PIL -from PIL import Image + import numpy as np +import PIL import torch +from PIL import Image from ...models import UNet2DConditionModel, VQModel from ...pipelines import DiffusionPipeline @@ -27,7 +28,6 @@ is_accelerate_version, logging, randn_tensor, - replace_example_docstring, ) @@ -89,8 +89,8 @@ def get_timesteps(self, num_inference_steps, strength, device): t_start = max(num_inference_steps - init_timestep, 0) timesteps = self.scheduler.timesteps[t_start:] - return timesteps, num_inference_steps - t_start - + return timesteps, num_inference_steps - t_start + def prepare_latents(self, image, timestep, batch_size, num_images_per_prompt, dtype, device, generator=None): if not isinstance(image, (torch.Tensor, PIL.Image.Image, list)): raise ValueError( @@ -121,22 +121,7 @@ def prepare_latents(self, image, timestep, batch_size, num_images_per_prompt, dt init_latents = self.vae.config.scaling_factor * init_latents - if batch_size > init_latents.shape[0] and batch_size % init_latents.shape[0] == 0: - # expand init_latents for batch_size - deprecation_message = ( - f"You have passed {batch_size} text prompts (`prompt`), but only {init_latents.shape[0]} initial" - " images (`image`). Initial images are now duplicating to match the number of text prompts. Note" - " that this behavior is deprecated and will be removed in a version 1.0.0. Please make sure to update" - " your script to pass as many initial images as text prompts to suppress this warning." - ) - additional_image_per_prompt = batch_size // init_latents.shape[0] - init_latents = torch.cat([init_latents] * additional_image_per_prompt, dim=0) - elif batch_size > init_latents.shape[0] and batch_size % init_latents.shape[0] != 0: - raise ValueError( - f"Cannot duplicate `image` of batch size {init_latents.shape[0]} to {batch_size} text prompts." 
- ) - else: - init_latents = torch.cat([init_latents], dim=0) + init_latents = torch.cat([init_latents], dim=0) shape = init_latents.shape noise = randn_tensor(shape, generator=generator, device=device, dtype=dtype) @@ -150,7 +135,6 @@ def prepare_latents(self, image, timestep, batch_size, num_images_per_prompt, dt return latents - def enable_sequential_cpu_offload(self, gpu_id=0): r""" Offloads all models to CPU using accelerate, significantly reducing memory usage. When called, the pipeline's @@ -296,9 +280,7 @@ def __call__( image_embeds = image_embeds.repeat_interleave(num_images_per_prompt, dim=0) negative_image_embeds = negative_image_embeds.repeat_interleave(num_images_per_prompt, dim=0) - image_embeds = torch.cat([negative_image_embeds, image_embeds], dim=0).to( - dtype=self.unet.dtype, device=device - ) + image_embeds = torch.cat([negative_image_embeds, image_embeds], dim=0).to(dtype=self.unet.dtype, device=device) if not isinstance(image, list): image = [image] @@ -307,8 +289,6 @@ def __call__( f"Input is in incorrect format: {[type(i) for i in image]}. Currently, we only support PIL image and pytorch tensor" ) - num_channels_latents = self.vae.config.latent_channels - image = torch.cat([prepare_image(i, width, height) for i in image], dim=0) image = image.to(dtype=image_embeds.dtype, device=device) @@ -325,7 +305,7 @@ def __call__( # expand the latents if we are doing classifier free guidance latent_model_input = torch.cat([latents] * 2) if do_classifier_free_guidance else latents - added_cond_kwargs = {"image_embeds": image_embeds, 'hint': hint} + added_cond_kwargs = {"image_embeds": image_embeds, "hint": hint} noise_pred = self.unet( sample=latent_model_input, timestep=t, @@ -378,4 +358,4 @@ def __call__( if not return_dict: return (image,) - return ImagePipelineOutput(images=image) \ No newline at end of file + return ImagePipelineOutput(images=image) diff --git a/src/diffusers/pipelines/kandinsky2_2/pipeline_kandinsky2_2_decoder_img2img.py b/src/diffusers/pipelines/kandinsky2_2/pipeline_kandinsky2_2_decoder_img2img.py index 8412bda89a54..85d1c921c262 100644 --- a/src/diffusers/pipelines/kandinsky2_2/pipeline_kandinsky2_2_decoder_img2img.py +++ b/src/diffusers/pipelines/kandinsky2_2/pipeline_kandinsky2_2_decoder_img2img.py @@ -13,10 +13,11 @@ # limitations under the License. 
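# Note on the denoising schedule used by this img2img pipeline: `strength` controls how much of the
# scheduler's timetable is actually run. In `get_timesteps` below, init_timestep is computed (as in
# the other diffusers img2img pipelines) as
#     init_timestep = min(int(num_inference_steps * strength), num_inference_steps)
#     t_start       = max(num_inference_steps - init_timestep, 0)
# and only `self.scheduler.timesteps[t_start:]` are used. For example, num_inference_steps=100 with
# strength=0.3 gives t_start=70: the init image is noised to timesteps[70] and only the last 30
# steps are denoised, so most of the original layout is preserved.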
from typing import List, Optional, Union -import PIL -from PIL import Image + import numpy as np +import PIL import torch +from PIL import Image from ...models import UNet2DConditionModel, VQModel from ...pipelines import DiffusionPipeline @@ -27,7 +28,6 @@ is_accelerate_version, logging, randn_tensor, - replace_example_docstring, ) @@ -89,8 +89,8 @@ def get_timesteps(self, num_inference_steps, strength, device): t_start = max(num_inference_steps - init_timestep, 0) timesteps = self.scheduler.timesteps[t_start:] - return timesteps, num_inference_steps - t_start - + return timesteps, num_inference_steps - t_start + def prepare_latents(self, image, timestep, batch_size, num_images_per_prompt, dtype, device, generator=None): if not isinstance(image, (torch.Tensor, PIL.Image.Image, list)): raise ValueError( @@ -121,22 +121,7 @@ def prepare_latents(self, image, timestep, batch_size, num_images_per_prompt, dt init_latents = self.vae.config.scaling_factor * init_latents - if batch_size > init_latents.shape[0] and batch_size % init_latents.shape[0] == 0: - # expand init_latents for batch_size - deprecation_message = ( - f"You have passed {batch_size} text prompts (`prompt`), but only {init_latents.shape[0]} initial" - " images (`image`). Initial images are now duplicating to match the number of text prompts. Note" - " that this behavior is deprecated and will be removed in a version 1.0.0. Please make sure to update" - " your script to pass as many initial images as text prompts to suppress this warning." - ) - additional_image_per_prompt = batch_size // init_latents.shape[0] - init_latents = torch.cat([init_latents] * additional_image_per_prompt, dim=0) - elif batch_size > init_latents.shape[0] and batch_size % init_latents.shape[0] != 0: - raise ValueError( - f"Cannot duplicate `image` of batch size {init_latents.shape[0]} to {batch_size} text prompts." - ) - else: - init_latents = torch.cat([init_latents], dim=0) + init_latents = torch.cat([init_latents], dim=0) shape = init_latents.shape noise = randn_tensor(shape, generator=generator, device=device, dtype=dtype) @@ -150,7 +135,6 @@ def prepare_latents(self, image, timestep, batch_size, num_images_per_prompt, dt return latents - def enable_sequential_cpu_offload(self, gpu_id=0): r""" Offloads all models to CPU using accelerate, significantly reducing memory usage. When called, the pipeline's @@ -293,9 +277,7 @@ def __call__( image_embeds = image_embeds.repeat_interleave(num_images_per_prompt, dim=0) negative_image_embeds = negative_image_embeds.repeat_interleave(num_images_per_prompt, dim=0) - image_embeds = torch.cat([negative_image_embeds, image_embeds], dim=0).to( - dtype=self.unet.dtype, device=device - ) + image_embeds = torch.cat([negative_image_embeds, image_embeds], dim=0).to(dtype=self.unet.dtype, device=device) if not isinstance(image, list): image = [image] @@ -304,8 +286,6 @@ def __call__( f"Input is in incorrect format: {[type(i) for i in image]}. 
Currently, we only support PIL image and pytorch tensor" ) - num_channels_latents = self.unet.config.in_channels - image = torch.cat([prepare_image(i, width, height) for i in image], dim=0) image = image.to(dtype=image_embeds.dtype, device=device) @@ -375,4 +355,4 @@ def __call__( if not return_dict: return (image,) - return ImagePipelineOutput(images=image) \ No newline at end of file + return ImagePipelineOutput(images=image) diff --git a/src/diffusers/pipelines/kandinsky2_2/pipeline_kandinsky2_2_decoder_inpainting.py b/src/diffusers/pipelines/kandinsky2_2/pipeline_kandinsky2_2_decoder_inpainting.py index 03cf955e5054..25d8154d9b7e 100644 --- a/src/diffusers/pipelines/kandinsky2_2/pipeline_kandinsky2_2_decoder_inpainting.py +++ b/src/diffusers/pipelines/kandinsky2_2/pipeline_kandinsky2_2_decoder_inpainting.py @@ -12,9 +12,14 @@ # See the License for the specific language governing permissions and # limitations under the License. +from copy import deepcopy from typing import List, Optional, Union +import numpy as np +import PIL import torch +import torch.nn.functional as F +from PIL import Image from ...models import UNet2DConditionModel, VQModel from ...pipelines import DiffusionPipeline @@ -25,13 +30,9 @@ is_accelerate_version, logging, randn_tensor, - replace_example_docstring, ) -import numpy as np -import PIL -from PIL import Image -import torch.nn.functional as F -from copy import deepcopy + + logger = logging.get_logger(__name__) # pylint: disable=invalid-name @@ -44,6 +45,7 @@ def get_new_h_w(h, w, scale_factor=8): new_w += 1 return new_h * scale_factor, new_w * scale_factor + def prepare_mask(masks): prepared_masks = [] for mask in masks: @@ -67,6 +69,7 @@ def prepare_mask(masks): prepared_masks.append(mask) return torch.stack(prepared_masks, dim=0) + def prepare_mask_and_masked_image(image, mask, height, width): r""" Prepares a pair (mask, image) to be consumed by the Kandinsky inpaint pipeline. This means that those inputs will @@ -182,6 +185,7 @@ def prepare_mask_and_masked_image(image, mask, height, width): return mask, image + class KandinskyV22InpaintPipeline(DiffusionPipeline): """ Pipeline for text-to-image generation using Kandinsky @@ -222,7 +226,6 @@ def prepare_latents(self, shape, dtype, device, generator, latents, scheduler): latents = latents * scheduler.init_noise_sigma return latents - def enable_sequential_cpu_offload(self, gpu_id=0): r""" Offloads all models to CPU using accelerate, significantly reducing memory usage. 
When called, the pipeline's @@ -368,9 +371,7 @@ def __call__( image_embeds = image_embeds.repeat_interleave(num_images_per_prompt, dim=0) negative_image_embeds = negative_image_embeds.repeat_interleave(num_images_per_prompt, dim=0) - image_embeds = torch.cat([negative_image_embeds, image_embeds], dim=0).to( - dtype=self.unet.dtype, device=device - ) + image_embeds = torch.cat([negative_image_embeds, image_embeds], dim=0).to(dtype=self.unet.dtype, device=device) self.scheduler.set_timesteps(num_inference_steps, device=device) timesteps_tensor = self.scheduler.timesteps @@ -399,7 +400,7 @@ def __call__( masked_image = masked_image.repeat(2, 1, 1, 1) num_channels_latents = self.vae.config.latent_channels - + height, width = get_new_h_w(height, width, self.vae_scale_factor) # create initial latent @@ -416,7 +417,7 @@ def __call__( # expand the latents if we are doing classifier free guidance latent_model_input = torch.cat([latents] * 2) if do_classifier_free_guidance else latents latent_model_input = torch.cat([latent_model_input, masked_image, mask_image], dim=1) - + added_cond_kwargs = {"image_embeds": image_embeds} noise_pred = self.unet( sample=latent_model_input, @@ -474,4 +475,4 @@ def __call__( if not return_dict: return (image,) - return ImagePipelineOutput(images=image) \ No newline at end of file + return ImagePipelineOutput(images=image) diff --git a/src/diffusers/pipelines/kandinsky2_2/pipeline_kandinsky2_2_prior.py b/src/diffusers/pipelines/kandinsky2_2/pipeline_kandinsky2_2_prior.py index 24948291acea..2d7ab943b03f 100644 --- a/src/diffusers/pipelines/kandinsky2_2/pipeline_kandinsky2_2_prior.py +++ b/src/diffusers/pipelines/kandinsky2_2/pipeline_kandinsky2_2_prior.py @@ -539,4 +539,4 @@ def __call__( if not return_dict: return (image_embeddings, zero_embeds) - return KandinskyPriorPipelineOutput(image_embeds=image_embeddings, negative_image_embeds=zero_embeds) \ No newline at end of file + return KandinskyPriorPipelineOutput(image_embeds=image_embeddings, negative_image_embeds=zero_embeds) diff --git a/src/diffusers/pipelines/kandinsky2_2/pipeline_kandinsky2_2_prior_emb2emb.py b/src/diffusers/pipelines/kandinsky2_2/pipeline_kandinsky2_2_prior_emb2emb.py index b6b06b0e86ae..888c97736e0d 100644 --- a/src/diffusers/pipelines/kandinsky2_2/pipeline_kandinsky2_2_prior_emb2emb.py +++ b/src/diffusers/pipelines/kandinsky2_2/pipeline_kandinsky2_2_prior_emb2emb.py @@ -14,11 +14,12 @@ is_accelerate_available, logging, randn_tensor, - replace_example_docstring, ) logger = logging.get_logger(__name__) # pylint: disable=invalid-name + + @dataclass class KandinskyPriorPipelineOutput(BaseOutput): """ @@ -80,8 +81,8 @@ def get_timesteps(self, num_inference_steps, strength, device): t_start = max(num_inference_steps - init_timestep, 0) timesteps = self.scheduler.timesteps[t_start:] - return timesteps, num_inference_steps - t_start - + return timesteps, num_inference_steps - t_start + @torch.no_grad() def interpolate( self, @@ -172,11 +173,9 @@ def interpolate( image_emb = torch.cat(image_embeddings).sum(dim=0) - return KandinskyPriorPipelineOutput(image_embeds=image_emb, negative_image_embeds=torch.randn_like(image_emb)) def prepare_latents(self, emb, timestep, batch_size, num_images_per_prompt, dtype, device, generator=None): - emb = emb.to(device=device, dtype=dtype) batch_size = batch_size * num_images_per_prompt @@ -200,10 +199,11 @@ def prepare_latents(self, emb, timestep, batch_size, num_images_per_prompt, dtyp try: init_latents = self.scheduler.add_noise(init_latents, noise, timestep) 
except: - print('error') + print("error") latents = init_latents return latents + def get_zero_embed(self, batch_size=1, device=None): device = device or self.device zero_img = torch.zeros(1, 3, self.image_encoder.config.image_size, self.image_encoder.config.image_size).to( @@ -426,7 +426,7 @@ def __call__( batch_size = len(prompt) batch_size = batch_size * num_images_per_prompt - + do_classifier_free_guidance = guidance_scale > 1.0 prompt_embeds, text_encoder_hidden_states, text_mask = self._encode_prompt( prompt, device, num_images_per_prompt, do_classifier_free_guidance, negative_prompt @@ -434,12 +434,17 @@ def __call__( # prior self.scheduler.set_timesteps(num_inference_steps, device=device) - embedding_dim = self.prior.config.embedding_dim latents = emb.repeat_interleave(num_images_per_prompt, dim=0) timesteps, num_inference_steps = self.get_timesteps(num_inference_steps, strength, device) latent_timestep = timesteps[:1].repeat(batch_size * num_images_per_prompt) latents = self.prepare_latents( - latents, latent_timestep, batch_size // num_images_per_prompt, num_images_per_prompt, prompt_embeds.dtype, device, generator + latents, + latent_timestep, + batch_size // num_images_per_prompt, + num_images_per_prompt, + prompt_embeds.dtype, + device, + generator, ) print(latents.shape, prompt_embeds.shape) for i, t in enumerate(self.progress_bar(timesteps)): @@ -493,4 +498,4 @@ def __call__( if not return_dict: return (image_embeddings, zero_embeds) - return KandinskyPriorPipelineOutput(image_embeds=image_embeddings, negative_image_embeds=zero_embeds) \ No newline at end of file + return KandinskyPriorPipelineOutput(image_embeds=image_embeddings, negative_image_embeds=zero_embeds) diff --git a/tests/pipelines/kandinsky_v22/test_kandinsky.py b/tests/pipelines/kandinsky_v22/test_kandinsky.py index c644ee195b79..ac970455b41b 100644 --- a/tests/pipelines/kandinsky_v22/test_kandinsky.py +++ b/tests/pipelines/kandinsky_v22/test_kandinsky.py @@ -147,7 +147,9 @@ def get_dummy_components(self): def get_dummy_inputs(self, device, seed=0): image_embeds = floats_tensor((1, self.text_embedder_hidden_size), rng=random.Random(seed)).to(device) - negative_image_embeds = floats_tensor((1, self.text_embedder_hidden_size), rng=random.Random(seed + 1)).to(device) + negative_image_embeds = floats_tensor((1, self.text_embedder_hidden_size), rng=random.Random(seed + 1)).to( + device + ) if str(device).startswith("mps"): generator = torch.manual_seed(seed) else: @@ -188,7 +190,7 @@ def test_kandinsky(self): assert image.shape == (1, 64, 64, 3) expected_slice = np.array( - [0.6237976, 1., 0.36441332, 1., 0.70639634, 0.29877186, 0.85652125, 0.5216843, 0.54454046] + [0.6237976, 1.0, 0.36441332, 1.0, 0.70639634, 0.29877186, 0.85652125, 0.5216843, 0.54454046] ) assert ( @@ -220,7 +222,9 @@ def test_kandinsky_text2img(self): ) pipe_prior.to(torch_device) - pipeline = KandinskyV22Pipeline.from_pretrained("kandinsky-community/kandinsky-2-2-decoder", torch_dtype=torch.float16) + pipeline = KandinskyV22Pipeline.from_pretrained( + "kandinsky-community/kandinsky-2-2-decoder", torch_dtype=torch.float16 + ) pipeline = pipeline.to(torch_device) pipeline.set_progress_bar_config(disable=None) diff --git a/tests/pipelines/kandinsky_v22/test_kandinsky_img2img.py b/tests/pipelines/kandinsky_v22/test_kandinsky_img2img.py index a8ef916a27f5..4261b80dd20b 100644 --- a/tests/pipelines/kandinsky_v22/test_kandinsky_img2img.py +++ b/tests/pipelines/kandinsky_v22/test_kandinsky_img2img.py @@ -21,7 +21,13 @@ import torch from PIL import Image 
-from diffusers import DDIMScheduler, KandinskyV22Img2ImgPipeline, KandinskyV22PriorPipeline, UNet2DConditionModel, VQModel +from diffusers import ( + DDIMScheduler, + KandinskyV22Img2ImgPipeline, + KandinskyV22PriorPipeline, + UNet2DConditionModel, + VQModel, +) from diffusers.utils import floats_tensor, load_image, load_numpy, slow, torch_device from diffusers.utils.testing_utils import enable_full_determinism, require_torch_gpu @@ -152,7 +158,9 @@ def get_dummy_components(self): def get_dummy_inputs(self, device, seed=0): image_embeds = floats_tensor((1, self.text_embedder_hidden_size), rng=random.Random(seed)).to(device) - negative_image_embeds = floats_tensor((1, self.text_embedder_hidden_size), rng=random.Random(seed + 1)).to(device) + negative_image_embeds = floats_tensor((1, self.text_embedder_hidden_size), rng=random.Random(seed + 1)).to( + device + ) # create init_image image = floats_tensor((1, 3, 64, 64), rng=random.Random(seed)).to(device) image = image.cpu().permute(0, 2, 3, 1)[0] diff --git a/tests/pipelines/kandinsky_v22/test_kandinsky_inpaint.py b/tests/pipelines/kandinsky_v22/test_kandinsky_inpaint.py index d3edb5d48c92..01692dac0bb4 100644 --- a/tests/pipelines/kandinsky_v22/test_kandinsky_inpaint.py +++ b/tests/pipelines/kandinsky_v22/test_kandinsky_inpaint.py @@ -21,7 +21,13 @@ import torch from PIL import Image -from diffusers import DDIMScheduler, KandinskyV22InpaintPipeline, KandinskyV22PriorPipeline, UNet2DConditionModel, VQModel +from diffusers import ( + DDIMScheduler, + KandinskyV22InpaintPipeline, + KandinskyV22PriorPipeline, + UNet2DConditionModel, + VQModel, +) from diffusers.utils import floats_tensor, load_image, load_numpy, slow, torch_device from diffusers.utils.testing_utils import enable_full_determinism, require_torch_gpu @@ -151,7 +157,9 @@ def get_dummy_components(self): def get_dummy_inputs(self, device, seed=0): image_embeds = floats_tensor((1, self.text_embedder_hidden_size), rng=random.Random(seed)).to(device) - negative_image_embeds = floats_tensor((1, self.text_embedder_hidden_size), rng=random.Random(seed + 1)).to(device) + negative_image_embeds = floats_tensor((1, self.text_embedder_hidden_size), rng=random.Random(seed + 1)).to( + device + ) # create init_image image = floats_tensor((1, 3, 64, 64), rng=random.Random(seed)).to(device) image = image.cpu().permute(0, 2, 3, 1)[0] From 8d05dbfc86434d5aafee57e2f38dffbbde2b0c95 Mon Sep 17 00:00:00 2001 From: yiyixuxu Date: Tue, 4 Jul 2023 03:44:50 +0000 Subject: [PATCH 13/51] copies --- .../versatile_diffusion/modeling_text_unet.py | 46 +++++++- .../dummy_torch_and_transformers_objects.py | 105 ++++++++++++++++++ 2 files changed, 148 insertions(+), 3 deletions(-) diff --git a/src/diffusers/pipelines/versatile_diffusion/modeling_text_unet.py b/src/diffusers/pipelines/versatile_diffusion/modeling_text_unet.py index 0dd2351e6076..d5005ce09920 100644 --- a/src/diffusers/pipelines/versatile_diffusion/modeling_text_unet.py +++ b/src/diffusers/pipelines/versatile_diffusion/modeling_text_unet.py @@ -401,7 +401,12 @@ def __init__( image_embed_dim=cross_attention_dim, cross_attention_dim=cross_attention_dim, ) - + elif encoder_hid_dim_type == "image_proj": + # Kadinsky 2.2 + self.encoder_hid_proj = ImageProjection( + image_embed_dim=encoder_hid_dim, + cross_attention_dim=cross_attention_dim, + ) elif encoder_hid_dim_type is not None: raise ValueError( f"encoder_hid_dim_type: {encoder_hid_dim_type} must be None, 'text_proj' or 'text_image_proj'." 
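These copied branches behave like the ones added to UNet2DConditionModel: with encoder_hid_dim_type="image_proj" the CLIP image embedding is projected into a sequence of image tokens that serve as the cross-attention context, and with addition_embed_type="image" the same embedding is also folded into the time embedding. A minimal sketch of driving both paths through added_cond_kwargs follows; the tiny configuration is hypothetical, modelled on the dummy UNets in the tests added by this patch, and the tensor shapes are illustrative only.

    import torch
    from diffusers import UNet2DConditionModel

    unet = UNet2DConditionModel(
        in_channels=4,
        out_channels=8,  # mean and variance are predicted, so 2 * latent channels
        block_out_channels=(32, 64),
        down_block_types=("ResnetDownsampleBlock2D", "SimpleCrossAttnDownBlock2D"),
        up_block_types=("SimpleCrossAttnUpBlock2D", "ResnetUpsampleBlock2D"),
        mid_block_type="UNetMidBlock2DSimpleCrossAttn",
        layers_per_block=1,
        cross_attention_dim=100,
        attention_head_dim=4,
        encoder_hid_dim=32,                 # CLIP image embedding size
        encoder_hid_dim_type="image_proj",  # project image embeds into cross-attention tokens
        addition_embed_type="image",        # also add the image embeds to the time embedding
        resnet_time_scale_shift="scale_shift",
    )

    sample = torch.randn(1, 4, 16, 16)   # noisy movq latents
    image_embeds = torch.randn(1, 32)    # CLIP image embedding produced by the prior
    noise_pred = unet(
        sample=sample,
        timestep=torch.tensor([10]),
        encoder_hidden_states=None,      # replaced internally by the projected image embeds
        added_cond_kwargs={"image_embeds": image_embeds},
    ).sample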
@@ -454,6 +459,12 @@ def __init__( self.add_embedding = TextImageTimeEmbedding( text_embed_dim=cross_attention_dim, image_embed_dim=cross_attention_dim, time_embed_dim=time_embed_dim ) + elif addition_embed_type == "image": + # Kadinsky 2.2 + self.add_embedding = ImageTimeEmbedding(image_embed_dim=encoder_hid_dim, time_embed_dim=time_embed_dim) + elif addition_embed_type == "image_hint": + # Kadinsky 2.2 + self.add_embedding = ImageHintTimeEmbedding(image_embed_dim=encoder_hid_dim, time_embed_dim=time_embed_dim) elif addition_embed_type is not None: raise ValueError(f"addition_embed_type: {addition_embed_type} must be None, 'text' or 'text_image'.") @@ -895,10 +906,31 @@ def forward( image_embs = added_cond_kwargs.get("image_embeds") text_embs = added_cond_kwargs.get("text_embeds", encoder_hidden_states) - aug_emb = self.add_embedding(text_embs, image_embs) emb = emb + aug_emb + elif self.config.addition_embed_type == "image": + # Kadinsky 2.2 - style + if "image_embeds" not in added_cond_kwargs: + raise ValueError( + f"{self.__class__} has the config param `addition_embed_type` set to 'image' which requires the" + " keyword argument `image_embeds` to be passed in `added_cond_kwargs`" + ) + image_embs = added_cond_kwargs.get("image_embeds") + aug_emb = self.add_embedding(image_embs) + emb = emb + aug_emb + elif self.config.addition_embed_type == "image_hint": + # Kadinsky 2.2 - style + if "image_embeds" not in added_cond_kwargs or "hint" not in added_cond_kwargs: + raise ValueError( + f"{self.__class__} has the config param `addition_embed_type` set to 'image' which requires the" + " keyword argument `image_embeds` to be passed in `added_cond_kwargs`" + ) + image_embs = added_cond_kwargs.get("image_embeds") + hint = added_cond_kwargs.get("hint") + aug_emb, hint = self.add_embedding(image_embs, hint) + emb = emb + aug_emb + sample = torch.cat([sample, hint], dim=1) if self.time_embed_act is not None: emb = self.time_embed_act(emb) @@ -914,7 +946,15 @@ def forward( image_embeds = added_cond_kwargs.get("image_embeds") encoder_hidden_states = self.encoder_hid_proj(encoder_hidden_states, image_embeds) - + elif self.encoder_hid_proj is not None and self.config.encoder_hid_dim_type == "image_proj": + # Kadinsky 2.2 - style + if "image_embeds" not in added_cond_kwargs: + raise ValueError( + f"{self.__class__} has the config param `encoder_hid_dim_type` set to 'image_proj' which requires" + " the keyword argument `image_embeds` to be passed in `added_conditions`" + ) + image_embeds = added_cond_kwargs.get("image_embeds") + encoder_hidden_states = self.encoder_hid_proj(image_embeds) # 2. 
pre-process sample = self.conv_in(sample) diff --git a/src/diffusers/utils/dummy_torch_and_transformers_objects.py b/src/diffusers/utils/dummy_torch_and_transformers_objects.py index 3f0b17d879e5..6178a30d3970 100644 --- a/src/diffusers/utils/dummy_torch_and_transformers_objects.py +++ b/src/diffusers/utils/dummy_torch_and_transformers_objects.py @@ -227,6 +227,111 @@ def from_pretrained(cls, *args, **kwargs): requires_backends(cls, ["torch", "transformers"]) +class KandinskyV22ControlnetImg2ImgPipeline(metaclass=DummyObject): + _backends = ["torch", "transformers"] + + def __init__(self, *args, **kwargs): + requires_backends(self, ["torch", "transformers"]) + + @classmethod + def from_config(cls, *args, **kwargs): + requires_backends(cls, ["torch", "transformers"]) + + @classmethod + def from_pretrained(cls, *args, **kwargs): + requires_backends(cls, ["torch", "transformers"]) + + +class KandinskyV22ControlnetPipeline(metaclass=DummyObject): + _backends = ["torch", "transformers"] + + def __init__(self, *args, **kwargs): + requires_backends(self, ["torch", "transformers"]) + + @classmethod + def from_config(cls, *args, **kwargs): + requires_backends(cls, ["torch", "transformers"]) + + @classmethod + def from_pretrained(cls, *args, **kwargs): + requires_backends(cls, ["torch", "transformers"]) + + +class KandinskyV22Img2ImgPipeline(metaclass=DummyObject): + _backends = ["torch", "transformers"] + + def __init__(self, *args, **kwargs): + requires_backends(self, ["torch", "transformers"]) + + @classmethod + def from_config(cls, *args, **kwargs): + requires_backends(cls, ["torch", "transformers"]) + + @classmethod + def from_pretrained(cls, *args, **kwargs): + requires_backends(cls, ["torch", "transformers"]) + + +class KandinskyV22InpaintPipeline(metaclass=DummyObject): + _backends = ["torch", "transformers"] + + def __init__(self, *args, **kwargs): + requires_backends(self, ["torch", "transformers"]) + + @classmethod + def from_config(cls, *args, **kwargs): + requires_backends(cls, ["torch", "transformers"]) + + @classmethod + def from_pretrained(cls, *args, **kwargs): + requires_backends(cls, ["torch", "transformers"]) + + +class KandinskyV22Pipeline(metaclass=DummyObject): + _backends = ["torch", "transformers"] + + def __init__(self, *args, **kwargs): + requires_backends(self, ["torch", "transformers"]) + + @classmethod + def from_config(cls, *args, **kwargs): + requires_backends(cls, ["torch", "transformers"]) + + @classmethod + def from_pretrained(cls, *args, **kwargs): + requires_backends(cls, ["torch", "transformers"]) + + +class KandinskyV22PriorEmb2EmbPipeline(metaclass=DummyObject): + _backends = ["torch", "transformers"] + + def __init__(self, *args, **kwargs): + requires_backends(self, ["torch", "transformers"]) + + @classmethod + def from_config(cls, *args, **kwargs): + requires_backends(cls, ["torch", "transformers"]) + + @classmethod + def from_pretrained(cls, *args, **kwargs): + requires_backends(cls, ["torch", "transformers"]) + + +class KandinskyV22PriorPipeline(metaclass=DummyObject): + _backends = ["torch", "transformers"] + + def __init__(self, *args, **kwargs): + requires_backends(self, ["torch", "transformers"]) + + @classmethod + def from_config(cls, *args, **kwargs): + requires_backends(cls, ["torch", "transformers"]) + + @classmethod + def from_pretrained(cls, *args, **kwargs): + requires_backends(cls, ["torch", "transformers"]) + + class LDMTextToImagePipeline(metaclass=DummyObject): _backends = ["torch", "transformers"] From 
374f237359d47e13839884c0955bc05dc6e142b4 Mon Sep 17 00:00:00 2001 From: yiyixuxu Date: Tue, 4 Jul 2023 14:55:25 +0000 Subject: [PATCH 14/51] add controlnet test --- ...ipeline_kandinsky2_2_decoder_controlnet.py | 10 +- ...kandinsky2_2_decoder_controlnet_img2img.py | 7 +- .../test_kandinsky_controlnet.py | 277 ++++++++++++++++++ 3 files changed, 291 insertions(+), 3 deletions(-) create mode 100644 tests/pipelines/kandinsky_v22/test_kandinsky_controlnet.py diff --git a/src/diffusers/pipelines/kandinsky2_2/pipeline_kandinsky2_2_decoder_controlnet.py b/src/diffusers/pipelines/kandinsky2_2/pipeline_kandinsky2_2_decoder_controlnet.py index cbe616b8fb9d..8b0549e8c7d8 100644 --- a/src/diffusers/pipelines/kandinsky2_2/pipeline_kandinsky2_2_decoder_controlnet.py +++ b/src/diffusers/pipelines/kandinsky2_2/pipeline_kandinsky2_2_decoder_controlnet.py @@ -216,16 +216,20 @@ def __call__( if isinstance(image_embeds, list): image_embeds = torch.cat(image_embeds, dim=0) - batch_size = image_embeds.shape[0] if isinstance(negative_image_embeds, list): negative_image_embeds = torch.cat(negative_image_embeds, dim=0) + if isinstance(hint, list): + hint = torch.cat(hint, dim=0) + + batch_size = image_embeds.shape[0] * num_images_per_prompt if do_classifier_free_guidance: image_embeds = image_embeds.repeat_interleave(num_images_per_prompt, dim=0) negative_image_embeds = negative_image_embeds.repeat_interleave(num_images_per_prompt, dim=0) + hint = hint.repeat_interleave(num_images_per_prompt, dim=0) image_embeds = torch.cat([negative_image_embeds, image_embeds], dim=0).to(dtype=self.unet.dtype, device=device) - + hint = torch.cat([hint, hint], dim=0).to(dtype=self.unet.dtype, device=device) self.scheduler.set_timesteps(num_inference_steps, device=device) timesteps_tensor = self.scheduler.timesteps @@ -242,6 +246,8 @@ def __call__( latents, self.scheduler, ) + print(f" h, w: {height}, {width}") + print(f" latents : {latents.shape}") for i, t in enumerate(self.progress_bar(timesteps_tensor)): # expand the latents if we are doing classifier free guidance diff --git a/src/diffusers/pipelines/kandinsky2_2/pipeline_kandinsky2_2_decoder_controlnet_img2img.py b/src/diffusers/pipelines/kandinsky2_2/pipeline_kandinsky2_2_decoder_controlnet_img2img.py index e6d86de6f75f..896076481e49 100644 --- a/src/diffusers/pipelines/kandinsky2_2/pipeline_kandinsky2_2_decoder_controlnet_img2img.py +++ b/src/diffusers/pipelines/kandinsky2_2/pipeline_kandinsky2_2_decoder_controlnet_img2img.py @@ -272,15 +272,20 @@ def __call__( if isinstance(image_embeds, list): image_embeds = torch.cat(image_embeds, dim=0) - batch_size = image_embeds.shape[0] if isinstance(negative_image_embeds, list): negative_image_embeds = torch.cat(negative_image_embeds, dim=0) + if isinstance(hint, list): + hint = torch.cat(hint, dim=0) + batch_size = image_embeds.shape[0] * num_images_per_prompt + if do_classifier_free_guidance: image_embeds = image_embeds.repeat_interleave(num_images_per_prompt, dim=0) negative_image_embeds = negative_image_embeds.repeat_interleave(num_images_per_prompt, dim=0) + hint = hint.repeat_interleave(num_images_per_prompt, dim=0) image_embeds = torch.cat([negative_image_embeds, image_embeds], dim=0).to(dtype=self.unet.dtype, device=device) + hint = torch.cat([hint, hint],dim=0).to(dtype=self.unet.dtype, device=device) if not isinstance(image, list): image = [image] diff --git a/tests/pipelines/kandinsky_v22/test_kandinsky_controlnet.py b/tests/pipelines/kandinsky_v22/test_kandinsky_controlnet.py new file mode 100644 index 
000000000000..8dad924aa81b --- /dev/null +++ b/tests/pipelines/kandinsky_v22/test_kandinsky_controlnet.py @@ -0,0 +1,277 @@ +# coding=utf-8 +# Copyright 2023 HuggingFace Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import gc +import random +import unittest + +import numpy as np +import torch + +from diffusers import DDIMScheduler, KandinskyV22ControlnetPipeline, KandinskyV22PriorPipeline, UNet2DConditionModel, VQModel +from diffusers.utils import floats_tensor, load_numpy, slow, torch_device, load_image +from diffusers.utils.testing_utils import enable_full_determinism, require_torch_gpu + +from transformers import pipeline + +from ..test_pipelines_common import PipelineTesterMixin, assert_mean_pixel_difference + + +enable_full_determinism() + + +class KandinskyV22ControlnetPipelineFastTests(PipelineTesterMixin, unittest.TestCase): + pipeline_class = KandinskyV22ControlnetPipeline + params = [ + "image_embeds", + "negative_image_embeds", + "hint" + ] + batch_params = ["image_embeds", "negative_image_embeds", "hint"] + required_optional_params = [ + "generator", + "height", + "width", + "latents", + "guidance_scale", + "num_inference_steps", + "return_dict", + "guidance_scale", + "num_images_per_prompt", + "output_type", + "return_dict", + ] + test_xformers_attention = False + + @property + def text_embedder_hidden_size(self): + return 32 + + @property + def time_input_dim(self): + return 32 + + @property + def block_out_channels_0(self): + return self.time_input_dim + + @property + def time_embed_dim(self): + return self.time_input_dim * 4 + + @property + def cross_attention_dim(self): + return 100 + + @property + def dummy_unet(self): + torch.manual_seed(0) + + model_kwargs = { + "in_channels": 8, + # Out channels is double in channels because predicts mean and variance + "out_channels": 8, + "addition_embed_type": "image_hint", + "down_block_types": ("ResnetDownsampleBlock2D", "SimpleCrossAttnDownBlock2D"), + "up_block_types": ("SimpleCrossAttnUpBlock2D", "ResnetUpsampleBlock2D"), + "mid_block_type": "UNetMidBlock2DSimpleCrossAttn", + "block_out_channels": (self.block_out_channels_0, self.block_out_channels_0 * 2), + "layers_per_block": 1, + "encoder_hid_dim": self.text_embedder_hidden_size, + "encoder_hid_dim_type": "image_proj", + "cross_attention_dim": self.cross_attention_dim, + "attention_head_dim": 4, + "resnet_time_scale_shift": "scale_shift", + "class_embed_type": None, + } + + model = UNet2DConditionModel(**model_kwargs) + return model + + @property + def dummy_movq_kwargs(self): + return { + "block_out_channels": [32, 32, 64, 64], + "down_block_types": [ + "DownEncoderBlock2D", + "DownEncoderBlock2D", + "DownEncoderBlock2D", + "AttnDownEncoderBlock2D"], + "in_channels": 3, + "latent_channels": 4, + "layers_per_block": 1, + "norm_num_groups": 8, + "norm_type": "spatial", + "num_vq_embeddings": 12, + "out_channels": 3, + "up_block_types": [ + "AttnUpDecoderBlock2D", + "UpDecoderBlock2D", + "UpDecoderBlock2D", + "UpDecoderBlock2D" + ], + "vq_embed_dim": 4, + } + + @property 
+ def dummy_movq(self): + torch.manual_seed(0) + model = VQModel(**self.dummy_movq_kwargs) + return model + + def get_dummy_components(self): + unet = self.dummy_unet + movq = self.dummy_movq + + scheduler = DDIMScheduler( + num_train_timesteps=1000, + beta_schedule="linear", + beta_start=0.00085, + beta_end=0.012, + clip_sample=False, + set_alpha_to_one=False, + steps_offset=1, + prediction_type="epsilon", + thresholding=False, + ) + + components = { + "unet": unet, + "scheduler": scheduler, + "vae": movq, + } + return components + + def get_dummy_inputs(self, device, seed=0): + image_embeds = floats_tensor((1, self.text_embedder_hidden_size), rng=random.Random(seed)).to(device) + negative_image_embeds = floats_tensor((1, self.text_embedder_hidden_size), rng=random.Random(seed + 1)).to( + device + ) + + # create hint + hint = floats_tensor((1, 3, 64, 64), rng=random.Random(seed)).to(device) + + if str(device).startswith("mps"): + generator = torch.manual_seed(seed) + else: + generator = torch.Generator(device=device).manual_seed(seed) + inputs = { + "image_embeds": image_embeds, + "negative_image_embeds": negative_image_embeds, + "hint": hint, + "generator": generator, + "height": 64, + "width": 64, + "guidance_scale": 4.0, + "num_inference_steps": 2, + "output_type": "np", + } + return inputs + + def test_kandinsky_controlnet(self): + device = "cpu" + + components = self.get_dummy_components() + + pipe = self.pipeline_class(**components) + pipe = pipe.to(device) + + pipe.set_progress_bar_config(disable=None) + + output = pipe(**self.get_dummy_inputs(device)) + image = output.images + + image_from_tuple = pipe( + **self.get_dummy_inputs(device), + return_dict=False, + )[0] + + image_slice = image[0, -3:, -3:, -1] + image_from_tuple_slice = image_from_tuple[0, -3:, -3:, -1] + + assert image.shape == (1, 64, 64, 3) + + expected_slice = np.array( + [0.6959826, 0.868279, 0.7558092, 0.68769467, 0.85805804, 0.65977496, 0.44885302, 0.5959111, 0.4251595] + ) + + assert ( + np.abs(image_slice.flatten() - expected_slice).max() < 1e-2 + ), f" expected_slice {expected_slice}, but got {image_slice.flatten()}" + + assert ( + np.abs(image_from_tuple_slice.flatten() - expected_slice).max() < 1e-2 + ), f" expected_slice {expected_slice}, but got {image_from_tuple_slice.flatten()}" + + +@slow +@require_torch_gpu +class KandinskyV22ControlnetPipelineIntegrationTests(unittest.TestCase): + def tearDown(self): + # clean up the VRAM after each test + super().tearDown() + gc.collect() + torch.cuda.empty_cache() + + def test_kandinsky_controlnet(self): + + expected_image = load_numpy( + "https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main" + "/kandinskyv22/kandinskyv22_controlnet_robotcat_fp16.npy" + ) + + hint = load_image( + "https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main" + "/kandinskyv22/hint_image_cat.png" + ) + hint = torch.from_numpy(np.array(hint)).float() / 255.0 + hint = hint.permute(2, 0, 1).unsqueeze(0) + + pipe_prior = KandinskyV22PriorPipeline.from_pretrained( + "kandinsky-community/kandinsky-2-2-prior", torch_dtype=torch.float16 + ) + pipe_prior.to(torch_device) + + pipeline = KandinskyV22ControlnetPipeline.from_pretrained( + "kandinsky-community/kandinsky-2-2-controlnet-depth", torch_dtype=torch.float16 + ) + pipeline = pipeline.to(torch_device) + pipeline.set_progress_bar_config(disable=None) + + prompt = 'A robot, 4k photo' + + generator = torch.Generator(device="cuda").manual_seed(0) + image_emb, zero_image_emb = pipe_prior( + 
prompt, + generator=generator, + num_inference_steps=5, + negative_prompt="", + ).to_tuple() + + generator = torch.Generator(device="cuda").manual_seed(0) + output = pipeline( + image_embeds=image_emb, + negative_image_embeds=zero_image_emb, + hint = hint, + generator=generator, + num_inference_steps=100, + output_type="np", + ) + + image = output.images[0] + + assert image.shape == (512, 512, 3) + + assert_mean_pixel_difference(image, expected_image) From 365fac51cc0d563ad5d8d88e016069df52bb9a71 Mon Sep 17 00:00:00 2001 From: yiyixuxu Date: Tue, 4 Jul 2023 14:59:07 +0000 Subject: [PATCH 15/51] style --- ...kandinsky2_2_decoder_controlnet_img2img.py | 4 +-- .../versatile_diffusion/modeling_text_unet.py | 3 ++ src/diffusers/schedulers/scheduling_unclip.py | 1 - .../test_kandinsky_controlnet.py | 35 ++++++++----------- 4 files changed, 20 insertions(+), 23 deletions(-) diff --git a/src/diffusers/pipelines/kandinsky2_2/pipeline_kandinsky2_2_decoder_controlnet_img2img.py b/src/diffusers/pipelines/kandinsky2_2/pipeline_kandinsky2_2_decoder_controlnet_img2img.py index 896076481e49..50c40fb22567 100644 --- a/src/diffusers/pipelines/kandinsky2_2/pipeline_kandinsky2_2_decoder_controlnet_img2img.py +++ b/src/diffusers/pipelines/kandinsky2_2/pipeline_kandinsky2_2_decoder_controlnet_img2img.py @@ -278,14 +278,14 @@ def __call__( hint = torch.cat(hint, dim=0) batch_size = image_embeds.shape[0] * num_images_per_prompt - + if do_classifier_free_guidance: image_embeds = image_embeds.repeat_interleave(num_images_per_prompt, dim=0) negative_image_embeds = negative_image_embeds.repeat_interleave(num_images_per_prompt, dim=0) hint = hint.repeat_interleave(num_images_per_prompt, dim=0) image_embeds = torch.cat([negative_image_embeds, image_embeds], dim=0).to(dtype=self.unet.dtype, device=device) - hint = torch.cat([hint, hint],dim=0).to(dtype=self.unet.dtype, device=device) + hint = torch.cat([hint, hint], dim=0).to(dtype=self.unet.dtype, device=device) if not isinstance(image, list): image = [image] diff --git a/src/diffusers/pipelines/versatile_diffusion/modeling_text_unet.py b/src/diffusers/pipelines/versatile_diffusion/modeling_text_unet.py index d5005ce09920..87d803e739b0 100644 --- a/src/diffusers/pipelines/versatile_diffusion/modeling_text_unet.py +++ b/src/diffusers/pipelines/versatile_diffusion/modeling_text_unet.py @@ -18,6 +18,9 @@ from ...models.dual_transformer_2d import DualTransformer2DModel from ...models.embeddings import ( GaussianFourierProjection, + ImageHintTimeEmbedding, + ImageProjection, + ImageTimeEmbedding, TextImageProjection, TextImageTimeEmbedding, TextTimeEmbedding, diff --git a/src/diffusers/schedulers/scheduling_unclip.py b/src/diffusers/schedulers/scheduling_unclip.py index 1e362d5a5326..726c05a872e4 100644 --- a/src/diffusers/schedulers/scheduling_unclip.py +++ b/src/diffusers/schedulers/scheduling_unclip.py @@ -308,7 +308,6 @@ def step( return UnCLIPSchedulerOutput(prev_sample=pred_prev_sample, pred_original_sample=pred_original_sample) - def add_noise( self, original_samples: torch.FloatTensor, diff --git a/tests/pipelines/kandinsky_v22/test_kandinsky_controlnet.py b/tests/pipelines/kandinsky_v22/test_kandinsky_controlnet.py index 8dad924aa81b..97e57de34c85 100644 --- a/tests/pipelines/kandinsky_v22/test_kandinsky_controlnet.py +++ b/tests/pipelines/kandinsky_v22/test_kandinsky_controlnet.py @@ -20,12 +20,16 @@ import numpy as np import torch -from diffusers import DDIMScheduler, KandinskyV22ControlnetPipeline, KandinskyV22PriorPipeline, UNet2DConditionModel, VQModel -from 
diffusers.utils import floats_tensor, load_numpy, slow, torch_device, load_image +from diffusers import ( + DDIMScheduler, + KandinskyV22ControlnetPipeline, + KandinskyV22PriorPipeline, + UNet2DConditionModel, + VQModel, +) +from diffusers.utils import floats_tensor, load_image, load_numpy, slow, torch_device from diffusers.utils.testing_utils import enable_full_determinism, require_torch_gpu -from transformers import pipeline - from ..test_pipelines_common import PipelineTesterMixin, assert_mean_pixel_difference @@ -34,11 +38,7 @@ class KandinskyV22ControlnetPipelineFastTests(PipelineTesterMixin, unittest.TestCase): pipeline_class = KandinskyV22ControlnetPipeline - params = [ - "image_embeds", - "negative_image_embeds", - "hint" - ] + params = ["image_embeds", "negative_image_embeds", "hint"] batch_params = ["image_embeds", "negative_image_embeds", "hint"] required_optional_params = [ "generator", @@ -105,10 +105,11 @@ def dummy_movq_kwargs(self): return { "block_out_channels": [32, 32, 64, 64], "down_block_types": [ - "DownEncoderBlock2D", "DownEncoderBlock2D", "DownEncoderBlock2D", - "AttnDownEncoderBlock2D"], + "DownEncoderBlock2D", + "AttnDownEncoderBlock2D", + ], "in_channels": 3, "latent_channels": 4, "layers_per_block": 1, @@ -116,12 +117,7 @@ def dummy_movq_kwargs(self): "norm_type": "spatial", "num_vq_embeddings": 12, "out_channels": 3, - "up_block_types": [ - "AttnUpDecoderBlock2D", - "UpDecoderBlock2D", - "UpDecoderBlock2D", - "UpDecoderBlock2D" - ], + "up_block_types": ["AttnUpDecoderBlock2D", "UpDecoderBlock2D", "UpDecoderBlock2D", "UpDecoderBlock2D"], "vq_embed_dim": 4, } @@ -226,7 +222,6 @@ def tearDown(self): torch.cuda.empty_cache() def test_kandinsky_controlnet(self): - expected_image = load_numpy( "https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main" "/kandinskyv22/kandinskyv22_controlnet_robotcat_fp16.npy" @@ -250,7 +245,7 @@ def test_kandinsky_controlnet(self): pipeline = pipeline.to(torch_device) pipeline.set_progress_bar_config(disable=None) - prompt = 'A robot, 4k photo' + prompt = "A robot, 4k photo" generator = torch.Generator(device="cuda").manual_seed(0) image_emb, zero_image_emb = pipe_prior( @@ -264,7 +259,7 @@ def test_kandinsky_controlnet(self): output = pipeline( image_embeds=image_emb, negative_image_embeds=zero_image_emb, - hint = hint, + hint=hint, generator=generator, num_inference_steps=100, output_type="np", From cec91607d9bc2354ea29517c859500c90e82ad1f Mon Sep 17 00:00:00 2001 From: yiyixuxu Date: Tue, 4 Jul 2023 19:39:49 +0000 Subject: [PATCH 16/51] add a test for controlnet_img2img --- ...ipeline_kandinsky2_2_decoder_controlnet.py | 2 - .../test_kandinsky_controlnet_img2img.py | 301 ++++++++++++++++++ 2 files changed, 301 insertions(+), 2 deletions(-) create mode 100644 tests/pipelines/kandinsky_v22/test_kandinsky_controlnet_img2img.py diff --git a/src/diffusers/pipelines/kandinsky2_2/pipeline_kandinsky2_2_decoder_controlnet.py b/src/diffusers/pipelines/kandinsky2_2/pipeline_kandinsky2_2_decoder_controlnet.py index 8b0549e8c7d8..0c7fcedd1dc7 100644 --- a/src/diffusers/pipelines/kandinsky2_2/pipeline_kandinsky2_2_decoder_controlnet.py +++ b/src/diffusers/pipelines/kandinsky2_2/pipeline_kandinsky2_2_decoder_controlnet.py @@ -246,8 +246,6 @@ def __call__( latents, self.scheduler, ) - print(f" h, w: {height}, {width}") - print(f" latents : {latents.shape}") for i, t in enumerate(self.progress_bar(timesteps_tensor)): # expand the latents if we are doing classifier free guidance diff --git 
a/tests/pipelines/kandinsky_v22/test_kandinsky_controlnet_img2img.py b/tests/pipelines/kandinsky_v22/test_kandinsky_controlnet_img2img.py new file mode 100644 index 000000000000..fb39aa2a552d --- /dev/null +++ b/tests/pipelines/kandinsky_v22/test_kandinsky_controlnet_img2img.py @@ -0,0 +1,301 @@ +# coding=utf-8 +# Copyright 2023 HuggingFace Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import gc +import random +import unittest + +import numpy as np +import torch +from PIL import Image + +from diffusers import ( + DDIMScheduler, + KandinskyV22ControlnetImg2ImgPipeline, + KandinskyV22PriorPipeline, + KandinskyV22PriorEmb2EmbPipeline, + UNet2DConditionModel, + VQModel, +) +from diffusers.utils import floats_tensor, load_image, load_numpy, slow, torch_device +from diffusers.utils.testing_utils import enable_full_determinism, require_torch_gpu + +from ..test_pipelines_common import PipelineTesterMixin, assert_mean_pixel_difference + + +enable_full_determinism() + + +class KandinskyV22ControlnetImg2ImgPipelineFastTests(PipelineTesterMixin, unittest.TestCase): + pipeline_class = KandinskyV22ControlnetImg2ImgPipeline + params = ["image_embeds", "negative_image_embeds", "image", "hint"] + batch_params = [ + "image_embeds", + "negative_image_embeds", + "image", + "hint" + ] + required_optional_params = [ + "generator", + "height", + "width", + "strength", + "guidance_scale", + "num_inference_steps", + "return_dict", + "guidance_scale", + "num_images_per_prompt", + "output_type", + "return_dict", + ] + test_xformers_attention = False + + @property + def text_embedder_hidden_size(self): + return 32 + + @property + def time_input_dim(self): + return 32 + + @property + def block_out_channels_0(self): + return self.time_input_dim + + @property + def time_embed_dim(self): + return self.time_input_dim * 4 + + @property + def cross_attention_dim(self): + return 100 + + @property + def dummy_unet(self): + torch.manual_seed(0) + + model_kwargs = { + "in_channels": 8, + # Out channels is double in channels because predicts mean and variance + "out_channels": 8, + "addition_embed_type": "image_hint", + "down_block_types": ("ResnetDownsampleBlock2D", "SimpleCrossAttnDownBlock2D"), + "up_block_types": ("SimpleCrossAttnUpBlock2D", "ResnetUpsampleBlock2D"), + "mid_block_type": "UNetMidBlock2DSimpleCrossAttn", + "block_out_channels": (self.block_out_channels_0, self.block_out_channels_0 * 2), + "layers_per_block": 1, + "encoder_hid_dim": self.text_embedder_hidden_size, + "encoder_hid_dim_type": "image_proj", + "cross_attention_dim": self.cross_attention_dim, + "attention_head_dim": 4, + "resnet_time_scale_shift": "scale_shift", + "class_embed_type": None, + } + + model = UNet2DConditionModel(**model_kwargs) + return model + + @property + def dummy_movq_kwargs(self): + return { + "block_out_channels": [32, 32, 64, 64], + "down_block_types": [ + "DownEncoderBlock2D", + "DownEncoderBlock2D", + "DownEncoderBlock2D", + "AttnDownEncoderBlock2D", + ], + "in_channels": 3, + "latent_channels": 4, + "layers_per_block": 
1, + "norm_num_groups": 8, + "norm_type": "spatial", + "num_vq_embeddings": 12, + "out_channels": 3, + "up_block_types": ["AttnUpDecoderBlock2D", "UpDecoderBlock2D", "UpDecoderBlock2D", "UpDecoderBlock2D"], + "vq_embed_dim": 4, + } + + @property + def dummy_movq(self): + torch.manual_seed(0) + model = VQModel(**self.dummy_movq_kwargs) + return model + + def get_dummy_components(self): + unet = self.dummy_unet + movq = self.dummy_movq + + ddim_config = { + "num_train_timesteps": 1000, + "beta_schedule": "linear", + "beta_start": 0.00085, + "beta_end": 0.012, + "clip_sample": False, + "set_alpha_to_one": False, + "steps_offset": 0, + "prediction_type": "epsilon", + "thresholding": False, + } + + scheduler = DDIMScheduler(**ddim_config) + + components = { + "unet": unet, + "scheduler": scheduler, + "vae": movq, + } + + return components + + def get_dummy_inputs(self, device, seed=0): + image_embeds = floats_tensor((1, self.text_embedder_hidden_size), rng=random.Random(seed)).to(device) + negative_image_embeds = floats_tensor((1, self.text_embedder_hidden_size), rng=random.Random(seed + 1)).to( + device + ) + # create init_image + image = floats_tensor((1, 3, 64, 64), rng=random.Random(seed)).to(device) + image = image.cpu().permute(0, 2, 3, 1)[0] + init_image = Image.fromarray(np.uint8(image)).convert("RGB").resize((256, 256)) + # create hint + hint = floats_tensor((1, 3, 64, 64), rng=random.Random(seed)).to(device) + + if str(device).startswith("mps"): + generator = torch.manual_seed(seed) + else: + generator = torch.Generator(device=device).manual_seed(seed) + inputs = { + "image": init_image, + "image_embeds": image_embeds, + "negative_image_embeds": negative_image_embeds, + "hint": hint, + "generator": generator, + "height": 64, + "width": 64, + "num_inference_steps": 10, + "guidance_scale": 7.0, + "strength": 0.2, + "output_type": "np", + } + return inputs + + def test_kandinsky_controlnet_img2img(self): + device = "cpu" + + components = self.get_dummy_components() + + pipe = self.pipeline_class(**components) + pipe = pipe.to(device) + + pipe.set_progress_bar_config(disable=None) + + output = pipe(**self.get_dummy_inputs(device)) + image = output.images + + image_from_tuple = pipe( + **self.get_dummy_inputs(device), + return_dict=False, + )[0] + + image_slice = image[0, -3:, -3:, -1] + image_from_tuple_slice = image_from_tuple[0, -3:, -3:, -1] + + assert image.shape == (1, 64, 64, 3) + + expected_slice = np.array( + [0.54985034, 0.55509365, 0.52561504, 0.5570494, 0.5593818, 0.5263979, 0.50285643, 0.5069846, 0.51196736] + ) + assert ( + np.abs(image_slice.flatten() - expected_slice).max() < 1e-2 + ), f" expected_slice {expected_slice}, but got {image_slice.flatten()}" + assert ( + np.abs(image_from_tuple_slice.flatten() - expected_slice).max() < 1e-2 + ), f" expected_slice {expected_slice}, but got {image_from_tuple_slice.flatten()}" + + +@slow +@require_torch_gpu +class KandinskyV22ControlnetImg2ImgPipelineIntegrationTests(unittest.TestCase): + def tearDown(self): + # clean up the VRAM after each test + super().tearDown() + gc.collect() + torch.cuda.empty_cache() + + def test_kandinsky_controlnet_img2img(self): + expected_image = load_numpy( + "https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main" + "/kandinskyv22/kandinskyv22_controlnet_img2img_robotcat_fp16.npy" + ) + + init_image = load_image( + "https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main" "/kandinsky/cat.png" + ) + init_image = init_image.resize((512,512)) + + hint = 
load_image( + "https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main" + "/kandinskyv22/hint_image_cat.png" + ) + hint = torch.from_numpy(np.array(hint)).float() / 255.0 + hint = hint.permute(2, 0, 1).unsqueeze(0) + + prompt = "A robot, 4k photo" + + + pipe_prior = KandinskyV22PriorEmb2EmbPipeline.from_pretrained( + "kandinsky-community/kandinsky-2-2-prior", torch_dtype=torch.float16 + ) + pipe_prior.to(torch_device) + + pipeline = KandinskyV22ControlnetImg2ImgPipeline.from_pretrained( + "kandinsky-community/kandinsky-2-2-decoder", torch_dtype=torch.float16 + ) + pipeline = pipeline.to(torch_device) + + pipeline.set_progress_bar_config(disable=None) + + generator = torch.Generator(device="cpu").manual_seed(0) + + clip_img_emb = pipe_prior.interpolate( + images_and_prompts=[init_image], + weights=[1]).image_embeds + + image_emb, zero_image_emb = pipe_prior( + prompt, + emb=clip_img_emb, + strength=0.85, + generator=generator, + negative_prompt="", + ).to_tuple() + + output = pipeline( + image=init_image, + image_embeds=image_emb, + negative_image_embeds=zero_image_emb, + hint=hint, + generator=generator, + num_inference_steps=100, + height=512, + width=512, + strength=0.5, + output_type="np", + ) + + image = output.images[0] + + assert image.shape == (512, 512, 3) + + assert_mean_pixel_difference(image, expected_image) From a27b520aa4225e7a9a098a808557d1fbdbe8aa68 Mon Sep 17 00:00:00 2001 From: yiyixuxu Date: Tue, 4 Jul 2023 22:07:53 +0000 Subject: [PATCH 17/51] update prior_emb2emb api to accept image_embedding or image --- .../pipeline_kandinsky2_2_prior_emb2emb.py | 57 +++++++++++++------ 1 file changed, 41 insertions(+), 16 deletions(-) diff --git a/src/diffusers/pipelines/kandinsky2_2/pipeline_kandinsky2_2_prior_emb2emb.py b/src/diffusers/pipelines/kandinsky2_2/pipeline_kandinsky2_2_prior_emb2emb.py index 888c97736e0d..17e24016061d 100644 --- a/src/diffusers/pipelines/kandinsky2_2/pipeline_kandinsky2_2_prior_emb2emb.py +++ b/src/diffusers/pipelines/kandinsky2_2/pipeline_kandinsky2_2_prior_emb2emb.py @@ -154,15 +154,7 @@ def interpolate( ).image_embeds.unsqueeze(0) elif isinstance(cond, (PIL.Image.Image, torch.Tensor)): - if isinstance(cond, PIL.Image.Image): - cond = ( - self.image_processor(cond, return_tensors="pt") - .pixel_values[0] - .unsqueeze(0) - .to(dtype=self.image_encoder.dtype, device=device) - ) - - image_emb = self.image_encoder(cond)["image_embeds"].repeat(num_images_per_prompt, 1).unsqueeze(0) + image_emb = self._encode_image(cond, device=device, num_images_per_prompt=num_images_per_prompt).unsqueeze(0) else: raise ValueError( @@ -174,6 +166,26 @@ def interpolate( image_emb = torch.cat(image_embeddings).sum(dim=0) return KandinskyPriorPipelineOutput(image_embeds=image_emb, negative_image_embeds=torch.randn_like(image_emb)) + + def _encode_image( + self, + image: Union[torch.Tensor, List[PIL.Image.Image]], + device, + num_images_per_prompt, + ): + + if not isinstance(image, torch.Tensor): + image = ( + self.image_processor(image, return_tensors="pt") + .pixel_values + .to(dtype=self.image_encoder.dtype, device=device) + ) + + image_emb = self.image_encoder(image)["image_embeds"] # B, D + image_emb = image_emb.repeat_interleave(num_images_per_prompt, dim=0) + image_emb.to(device=device) + + return image_emb def prepare_latents(self, emb, timestep, batch_size, num_images_per_prompt, dtype, device, generator=None): emb = emb.to(device=device, dtype=dtype) @@ -196,10 +208,7 @@ def prepare_latents(self, emb, timestep, batch_size, 
num_images_per_prompt, dtyp noise = randn_tensor(shape, generator=generator, device=device, dtype=dtype) # get latents - try: - init_latents = self.scheduler.add_noise(init_latents, noise, timestep) - except: - print("error") + init_latents = self.scheduler.add_noise(init_latents, noise, timestep) latents = init_latents return latents @@ -353,7 +362,7 @@ def _encode_prompt( def __call__( self, prompt: Union[str, List[str]], - emb: torch.FloatTensor, + image: Union[torch.Tensor, List[torch.Tensor], PIL.Image.Image, List[PIL.Image.Image]], strength: float = 0.3, negative_prompt: Optional[Union[str, List[str]]] = None, num_images_per_prompt: int = 1, @@ -431,10 +440,26 @@ def __call__( prompt_embeds, text_encoder_hidden_states, text_mask = self._encode_prompt( prompt, device, num_images_per_prompt, do_classifier_free_guidance, negative_prompt ) + + if not isinstance(image, List): + image = [image] + + if isinstance(image[0], torch.Tensor): + image = torch.cat(image, dim=0) + + if isinstance(image, torch.Tensor) and image.ndim == 2: + # allow user to pass image_embeds directly + image_embeds = image.repeat_interleave(num_images_per_prompt, dim=0) + elif isinstance(image, torch.Tensor) and image.ndim != 4: + raise ValueError(f" if pass `image` as pytorch tensor, or a list of pytorch tensor, please make sure each tensor has shape [batch_size, channels, height, width], currently {image[0].unsqueeze(0).shape}") + else: + image_embeds = self._encode_image(image,device,num_images_per_prompt) + # prior self.scheduler.set_timesteps(num_inference_steps, device=device) - latents = emb.repeat_interleave(num_images_per_prompt, dim=0) + + latents = image_embeds timesteps, num_inference_steps = self.get_timesteps(num_inference_steps, strength, device) latent_timestep = timesteps[:1].repeat(batch_size * num_images_per_prompt) latents = self.prepare_latents( @@ -446,7 +471,7 @@ def __call__( device, generator, ) - print(latents.shape, prompt_embeds.shape) + for i, t in enumerate(self.progress_bar(timesteps)): # expand the latents if we are doing classifier free guidance latent_model_input = torch.cat([latents] * 2) if do_classifier_free_guidance else latents From b4189a130db44c2329761008123cd1018af478ad Mon Sep 17 00:00:00 2001 From: yiyixuxu Date: Tue, 4 Jul 2023 23:07:24 +0000 Subject: [PATCH 18/51] add a test for prior_emb2emb --- .../pipeline_kandinsky2_2_prior_emb2emb.py | 2 +- .../test_kandinsky_prior_emb2emb.py | 247 ++++++++++++++++++ 2 files changed, 248 insertions(+), 1 deletion(-) create mode 100644 tests/pipelines/kandinsky_v22/test_kandinsky_prior_emb2emb.py diff --git a/src/diffusers/pipelines/kandinsky2_2/pipeline_kandinsky2_2_prior_emb2emb.py b/src/diffusers/pipelines/kandinsky2_2/pipeline_kandinsky2_2_prior_emb2emb.py index 17e24016061d..163bd6a6088e 100644 --- a/src/diffusers/pipelines/kandinsky2_2/pipeline_kandinsky2_2_prior_emb2emb.py +++ b/src/diffusers/pipelines/kandinsky2_2/pipeline_kandinsky2_2_prior_emb2emb.py @@ -461,7 +461,7 @@ def __call__( latents = image_embeds timesteps, num_inference_steps = self.get_timesteps(num_inference_steps, strength, device) - latent_timestep = timesteps[:1].repeat(batch_size * num_images_per_prompt) + latent_timestep = timesteps[:1].repeat(batch_size) latents = self.prepare_latents( latents, latent_timestep, diff --git a/tests/pipelines/kandinsky_v22/test_kandinsky_prior_emb2emb.py b/tests/pipelines/kandinsky_v22/test_kandinsky_prior_emb2emb.py new file mode 100644 index 000000000000..c6b9028ddfcf --- /dev/null +++ 
b/tests/pipelines/kandinsky_v22/test_kandinsky_prior_emb2emb.py @@ -0,0 +1,247 @@ +# coding=utf-8 +# Copyright 2023 HuggingFace Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import unittest +import random +from PIL import Image + +import numpy as np +import torch +from torch import nn +from transformers import ( + CLIPImageProcessor, + CLIPTextConfig, + CLIPTextModelWithProjection, + CLIPTokenizer, + CLIPVisionConfig, + CLIPVisionModelWithProjection, +) + +from diffusers import KandinskyV22PriorEmb2EmbPipeline, PriorTransformer, UnCLIPScheduler +from diffusers.utils import torch_device, floats_tensor +from diffusers.utils.testing_utils import enable_full_determinism, skip_mps + +from ..test_pipelines_common import PipelineTesterMixin + + +enable_full_determinism() + + +class KandinskyV22PriorEmb2EmbPipelineFastTests(PipelineTesterMixin, unittest.TestCase): + pipeline_class = KandinskyV22PriorEmb2EmbPipeline + params = ["prompt", "image"] + batch_params = ["prompt", "image"] + required_optional_params = [ + "num_images_per_prompt", + "strength", + "generator", + "num_inference_steps", + "latents", + "negative_prompt", + "guidance_scale", + "output_type", + "return_dict", + ] + test_xformers_attention = False + + @property + def text_embedder_hidden_size(self): + return 32 + + @property + def time_input_dim(self): + return 32 + + @property + def block_out_channels_0(self): + return self.time_input_dim + + @property + def time_embed_dim(self): + return self.time_input_dim * 4 + + @property + def cross_attention_dim(self): + return 100 + + @property + def dummy_tokenizer(self): + tokenizer = CLIPTokenizer.from_pretrained("hf-internal-testing/tiny-random-clip") + return tokenizer + + @property + def dummy_text_encoder(self): + torch.manual_seed(0) + config = CLIPTextConfig( + bos_token_id=0, + eos_token_id=2, + hidden_size=self.text_embedder_hidden_size, + projection_dim=self.text_embedder_hidden_size, + intermediate_size=37, + layer_norm_eps=1e-05, + num_attention_heads=4, + num_hidden_layers=5, + pad_token_id=1, + vocab_size=1000, + ) + return CLIPTextModelWithProjection(config) + + @property + def dummy_prior(self): + torch.manual_seed(0) + + model_kwargs = { + "num_attention_heads": 2, + "attention_head_dim": 12, + "embedding_dim": self.text_embedder_hidden_size, + "num_layers": 1, + } + + model = PriorTransformer(**model_kwargs) + # clip_std and clip_mean is initialized to be 0 so PriorTransformer.post_process_latents will always return 0 - set clip_std to be 1 so it won't return 0 + model.clip_std = nn.Parameter(torch.ones(model.clip_std.shape)) + return model + + @property + def dummy_image_encoder(self): + torch.manual_seed(0) + config = CLIPVisionConfig( + hidden_size=self.text_embedder_hidden_size, + image_size=224, + projection_dim=self.text_embedder_hidden_size, + intermediate_size=37, + num_attention_heads=4, + num_channels=3, + num_hidden_layers=5, + patch_size=14, + ) + + model = CLIPVisionModelWithProjection(config) + return model + + @property + def 
dummy_image_processor(self): + image_processor = CLIPImageProcessor( + crop_size=224, + do_center_crop=True, + do_normalize=True, + do_resize=True, + image_mean=[0.48145466, 0.4578275, 0.40821073], + image_std=[0.26862954, 0.26130258, 0.27577711], + resample=3, + size=224, + ) + + return image_processor + + def get_dummy_components(self): + prior = self.dummy_prior + image_encoder = self.dummy_image_encoder + text_encoder = self.dummy_text_encoder + tokenizer = self.dummy_tokenizer + image_processor = self.dummy_image_processor + + scheduler = UnCLIPScheduler( + variance_type="fixed_small_log", + prediction_type="sample", + num_train_timesteps=1000, + clip_sample=True, + clip_sample_range=10.0, + ) + + components = { + "prior": prior, + "image_encoder": image_encoder, + "text_encoder": text_encoder, + "tokenizer": tokenizer, + "scheduler": scheduler, + "image_processor": image_processor, + } + + return components + + def get_dummy_inputs(self, device, seed=0): + if str(device).startswith("mps"): + generator = torch.manual_seed(seed) + else: + generator = torch.Generator(device=device).manual_seed(seed) + + image = floats_tensor((1, 3, 64, 64), rng=random.Random(seed)).to(device) + image = image.cpu().permute(0, 2, 3, 1)[0] + init_image = Image.fromarray(np.uint8(image)).convert("RGB").resize((256, 256)) + + inputs = { + "prompt": "horse", + "image": init_image, + "strength": 0.5, + "generator": generator, + "guidance_scale": 4.0, + "num_inference_steps": 2, + "output_type": "np", + } + return inputs + + def test_kandinsky_prior_emb2emb(self): + device = "cpu" + + components = self.get_dummy_components() + + pipe = self.pipeline_class(**components) + pipe = pipe.to(device) + + pipe.set_progress_bar_config(disable=None) + + output = pipe(**self.get_dummy_inputs(device)) + image = output.image_embeds + + image_from_tuple = pipe( + **self.get_dummy_inputs(device), + return_dict=False, + )[0] + + image_slice = image[0, -10:] + image_from_tuple_slice = image_from_tuple[0, -10:] + + assert image.shape == (1, 32) + + expected_slice = np.array( + [0.1071284, 1.3330271, 0.61260223, -0.6691065, -0.3846852, -1.0303661, 0.22716111, 0.03348901, 0.30040675, -0.24805029] + ) + + + assert np.abs(image_slice.flatten() - expected_slice).max() < 1e-2 + assert np.abs(image_from_tuple_slice.flatten() - expected_slice).max() < 1e-2 + + @skip_mps + def test_inference_batch_single_identical(self): + test_max_difference = torch_device == "cpu" + relax_max_difference = True + test_mean_pixel_difference = False + + self._test_inference_batch_single_identical( + test_max_difference=test_max_difference, + relax_max_difference=relax_max_difference, + test_mean_pixel_difference=test_mean_pixel_difference, + ) + + @skip_mps + def test_attention_slicing_forward_pass(self): + test_max_difference = torch_device == "cpu" + test_mean_pixel_difference = False + + self._test_attention_slicing_forward_pass( + test_max_difference=test_max_difference, + test_mean_pixel_difference=test_mean_pixel_difference, + ) From 4a5c6ac14de8100e3a4f2c9cabc265969c878cd2 Mon Sep 17 00:00:00 2001 From: yiyixuxu Date: Tue, 4 Jul 2023 23:28:22 +0000 Subject: [PATCH 19/51] style --- .../pipeline_kandinsky2_2_prior_emb2emb.py | 34 +++++++++---------- .../test_kandinsky_controlnet_img2img.py | 21 +++--------- .../test_kandinsky_prior_emb2emb.py | 24 +++++++++---- 3 files changed, 39 insertions(+), 40 deletions(-) diff --git a/src/diffusers/pipelines/kandinsky2_2/pipeline_kandinsky2_2_prior_emb2emb.py 
b/src/diffusers/pipelines/kandinsky2_2/pipeline_kandinsky2_2_prior_emb2emb.py index 163bd6a6088e..921b409f82af 100644 --- a/src/diffusers/pipelines/kandinsky2_2/pipeline_kandinsky2_2_prior_emb2emb.py +++ b/src/diffusers/pipelines/kandinsky2_2/pipeline_kandinsky2_2_prior_emb2emb.py @@ -154,7 +154,9 @@ def interpolate( ).image_embeds.unsqueeze(0) elif isinstance(cond, (PIL.Image.Image, torch.Tensor)): - image_emb = self._encode_image(cond, device=device, num_images_per_prompt=num_images_per_prompt).unsqueeze(0) + image_emb = self._encode_image( + cond, device=device, num_images_per_prompt=num_images_per_prompt + ).unsqueeze(0) else: raise ValueError( @@ -166,22 +168,19 @@ def interpolate( image_emb = torch.cat(image_embeddings).sum(dim=0) return KandinskyPriorPipelineOutput(image_embeds=image_emb, negative_image_embeds=torch.randn_like(image_emb)) - + def _encode_image( - self, + self, image: Union[torch.Tensor, List[PIL.Image.Image]], device, num_images_per_prompt, - ): - + ): if not isinstance(image, torch.Tensor): - image = ( - self.image_processor(image, return_tensors="pt") - .pixel_values - .to(dtype=self.image_encoder.dtype, device=device) + image = self.image_processor(image, return_tensors="pt").pixel_values.to( + dtype=self.image_encoder.dtype, device=device ) - image_emb = self.image_encoder(image)["image_embeds"] # B, D + image_emb = self.image_encoder(image)["image_embeds"] # B, D image_emb = image_emb.repeat_interleave(num_images_per_prompt, dim=0) image_emb.to(device=device) @@ -440,25 +439,26 @@ def __call__( prompt_embeds, text_encoder_hidden_states, text_mask = self._encode_prompt( prompt, device, num_images_per_prompt, do_classifier_free_guidance, negative_prompt ) - + if not isinstance(image, List): image = [image] - + if isinstance(image[0], torch.Tensor): image = torch.cat(image, dim=0) - + if isinstance(image, torch.Tensor) and image.ndim == 2: # allow user to pass image_embeds directly image_embeds = image.repeat_interleave(num_images_per_prompt, dim=0) elif isinstance(image, torch.Tensor) and image.ndim != 4: - raise ValueError(f" if pass `image` as pytorch tensor, or a list of pytorch tensor, please make sure each tensor has shape [batch_size, channels, height, width], currently {image[0].unsqueeze(0).shape}") + raise ValueError( + f" if pass `image` as pytorch tensor, or a list of pytorch tensor, please make sure each tensor has shape [batch_size, channels, height, width], currently {image[0].unsqueeze(0).shape}" + ) else: - image_embeds = self._encode_image(image,device,num_images_per_prompt) - + image_embeds = self._encode_image(image, device, num_images_per_prompt) # prior self.scheduler.set_timesteps(num_inference_steps, device=device) - + latents = image_embeds timesteps, num_inference_steps = self.get_timesteps(num_inference_steps, strength, device) latent_timestep = timesteps[:1].repeat(batch_size) diff --git a/tests/pipelines/kandinsky_v22/test_kandinsky_controlnet_img2img.py b/tests/pipelines/kandinsky_v22/test_kandinsky_controlnet_img2img.py index fb39aa2a552d..3b717c68a3d0 100644 --- a/tests/pipelines/kandinsky_v22/test_kandinsky_controlnet_img2img.py +++ b/tests/pipelines/kandinsky_v22/test_kandinsky_controlnet_img2img.py @@ -24,7 +24,6 @@ from diffusers import ( DDIMScheduler, KandinskyV22ControlnetImg2ImgPipeline, - KandinskyV22PriorPipeline, KandinskyV22PriorEmb2EmbPipeline, UNet2DConditionModel, VQModel, @@ -41,12 +40,7 @@ class KandinskyV22ControlnetImg2ImgPipelineFastTests(PipelineTesterMixin, unittest.TestCase): pipeline_class = 
KandinskyV22ControlnetImg2ImgPipeline params = ["image_embeds", "negative_image_embeds", "image", "hint"] - batch_params = [ - "image_embeds", - "negative_image_embeds", - "image", - "hint" - ] + batch_params = ["image_embeds", "negative_image_embeds", "image", "hint"] required_optional_params = [ "generator", "height", @@ -215,7 +209,7 @@ def test_kandinsky_controlnet_img2img(self): assert image.shape == (1, 64, 64, 3) expected_slice = np.array( - [0.54985034, 0.55509365, 0.52561504, 0.5570494, 0.5593818, 0.5263979, 0.50285643, 0.5069846, 0.51196736] + [0.54985034, 0.55509365, 0.52561504, 0.5570494, 0.5593818, 0.5263979, 0.50285643, 0.5069846, 0.51196736] ) assert ( np.abs(image_slice.flatten() - expected_slice).max() < 1e-2 @@ -243,8 +237,8 @@ def test_kandinsky_controlnet_img2img(self): init_image = load_image( "https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main" "/kandinsky/cat.png" ) - init_image = init_image.resize((512,512)) - + init_image = init_image.resize((512, 512)) + hint = load_image( "https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main" "/kandinskyv22/hint_image_cat.png" @@ -253,7 +247,6 @@ def test_kandinsky_controlnet_img2img(self): hint = hint.permute(2, 0, 1).unsqueeze(0) prompt = "A robot, 4k photo" - pipe_prior = KandinskyV22PriorEmb2EmbPipeline.from_pretrained( "kandinsky-community/kandinsky-2-2-prior", torch_dtype=torch.float16 @@ -268,14 +261,10 @@ def test_kandinsky_controlnet_img2img(self): pipeline.set_progress_bar_config(disable=None) generator = torch.Generator(device="cpu").manual_seed(0) - - clip_img_emb = pipe_prior.interpolate( - images_and_prompts=[init_image], - weights=[1]).image_embeds image_emb, zero_image_emb = pipe_prior( prompt, - emb=clip_img_emb, + image=init_image, strength=0.85, generator=generator, negative_prompt="", diff --git a/tests/pipelines/kandinsky_v22/test_kandinsky_prior_emb2emb.py b/tests/pipelines/kandinsky_v22/test_kandinsky_prior_emb2emb.py index c6b9028ddfcf..8e8caec181a1 100644 --- a/tests/pipelines/kandinsky_v22/test_kandinsky_prior_emb2emb.py +++ b/tests/pipelines/kandinsky_v22/test_kandinsky_prior_emb2emb.py @@ -13,12 +13,12 @@ # See the License for the specific language governing permissions and # limitations under the License. 
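# --- Editor's note (not part of the patch): a minimal usage sketch of the API change made in
# PATCH 17: `KandinskyV22PriorEmb2EmbPipeline.__call__` now takes `image` instead of `emb`, and
# `image` may be a PIL image (encoded with the pipeline's own CLIP image encoder) or a
# precomputed CLIP image embedding passed as a 2-D tensor. The prompt, strength value and
# device placement below are only illustrative.
import torch
from diffusers import KandinskyV22PriorEmb2EmbPipeline
from diffusers.utils import load_image

pipe_prior = KandinskyV22PriorEmb2EmbPipeline.from_pretrained(
    "kandinsky-community/kandinsky-2-2-prior", torch_dtype=torch.float16
).to("cuda")

init_image = load_image(
    "https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main/kandinsky/cat.png"
)

# 1) pass a PIL image; the pipeline encodes it with its CLIP image encoder
image_emb, zero_image_emb = pipe_prior(
    "A robot, 4k photo", image=init_image, strength=0.85, negative_prompt=""
).to_tuple()

# 2) or pass a precomputed CLIP image embedding directly (a 2-D tensor of shape [batch, dim])
clip_img_emb = pipe_prior.interpolate(images_and_prompts=[init_image], weights=[1]).image_embeds
image_emb, zero_image_emb = pipe_prior(
    "A robot, 4k photo", image=clip_img_emb, strength=0.85, negative_prompt=""
).to_tuple()
# --- end of editor's note ---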
-import unittest import random -from PIL import Image +import unittest import numpy as np import torch +from PIL import Image from torch import nn from transformers import ( CLIPImageProcessor, @@ -30,7 +30,7 @@ ) from diffusers import KandinskyV22PriorEmb2EmbPipeline, PriorTransformer, UnCLIPScheduler -from diffusers.utils import torch_device, floats_tensor +from diffusers.utils import floats_tensor, torch_device from diffusers.utils.testing_utils import enable_full_determinism, skip_mps from ..test_pipelines_common import PipelineTesterMixin @@ -177,11 +177,11 @@ def get_dummy_inputs(self, device, seed=0): generator = torch.manual_seed(seed) else: generator = torch.Generator(device=device).manual_seed(seed) - + image = floats_tensor((1, 3, 64, 64), rng=random.Random(seed)).to(device) image = image.cpu().permute(0, 2, 3, 1)[0] init_image = Image.fromarray(np.uint8(image)).convert("RGB").resize((256, 256)) - + inputs = { "prompt": "horse", "image": init_image, @@ -217,10 +217,20 @@ def test_kandinsky_prior_emb2emb(self): assert image.shape == (1, 32) expected_slice = np.array( - [0.1071284, 1.3330271, 0.61260223, -0.6691065, -0.3846852, -1.0303661, 0.22716111, 0.03348901, 0.30040675, -0.24805029] + [ + 0.1071284, + 1.3330271, + 0.61260223, + -0.6691065, + -0.3846852, + -1.0303661, + 0.22716111, + 0.03348901, + 0.30040675, + -0.24805029, + ] ) - assert np.abs(image_slice.flatten() - expected_slice).max() < 1e-2 assert np.abs(image_from_tuple_slice.flatten() - expected_slice).max() < 1e-2 From 8fc24e6d047ff622edc5771d53c309c3d73d454f Mon Sep 17 00:00:00 2001 From: yiyixuxu Date: Tue, 4 Jul 2023 23:33:52 +0000 Subject: [PATCH 20/51] remove try except --- .../pipeline_kandinsky2_2_decoder.py | 12 +++--- ...ipeline_kandinsky2_2_decoder_controlnet.py | 11 +++--- ...kandinsky2_2_decoder_controlnet_img2img.py | 37 +++++++------------ .../pipeline_kandinsky2_2_decoder_img2img.py | 37 +++++++------------ ...ipeline_kandinsky2_2_decoder_inpainting.py | 19 +++++----- .../pipeline_kandinsky2_2_prior.py | 24 ++++++------ .../pipeline_kandinsky2_2_prior_emb2emb.py | 18 ++++----- 7 files changed, 69 insertions(+), 89 deletions(-) diff --git a/src/diffusers/pipelines/kandinsky2_2/pipeline_kandinsky2_2_decoder.py b/src/diffusers/pipelines/kandinsky2_2/pipeline_kandinsky2_2_decoder.py index d7d50b76d9c7..362507562198 100644 --- a/src/diffusers/pipelines/kandinsky2_2/pipeline_kandinsky2_2_decoder.py +++ b/src/diffusers/pipelines/kandinsky2_2/pipeline_kandinsky2_2_decoder.py @@ -36,6 +36,7 @@ ```py >>> from diffusers import KandinskyV22Pipeline, KandinskyV22PriorPipeline >>> import torch + >>> pipe_prior = KandinskyV22PriorPipeline.from_pretrained("kandinsky-community/kandinsky-2-2-prior") >>> pipe_prior.to("cuda") >>> prompt = "red cat, 4k photo" @@ -68,10 +69,10 @@ def get_new_h_w(h, w, scale_factor=8): class KandinskyV22Pipeline(DiffusionPipeline): """ - Pipeline for text-to-image generation using Kandinsky - This model inherits from [`DiffusionPipeline`]. Check the superclass documentation for the generic methods the - library implements for all the pipelines (such as downloading or saving, running on a particular device, etc.) Args: + Pipeline for text-to-image generation using Kandinsky This model inherits from [`DiffusionPipeline`]. Check the + superclass documentation for the generic methods the library implements for all the pipelines (such as downloading + or saving, running on a particular device, etc.) 
scheduler ([`DDPMScheduler`]): A scheduler to be used in combination with `unet` to generate image latents. unet ([`UNet2DConditionModel`]): @@ -191,8 +192,8 @@ def __call__( return_dict: bool = True, ): """ - Function invoked when calling the pipeline for generation. Args: + Function invoked when calling the pipeline for generation. image_embeds (`torch.FloatTensor` or `List[torch.FloatTensor]`): The clip image embeddings for text prompt, that will be used to condition the image generation. negative_image_embeds (`torch.FloatTensor` or `List[torch.FloatTensor]`): @@ -224,8 +225,7 @@ def __call__( (`np.array`) or `"pt"` (`torch.Tensor`). return_dict (`bool`, *optional*, defaults to `True`): Whether or not to return a [`~pipelines.ImagePipelineOutput`] instead of a plain tuple. - Examples: - Returns: + Examples: Returns: [`~pipelines.ImagePipelineOutput`] or `tuple` """ device = self._execution_device diff --git a/src/diffusers/pipelines/kandinsky2_2/pipeline_kandinsky2_2_decoder_controlnet.py b/src/diffusers/pipelines/kandinsky2_2/pipeline_kandinsky2_2_decoder_controlnet.py index 0c7fcedd1dc7..397b8fff739a 100644 --- a/src/diffusers/pipelines/kandinsky2_2/pipeline_kandinsky2_2_decoder_controlnet.py +++ b/src/diffusers/pipelines/kandinsky2_2/pipeline_kandinsky2_2_decoder_controlnet.py @@ -43,10 +43,10 @@ def get_new_h_w(h, w, scale_factor=8): class KandinskyV22ControlnetPipeline(DiffusionPipeline): """ - Pipeline for text-to-image generation using Kandinsky - This model inherits from [`DiffusionPipeline`]. Check the superclass documentation for the generic methods the - library implements for all the pipelines (such as downloading or saving, running on a particular device, etc.) Args: + Pipeline for text-to-image generation using Kandinsky This model inherits from [`DiffusionPipeline`]. Check the + superclass documentation for the generic methods the library implements for all the pipelines (such as downloading + or saving, running on a particular device, etc.) scheduler ([`DDIMScheduler`]): A scheduler to be used in combination with `unet` to generate image latents. unet ([`UNet2DConditionModel`]): @@ -166,8 +166,8 @@ def __call__( return_dict: bool = True, ): """ - Function invoked when calling the pipeline for generation. Args: + Function invoked when calling the pipeline for generation. prompt (`str` or `List[str]`): The prompt or prompts to guide the image generation. hint (`torch.FloatTensor`): @@ -206,8 +206,7 @@ def __call__( (`np.array`) or `"pt"` (`torch.Tensor`). return_dict (`bool`, *optional*, defaults to `True`): Whether or not to return a [`~pipelines.ImagePipelineOutput`] instead of a plain tuple. - Examples: - Returns: + Examples: Returns: [`~pipelines.ImagePipelineOutput`] or `tuple` """ device = self._execution_device diff --git a/src/diffusers/pipelines/kandinsky2_2/pipeline_kandinsky2_2_decoder_controlnet_img2img.py b/src/diffusers/pipelines/kandinsky2_2/pipeline_kandinsky2_2_decoder_controlnet_img2img.py index 50c40fb22567..f66d3aa0a4c8 100644 --- a/src/diffusers/pipelines/kandinsky2_2/pipeline_kandinsky2_2_decoder_controlnet_img2img.py +++ b/src/diffusers/pipelines/kandinsky2_2/pipeline_kandinsky2_2_decoder_controlnet_img2img.py @@ -55,10 +55,10 @@ def prepare_image(pil_image, w=512, h=512): class KandinskyV22ControlnetImg2ImgPipeline(DiffusionPipeline): """ - Pipeline for text-to-image generation using Kandinsky - This model inherits from [`DiffusionPipeline`]. 
Check the superclass documentation for the generic methods the - library implements for all the pipelines (such as downloading or saving, running on a particular device, etc.) Args: + Pipeline for text-to-image generation using Kandinsky This model inherits from [`DiffusionPipeline`]. Check the + superclass documentation for the generic methods the library implements for all the pipelines (such as downloading + or saving, running on a particular device, etc.) scheduler ([`DDIMScheduler`]): A scheduler to be used in combination with `unet` to generate image latents. unet ([`UNet2DConditionModel`]): @@ -127,10 +127,7 @@ def prepare_latents(self, image, timestep, batch_size, num_images_per_prompt, dt noise = randn_tensor(shape, generator=generator, device=device, dtype=dtype) # get latents - try: - init_latents = self.scheduler.add_noise(init_latents, noise, timestep) - except: - pass + init_latents = self.scheduler.add_noise(init_latents, noise, timestep) latents = init_latents return latents @@ -221,8 +218,8 @@ def __call__( return_dict: bool = True, ): """ - Function invoked when calling the pipeline for generation. Args: + Function invoked when calling the pipeline for generation. image_embeds (`torch.FloatTensor` or `List[torch.FloatTensor]`): The clip image embeddings for text prompt, that will be used to condition the image generation. image (`torch.FloatTensor`, `PIL.Image.Image`, `np.ndarray`, `List[torch.FloatTensor]`, `List[PIL.Image.Image]`, or `List[np.ndarray]`): @@ -262,8 +259,7 @@ def __call__( (`np.array`) or `"pt"` (`torch.Tensor`). return_dict (`bool`, *optional*, defaults to `True`): Whether or not to return a [`~pipelines.ImagePipelineOutput`] instead of a plain tuple. - Examples: - Returns: + Examples: Returns: [`~pipelines.ImagePipelineOutput`] or `tuple` """ device = self._execution_device @@ -333,19 +329,14 @@ def __call__( noise_pred, _ = noise_pred.split(latents.shape[1], dim=1) # compute the previous noisy sample x_t -> x_t-1 - try: - latents = self.scheduler.step( - noise_pred, - t, - latents, - generator=generator, - )[0] - except: - latents = self.scheduler.step( - noise_pred, - t, - latents, - )[0] + + latents = self.scheduler.step( + noise_pred, + t, + latents, + generator=generator, + )[0] + # post-processing image = self.vae.decode(latents, force_not_quantize=True)["sample"] diff --git a/src/diffusers/pipelines/kandinsky2_2/pipeline_kandinsky2_2_decoder_img2img.py b/src/diffusers/pipelines/kandinsky2_2/pipeline_kandinsky2_2_decoder_img2img.py index 85d1c921c262..fa289481cfe7 100644 --- a/src/diffusers/pipelines/kandinsky2_2/pipeline_kandinsky2_2_decoder_img2img.py +++ b/src/diffusers/pipelines/kandinsky2_2/pipeline_kandinsky2_2_decoder_img2img.py @@ -55,10 +55,10 @@ def prepare_image(pil_image, w=512, h=512): class KandinskyV22Img2ImgPipeline(DiffusionPipeline): """ - Pipeline for text-to-image generation using Kandinsky - This model inherits from [`DiffusionPipeline`]. Check the superclass documentation for the generic methods the - library implements for all the pipelines (such as downloading or saving, running on a particular device, etc.) Args: + Pipeline for text-to-image generation using Kandinsky This model inherits from [`DiffusionPipeline`]. Check the + superclass documentation for the generic methods the library implements for all the pipelines (such as downloading + or saving, running on a particular device, etc.) scheduler ([`DDIMScheduler`]): A scheduler to be used in combination with `unet` to generate image latents. 
unet ([`UNet2DConditionModel`]): @@ -127,10 +127,8 @@ def prepare_latents(self, image, timestep, batch_size, num_images_per_prompt, dt noise = randn_tensor(shape, generator=generator, device=device, dtype=dtype) # get latents - try: - init_latents = self.scheduler.add_noise(init_latents, noise, timestep) - except: - pass + init_latents = self.scheduler.add_noise(init_latents, noise, timestep) + latents = init_latents return latents @@ -220,8 +218,8 @@ def __call__( return_dict: bool = True, ): """ - Function invoked when calling the pipeline for generation. Args: + Function invoked when calling the pipeline for generation. image_embeds (`torch.FloatTensor` or `List[torch.FloatTensor]`): The clip image embeddings for text prompt, that will be used to condition the image generation. image (`torch.FloatTensor`, `PIL.Image.Image`, `np.ndarray`, `List[torch.FloatTensor]`, `List[PIL.Image.Image]`, or `List[np.ndarray]`): @@ -259,8 +257,7 @@ def __call__( (`np.array`) or `"pt"` (`torch.Tensor`). return_dict (`bool`, *optional*, defaults to `True`): Whether or not to return a [`~pipelines.ImagePipelineOutput`] instead of a plain tuple. - Examples: - Returns: + Examples: Returns: [`~pipelines.ImagePipelineOutput`] or `tuple` """ device = self._execution_device @@ -325,19 +322,13 @@ def __call__( noise_pred, _ = noise_pred.split(latents.shape[1], dim=1) # compute the previous noisy sample x_t -> x_t-1 - try: - latents = self.scheduler.step( - noise_pred, - t, - latents, - generator=generator, - )[0] - except: - latents = self.scheduler.step( - noise_pred, - t, - latents, - )[0] + latents = self.scheduler.step( + noise_pred, + t, + latents, + generator=generator, + )[0] + # post-processing image = self.vae.decode(latents, force_not_quantize=True)["sample"] diff --git a/src/diffusers/pipelines/kandinsky2_2/pipeline_kandinsky2_2_decoder_inpainting.py b/src/diffusers/pipelines/kandinsky2_2/pipeline_kandinsky2_2_decoder_inpainting.py index 25d8154d9b7e..5f091455131a 100644 --- a/src/diffusers/pipelines/kandinsky2_2/pipeline_kandinsky2_2_decoder_inpainting.py +++ b/src/diffusers/pipelines/kandinsky2_2/pipeline_kandinsky2_2_decoder_inpainting.py @@ -188,10 +188,10 @@ def prepare_mask_and_masked_image(image, mask, height, width): class KandinskyV22InpaintPipeline(DiffusionPipeline): """ - Pipeline for text-to-image generation using Kandinsky - This model inherits from [`DiffusionPipeline`]. Check the superclass documentation for the generic methods the - library implements for all the pipelines (such as downloading or saving, running on a particular device, etc.) Args: + Pipeline for text-to-image generation using Kandinsky This model inherits from [`DiffusionPipeline`]. Check the + superclass documentation for the generic methods the library implements for all the pipelines (such as downloading + or saving, running on a particular device, etc.) scheduler ([`DDIMScheduler`]): A scheduler to be used in combination with `unet` to generate image latents. unet ([`UNet2DConditionModel`]): @@ -312,18 +312,18 @@ def __call__( return_dict: bool = True, ): """ - Function invoked when calling the pipeline for generation. Args: + Function invoked when calling the pipeline for generation. image_embeds (`torch.FloatTensor` or `List[torch.FloatTensor]`): The clip image embeddings for text prompt, that will be used to condition the image generation. 
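# --- Editor's note (not part of the patch): the `prepare_latents` hunks above now call
# `scheduler.add_noise` directly instead of wrapping it in try/except. As a standalone sketch
# with toy shapes (DDIMScheduler defaults assumed), this is the operation the img2img variants
# rely on: the MoVQ-encoded init latents are diffused forward to the timestep selected by
# `strength`, i.e. noisy = sqrt(alpha_bar_t) * x0 + sqrt(1 - alpha_bar_t) * noise.
import torch
from diffusers import DDIMScheduler

scheduler = DDIMScheduler(num_train_timesteps=1000)
init_latents = torch.randn(1, 4, 16, 16)  # stand-in for MoVQ-encoded image latents
noise = torch.randn_like(init_latents)
timestep = torch.tensor([250])            # a lower `strength` maps to a smaller starting timestep
noisy_latents = scheduler.add_noise(init_latents, noise, timestep)
# --- end of editor's note ---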
image (`PIL.Image.Image`): `Image`, or tensor representing an image batch which will be inpainted, *i.e.* parts of the image will be masked out with `mask_image` and repainted according to `prompt`. mask_image (`np.array`): - Tensor representing an image batch, to mask `image`. Black pixels in the mask will be - repainted, while white pixels will be preserved. If `mask_image` is a PIL image, it will be converted - to a single channel (luminance) before use. If it's a tensor, it should contain one color channel (L) - instead of 3, so the expected shape would be `(B, H, W, 1)`. + Tensor representing an image batch, to mask `image`. Black pixels in the mask will be repainted, while + white pixels will be preserved. If `mask_image` is a PIL image, it will be converted to a single + channel (luminance) before use. If it's a tensor, it should contain one color channel (L) instead of 3, + so the expected shape would be `(B, H, W, 1)`. negative_image_embeds (`torch.FloatTensor` or `List[torch.FloatTensor]`): The clip image embeddings for negative text prompt, will be used to condition the image generation. height (`int`, *optional*, defaults to 512): @@ -353,8 +353,7 @@ def __call__( (`np.array`) or `"pt"` (`torch.Tensor`). return_dict (`bool`, *optional*, defaults to `True`): Whether or not to return a [`~pipelines.ImagePipelineOutput`] instead of a plain tuple. - Examples: - Returns: + Examples: Returns: [`~pipelines.ImagePipelineOutput`] or `tuple` """ device = self._execution_device diff --git a/src/diffusers/pipelines/kandinsky2_2/pipeline_kandinsky2_2_prior.py b/src/diffusers/pipelines/kandinsky2_2/pipeline_kandinsky2_2_prior.py index 2d7ab943b03f..ecee28a7a855 100644 --- a/src/diffusers/pipelines/kandinsky2_2/pipeline_kandinsky2_2_prior.py +++ b/src/diffusers/pipelines/kandinsky2_2/pipeline_kandinsky2_2_prior.py @@ -25,6 +25,7 @@ ```py >>> from diffusers import KandinskyV22Pipeline, KandinskyV22PriorPipeline >>> import torch + >>> pipe_prior = KandinskyV22PriorPipeline.from_pretrained("kandinsky-community/kandinsky-2-2-prior") >>> pipe_prior.to("cuda") >>> prompt = "red cat, 4k photo" @@ -52,6 +53,7 @@ >>> import PIL >>> import torch >>> from torchvision import transforms + >>> pipe_prior = KandinskyV22PriorPipeline.from_pretrained( ... "kandinsky-community/kandinsky-2-2-prior", torch_dtype=torch.float16 ... ) @@ -67,7 +69,9 @@ >>> images_texts = ["a cat", img1, img2] >>> weights = [0.3, 0.3, 0.4] >>> out = pipe_prior.interpolate(images_texts, weights) - >>> pipe = KandinskyV22Pipeline.from_pretrained("kandinsky-community/kandinsky-2-2-decoder", torch_dtype=torch.float16) + >>> pipe = KandinskyV22Pipeline.from_pretrained( + ... "kandinsky-community/kandinsky-2-2-decoder", torch_dtype=torch.float16 + ... ) >>> pipe.to("cuda") >>> image = pipe( ... image_embeds=out.image_embeds, @@ -84,8 +88,8 @@ @dataclass class KandinskyPriorPipelineOutput(BaseOutput): """ - Output class for KandinskyPriorPipeline. Args: + Output class for KandinskyPriorPipeline. image_embeds (`torch.FloatTensor`) clip image embeddings for text prompt negative_image_embeds (`List[PIL.Image.Image]` or `np.ndarray`) @@ -98,10 +102,10 @@ class KandinskyPriorPipelineOutput(BaseOutput): class KandinskyV22PriorPipeline(DiffusionPipeline): """ - Pipeline for generating image prior for Kandinsky - This model inherits from [`DiffusionPipeline`]. Check the superclass documentation for the generic methods the - library implements for all the pipelines (such as downloading or saving, running on a particular device, etc.) 
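# --- Editor's note (not part of the patch): a toy example of the inpainting mask convention
# documented in the `mask_image` argument above - black (0) pixels are repainted, white (1)
# pixels are preserved. The masked region chosen here is only illustrative.
import numpy as np

mask = np.ones((768, 768), dtype=np.float32)  # keep the whole image by default
mask[:250, 250:-250] = 0                      # repaint a rectangle at the top of the image
# --- end of editor's note ---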
Args: + Pipeline for generating image prior for Kandinsky This model inherits from [`DiffusionPipeline`]. Check the + superclass documentation for the generic methods the library implements for all the pipelines (such as downloading + or saving, running on a particular device, etc.) prior ([`PriorTransformer`]): The canonincal unCLIP prior to approximate the image embedding from the text embedding. image_encoder ([`CLIPVisionModelWithProjection`]): @@ -153,8 +157,8 @@ def interpolate( device=None, ): """ - Function invoked when using the prior pipeline for interpolation. Args: + Function invoked when using the prior pipeline for interpolation. images_and_prompts (`List[Union[str, PIL.Image.Image, torch.FloatTensor]]`): list of prompts and images to guide the image generation. weights: (`List[float]`): @@ -183,8 +187,7 @@ def interpolate( Paper](https://arxiv.org/pdf/2205.11487.pdf). Guidance scale is enabled by setting `guidance_scale > 1`. Higher guidance scale encourages to generate images that are closely linked to the text `prompt`, usually at the expense of lower image quality. - Examples: - Returns: + Examples: Returns: [`KandinskyPriorPipelineOutput`] or `tuple` """ @@ -412,8 +415,8 @@ def __call__( return_dict: bool = True, ): """ - Function invoked when calling the pipeline for generation. Args: + Function invoked when calling the pipeline for generation. prompt (`str` or `List[str]`): The prompt or prompts to guide the image generation. negative_prompt (`str` or `List[str]`, *optional*): @@ -442,8 +445,7 @@ def __call__( (`torch.Tensor`). return_dict (`bool`, *optional*, defaults to `True`): Whether or not to return a [`~pipelines.ImagePipelineOutput`] instead of a plain tuple. - Examples: - Returns: + Examples: Returns: [`KandinskyPriorPipelineOutput`] or `tuple` """ diff --git a/src/diffusers/pipelines/kandinsky2_2/pipeline_kandinsky2_2_prior_emb2emb.py b/src/diffusers/pipelines/kandinsky2_2/pipeline_kandinsky2_2_prior_emb2emb.py index 921b409f82af..577703b2f341 100644 --- a/src/diffusers/pipelines/kandinsky2_2/pipeline_kandinsky2_2_prior_emb2emb.py +++ b/src/diffusers/pipelines/kandinsky2_2/pipeline_kandinsky2_2_prior_emb2emb.py @@ -23,8 +23,8 @@ @dataclass class KandinskyPriorPipelineOutput(BaseOutput): """ - Output class for KandinskyPriorPipeline. Args: + Output class for KandinskyPriorPipeline. image_embeds (`torch.FloatTensor`) clip image embeddings for text prompt negative_image_embeds (`List[PIL.Image.Image]` or `np.ndarray`) @@ -37,10 +37,10 @@ class KandinskyPriorPipelineOutput(BaseOutput): class KandinskyV22PriorEmb2EmbPipeline(DiffusionPipeline): """ - Pipeline for generating image prior for Kandinsky - This model inherits from [`DiffusionPipeline`]. Check the superclass documentation for the generic methods the - library implements for all the pipelines (such as downloading or saving, running on a particular device, etc.) Args: + Pipeline for generating image prior for Kandinsky This model inherits from [`DiffusionPipeline`]. Check the + superclass documentation for the generic methods the library implements for all the pipelines (such as downloading + or saving, running on a particular device, etc.) prior ([`PriorTransformer`]): The canonincal unCLIP prior to approximate the image embedding from the text embedding. image_encoder ([`CLIPVisionModelWithProjection`]): @@ -98,8 +98,8 @@ def interpolate( device=None, ): """ - Function invoked when using the prior pipeline for interpolation. Args: + Function invoked when using the prior pipeline for interpolation. 
images_and_prompts (`List[Union[str, PIL.Image.Image, torch.FloatTensor]]`): list of prompts and images to guide the image generation. weights: (`List[float]`): @@ -128,8 +128,7 @@ def interpolate( Paper](https://arxiv.org/pdf/2205.11487.pdf). Guidance scale is enabled by setting `guidance_scale > 1`. Higher guidance scale encourages to generate images that are closely linked to the text `prompt`, usually at the expense of lower image quality. - Examples: - Returns: + Examples: Returns: [`KandinskyPriorPipelineOutput`] or `tuple` """ @@ -373,8 +372,8 @@ def __call__( return_dict: bool = True, ): """ - Function invoked when calling the pipeline for generation. Args: + Function invoked when calling the pipeline for generation. prompt (`str` or `List[str]`): The prompt or prompts to guide the image generation. strength (`float`, *optional*, defaults to 0.8): @@ -409,8 +408,7 @@ def __call__( (`torch.Tensor`). return_dict (`bool`, *optional*, defaults to `True`): Whether or not to return a [`~pipelines.ImagePipelineOutput`] instead of a plain tuple. - Examples: - Returns: + Examples: Returns: [`KandinskyPriorPipelineOutput`] or `tuple` """ From 1480cdc45d5bb4b85bec143f1a343a23410f5d35 Mon Sep 17 00:00:00 2001 From: yiyixuxu Date: Tue, 4 Jul 2023 23:46:37 +0000 Subject: [PATCH 21/51] example --- .../kandinsky2_2/pipeline_kandinsky2_2_decoder.py | 4 +++- .../pipeline_kandinsky2_2_decoder_controlnet.py | 5 ++++- ...pipeline_kandinsky2_2_decoder_controlnet_img2img.py | 5 ++++- .../pipeline_kandinsky2_2_decoder_img2img.py | 5 ++++- .../pipeline_kandinsky2_2_decoder_inpainting.py | 5 ++++- .../kandinsky2_2/pipeline_kandinsky2_2_prior.py | 10 ++++++++-- .../pipeline_kandinsky2_2_prior_emb2emb.py | 5 ++++- 7 files changed, 31 insertions(+), 8 deletions(-) diff --git a/src/diffusers/pipelines/kandinsky2_2/pipeline_kandinsky2_2_decoder.py b/src/diffusers/pipelines/kandinsky2_2/pipeline_kandinsky2_2_decoder.py index 362507562198..c699fd8909b6 100644 --- a/src/diffusers/pipelines/kandinsky2_2/pipeline_kandinsky2_2_decoder.py +++ b/src/diffusers/pipelines/kandinsky2_2/pipeline_kandinsky2_2_decoder.py @@ -225,7 +225,9 @@ def __call__( (`np.array`) or `"pt"` (`torch.Tensor`). return_dict (`bool`, *optional*, defaults to `True`): Whether or not to return a [`~pipelines.ImagePipelineOutput`] instead of a plain tuple. - Examples: Returns: + Examples: + + Returns: [`~pipelines.ImagePipelineOutput`] or `tuple` """ device = self._execution_device diff --git a/src/diffusers/pipelines/kandinsky2_2/pipeline_kandinsky2_2_decoder_controlnet.py b/src/diffusers/pipelines/kandinsky2_2/pipeline_kandinsky2_2_decoder_controlnet.py index 397b8fff739a..d518ccbb74d7 100644 --- a/src/diffusers/pipelines/kandinsky2_2/pipeline_kandinsky2_2_decoder_controlnet.py +++ b/src/diffusers/pipelines/kandinsky2_2/pipeline_kandinsky2_2_decoder_controlnet.py @@ -206,7 +206,10 @@ def __call__( (`np.array`) or `"pt"` (`torch.Tensor`). return_dict (`bool`, *optional*, defaults to `True`): Whether or not to return a [`~pipelines.ImagePipelineOutput`] instead of a plain tuple. 
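# --- Editor's note (not part of the patch): with `output_type="np"` (as used throughout the
# tests in this series) the decoder pipelines return `output.images` as a float array of shape
# (batch, height, width, 3) with values in [0, 1]; a PIL image can be recovered like this.
# The random array below is only a stand-in for real pipeline output.
import numpy as np
from PIL import Image

images = np.random.rand(1, 64, 64, 3).astype(np.float32)
pil_image = Image.fromarray((images[0] * 255).round().astype("uint8"))
# --- end of editor's note ---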
- Examples: Returns: + + Examples: + + Returns: [`~pipelines.ImagePipelineOutput`] or `tuple` """ device = self._execution_device diff --git a/src/diffusers/pipelines/kandinsky2_2/pipeline_kandinsky2_2_decoder_controlnet_img2img.py b/src/diffusers/pipelines/kandinsky2_2/pipeline_kandinsky2_2_decoder_controlnet_img2img.py index f66d3aa0a4c8..452aa755da90 100644 --- a/src/diffusers/pipelines/kandinsky2_2/pipeline_kandinsky2_2_decoder_controlnet_img2img.py +++ b/src/diffusers/pipelines/kandinsky2_2/pipeline_kandinsky2_2_decoder_controlnet_img2img.py @@ -259,7 +259,10 @@ def __call__( (`np.array`) or `"pt"` (`torch.Tensor`). return_dict (`bool`, *optional*, defaults to `True`): Whether or not to return a [`~pipelines.ImagePipelineOutput`] instead of a plain tuple. - Examples: Returns: + + Examples: + + Returns: [`~pipelines.ImagePipelineOutput`] or `tuple` """ device = self._execution_device diff --git a/src/diffusers/pipelines/kandinsky2_2/pipeline_kandinsky2_2_decoder_img2img.py b/src/diffusers/pipelines/kandinsky2_2/pipeline_kandinsky2_2_decoder_img2img.py index fa289481cfe7..2d45223fabcf 100644 --- a/src/diffusers/pipelines/kandinsky2_2/pipeline_kandinsky2_2_decoder_img2img.py +++ b/src/diffusers/pipelines/kandinsky2_2/pipeline_kandinsky2_2_decoder_img2img.py @@ -257,7 +257,10 @@ def __call__( (`np.array`) or `"pt"` (`torch.Tensor`). return_dict (`bool`, *optional*, defaults to `True`): Whether or not to return a [`~pipelines.ImagePipelineOutput`] instead of a plain tuple. - Examples: Returns: + + Examples: + + Returns: [`~pipelines.ImagePipelineOutput`] or `tuple` """ device = self._execution_device diff --git a/src/diffusers/pipelines/kandinsky2_2/pipeline_kandinsky2_2_decoder_inpainting.py b/src/diffusers/pipelines/kandinsky2_2/pipeline_kandinsky2_2_decoder_inpainting.py index 5f091455131a..8e88f469fdef 100644 --- a/src/diffusers/pipelines/kandinsky2_2/pipeline_kandinsky2_2_decoder_inpainting.py +++ b/src/diffusers/pipelines/kandinsky2_2/pipeline_kandinsky2_2_decoder_inpainting.py @@ -353,7 +353,10 @@ def __call__( (`np.array`) or `"pt"` (`torch.Tensor`). return_dict (`bool`, *optional*, defaults to `True`): Whether or not to return a [`~pipelines.ImagePipelineOutput`] instead of a plain tuple. - Examples: Returns: + + Examples: + + Returns: [`~pipelines.ImagePipelineOutput`] or `tuple` """ device = self._execution_device diff --git a/src/diffusers/pipelines/kandinsky2_2/pipeline_kandinsky2_2_prior.py b/src/diffusers/pipelines/kandinsky2_2/pipeline_kandinsky2_2_prior.py index ecee28a7a855..9f5c9057e5f8 100644 --- a/src/diffusers/pipelines/kandinsky2_2/pipeline_kandinsky2_2_prior.py +++ b/src/diffusers/pipelines/kandinsky2_2/pipeline_kandinsky2_2_prior.py @@ -187,7 +187,10 @@ def interpolate( Paper](https://arxiv.org/pdf/2205.11487.pdf). Guidance scale is enabled by setting `guidance_scale > 1`. Higher guidance scale encourages to generate images that are closely linked to the text `prompt`, usually at the expense of lower image quality. - Examples: Returns: + + Examples: + + Returns: [`KandinskyPriorPipelineOutput`] or `tuple` """ @@ -445,7 +448,10 @@ def __call__( (`torch.Tensor`). return_dict (`bool`, *optional*, defaults to `True`): Whether or not to return a [`~pipelines.ImagePipelineOutput`] instead of a plain tuple. 
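# --- Editor's note (not part of the patch): a minimal sketch (toy tensors) of the
# classifier-free guidance combination that the `guidance_scale` described above controls; the
# pipelines apply the same step to the prior / unet predictions, and the shapes used here are
# only illustrative.
import torch

guidance_scale = 4.0
pred_uncond = torch.randn(1, 32)  # prediction for the negative / empty prompt
pred_text = torch.randn(1, 32)    # prediction for the text prompt
pred = pred_uncond + guidance_scale * (pred_text - pred_uncond)
# --- end of editor's note ---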
- Examples: Returns: + + Examples: + + Returns: [`KandinskyPriorPipelineOutput`] or `tuple` """ diff --git a/src/diffusers/pipelines/kandinsky2_2/pipeline_kandinsky2_2_prior_emb2emb.py b/src/diffusers/pipelines/kandinsky2_2/pipeline_kandinsky2_2_prior_emb2emb.py index 577703b2f341..bd2f2f71753f 100644 --- a/src/diffusers/pipelines/kandinsky2_2/pipeline_kandinsky2_2_prior_emb2emb.py +++ b/src/diffusers/pipelines/kandinsky2_2/pipeline_kandinsky2_2_prior_emb2emb.py @@ -128,7 +128,10 @@ def interpolate( Paper](https://arxiv.org/pdf/2205.11487.pdf). Guidance scale is enabled by setting `guidance_scale > 1`. Higher guidance scale encourages to generate images that are closely linked to the text `prompt`, usually at the expense of lower image quality. - Examples: Returns: + + Examples: + + Returns: [`KandinskyPriorPipelineOutput`] or `tuple` """ From ce7ea47995bc61fb63215a742fe09b67e6353eb6 Mon Sep 17 00:00:00 2001 From: yiyixuxu Date: Wed, 5 Jul 2023 01:31:47 +0000 Subject: [PATCH 22/51] fix --- .../pipeline_kandinsky2_2_decoder_controlnet_img2img.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/diffusers/pipelines/kandinsky2_2/pipeline_kandinsky2_2_decoder_controlnet_img2img.py b/src/diffusers/pipelines/kandinsky2_2/pipeline_kandinsky2_2_decoder_controlnet_img2img.py index 452aa755da90..bf9aff863d2d 100644 --- a/src/diffusers/pipelines/kandinsky2_2/pipeline_kandinsky2_2_decoder_controlnet_img2img.py +++ b/src/diffusers/pipelines/kandinsky2_2/pipeline_kandinsky2_2_decoder_controlnet_img2img.py @@ -276,7 +276,7 @@ def __call__( if isinstance(hint, list): hint = torch.cat(hint, dim=0) - batch_size = image_embeds.shape[0] * num_images_per_prompt + batch_size = image_embeds.shape[0] if do_classifier_free_guidance: image_embeds = image_embeds.repeat_interleave(num_images_per_prompt, dim=0) From 935614d9b5682fd705f80c920e8fb0046291d01a Mon Sep 17 00:00:00 2001 From: yiyixuxu Date: Wed, 5 Jul 2023 03:34:32 +0000 Subject: [PATCH 23/51] add doc string examples to all kandinsky pipelines --- .../pipeline_kandinsky2_2_decoder.py | 16 +-- ...ipeline_kandinsky2_2_decoder_controlnet.py | 79 +++++++++++-- ...kandinsky2_2_decoder_controlnet_img2img.py | 79 +++++++++++-- .../pipeline_kandinsky2_2_decoder_img2img.py | 60 ++++++++-- ...ipeline_kandinsky2_2_decoder_inpainting.py | 58 +++++++++- .../pipeline_kandinsky2_2_prior.py | 34 +++--- .../pipeline_kandinsky2_2_prior_emb2emb.py | 106 ++++++++++++++++-- 7 files changed, 367 insertions(+), 65 deletions(-) diff --git a/src/diffusers/pipelines/kandinsky2_2/pipeline_kandinsky2_2_decoder.py b/src/diffusers/pipelines/kandinsky2_2/pipeline_kandinsky2_2_decoder.py index c699fd8909b6..d128e699a01e 100644 --- a/src/diffusers/pipelines/kandinsky2_2/pipeline_kandinsky2_2_decoder.py +++ b/src/diffusers/pipelines/kandinsky2_2/pipeline_kandinsky2_2_decoder.py @@ -69,11 +69,13 @@ def get_new_h_w(h, w, scale_factor=8): class KandinskyV22Pipeline(DiffusionPipeline): """ + Pipeline for text-to-image generation using Kandinsky + + This model inherits from [`DiffusionPipeline`]. Check the superclass documentation for the generic methods the + library implements for all the pipelines (such as downloading or saving, running on a particular device, etc.) + Args: - Pipeline for text-to-image generation using Kandinsky This model inherits from [`DiffusionPipeline`]. Check the - superclass documentation for the generic methods the library implements for all the pipelines (such as downloading - or saving, running on a particular device, etc.) 
- scheduler ([`DDPMScheduler`]): + scheduler (Union[`DDIMScheduler`,`DDPMScheduler`]): A scheduler to be used in combination with `unet` to generate image latents. unet ([`UNet2DConditionModel`]): Conditional U-Net architecture to denoise the image embedding. @@ -192,8 +194,8 @@ def __call__( return_dict: bool = True, ): """ - Args: Function invoked when calling the pipeline for generation. + Args: image_embeds (`torch.FloatTensor` or `List[torch.FloatTensor]`): The clip image embeddings for text prompt, that will be used to condition the image generation. negative_image_embeds (`torch.FloatTensor` or `List[torch.FloatTensor]`): @@ -225,6 +227,7 @@ def __call__( (`np.array`) or `"pt"` (`torch.Tensor`). return_dict (`bool`, *optional*, defaults to `True`): Whether or not to return a [`~pipelines.ImagePipelineOutput`] instead of a plain tuple. + Examples: Returns: @@ -294,9 +297,6 @@ def __call__( noise_pred, t, latents, - # YiYi notes: only reason this pipeline can't work with unclip scheduler is that can't pass down this argument - # need to use DDPM scheduler instead - # prev_timestep=prev_timestep, generator=generator, )[0] # post-processing diff --git a/src/diffusers/pipelines/kandinsky2_2/pipeline_kandinsky2_2_decoder_controlnet.py b/src/diffusers/pipelines/kandinsky2_2/pipeline_kandinsky2_2_decoder_controlnet.py index d518ccbb74d7..f6daee890e3a 100644 --- a/src/diffusers/pipelines/kandinsky2_2/pipeline_kandinsky2_2_decoder_controlnet.py +++ b/src/diffusers/pipelines/kandinsky2_2/pipeline_kandinsky2_2_decoder_controlnet.py @@ -30,6 +30,64 @@ logger = logging.get_logger(__name__) # pylint: disable=invalid-name +EXAMPLE_DOC_STRING = """ + Examples: + ```py + >>> import torch + >>> import numpy as np + + >>> from diffusers import KandinskyV22PriorPipeline, KandinskyV22ControlnetPipeline + >>> from transformers import pipeline + >>> from diffusers.utils import load_image + + >>> def make_hint(image, depth_estimator): + ... image = depth_estimator(image)['depth'] + ... image = np.array(image) + ... image = image[:, :, None] + ... image = np.concatenate([image, image, image], axis=2) + ... detected_map = torch.from_numpy(image).float() / 255.0 + ... hint = detected_map.permute(2, 0, 1) + ... return hint + + >>> depth_estimator = pipeline('depth-estimation') + + >>> pipe_prior = KandinskyV22PriorPipeline.from_pretrained('kandinsky-community/kandinsky-2-2-prior',torch_dtype=torch.float16) + >>> pipe_prior = pipe_prior.to("cuda") + + >>> pipe = KandinskyV22ControlnetPipeline.from_pretrained('kandinsky-community/kandinsky-2-2-controlnet-depth', torch_dtype=torch.float16) + >>> pipe = pipe.to("cuda") + + + >>> img = load_image( + ... "https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main" + ... "/kandinsky/cat.png" + ... ).resize((768, 768)) + + >>> hint = make_hint(img, depth_estimator).unsqueeze(0).half().to('cuda') + + >>> prompt = 'A robot, 4k photo' + >>> negative_prior_prompt ='lowres, text, error, cropped, worst quality, low quality, jpeg artifacts, ugly, duplicate, morbid, mutilated, out of frame, extra fingers, mutated hands, poorly drawn hands, poorly drawn face, mutation, deformed, blurry, dehydrated, bad anatomy, bad proportions, extra limbs, cloned face, disfigured, gross proportions, malformed limbs, missing arms, missing legs, extra arms, extra legs, fused fingers, too many fingers, long neck, username, watermark, signature' + + >>> generator = torch.Generator(device='cuda').manual_seed(43) + + >>> image_emb, zero_image_emb = pipe_prior( + ... 
prompt=prompt, + ... negative_prompt=negative_prior_prompt, + ... generator=generator).to_tuple() + + >>> images = pipe( + ... image_embeds=image_emb, + ... negative_image_embeds=zero_image_emb, + ... hint=hint, + ... num_inference_steps=50, + ... generator=generator, + ... height=768, + ... width=768).images + + >>> images[0].save("robot_cat.png") + ``` +""" + def get_new_h_w(h, w, scale_factor=8): new_h = h // scale_factor**2 @@ -43,10 +101,12 @@ def get_new_h_w(h, w, scale_factor=8): class KandinskyV22ControlnetPipeline(DiffusionPipeline): """ + Pipeline for text-to-image generation using Kandinsky + + This model inherits from [`DiffusionPipeline`]. Check the superclass documentation for the generic methods the + library implements for all the pipelines (such as downloading or saving, running on a particular device, etc.) + Args: - Pipeline for text-to-image generation using Kandinsky This model inherits from [`DiffusionPipeline`]. Check the - superclass documentation for the generic methods the library implements for all the pipelines (such as downloading - or saving, running on a particular device, etc.) scheduler ([`DDIMScheduler`]): A scheduler to be used in combination with `unet` to generate image latents. unet ([`UNet2DConditionModel`]): @@ -150,6 +210,7 @@ def _execution_device(self): return self.device @torch.no_grad() + @replace_example_docstring(EXAMPLE_DOC_STRING) def __call__( self, image_embeds: Union[torch.FloatTensor, List[torch.FloatTensor]], @@ -166,8 +227,9 @@ def __call__( return_dict: bool = True, ): """ - Args: Function invoked when calling the pipeline for generation. + + Args: prompt (`str` or `List[str]`): The prompt or prompts to guide the image generation. hint (`torch.FloatTensor`): @@ -206,9 +268,9 @@ def __call__( (`np.array`) or `"pt"` (`torch.Tensor`). return_dict (`bool`, *optional*, defaults to `True`): Whether or not to return a [`~pipelines.ImagePipelineOutput`] instead of a plain tuple. - - Examples: - + + Examples: + Returns: [`~pipelines.ImagePipelineOutput`] or `tuple` """ @@ -280,9 +342,6 @@ def __call__( noise_pred, t, latents, - # YiYi notes: only reason this pipeline can't work with unclip scheduler is that can't pass down this argument - # need to use DDPM scheduler instead - # prev_timestep=prev_timestep, generator=generator, )[0] # post-processing diff --git a/src/diffusers/pipelines/kandinsky2_2/pipeline_kandinsky2_2_decoder_controlnet_img2img.py b/src/diffusers/pipelines/kandinsky2_2/pipeline_kandinsky2_2_decoder_controlnet_img2img.py index bf9aff863d2d..147daa9a6214 100644 --- a/src/diffusers/pipelines/kandinsky2_2/pipeline_kandinsky2_2_decoder_controlnet_img2img.py +++ b/src/diffusers/pipelines/kandinsky2_2/pipeline_kandinsky2_2_decoder_controlnet_img2img.py @@ -33,6 +33,64 @@ logger = logging.get_logger(__name__) # pylint: disable=invalid-name +EXAMPLE_DOC_STRING = """ + Examples: + ```py + >>> import torch + >>> import numpy as np + + >>> from diffusers import KandinskyV22PriorEmb2EmbPipeline, KandinskyV22ControlnetImg2ImgPipeline + >>> from transformers import pipeline + >>> from diffusers.utils import load_image + + >>> def make_hint(image, depth_estimator): + ... image = depth_estimator(image)['depth'] + ... image = np.array(image) + ... image = image[:, :, None] + ... image = np.concatenate([image, image, image], axis=2) + ... detected_map = torch.from_numpy(image).float() / 255.0 + ... hint = detected_map.permute(2, 0, 1) + ... 
return hint + + >>> depth_estimator = pipeline('depth-estimation') + + >>> pipe_prior = KandinskyV22PriorEmb2EmbPipeline.from_pretrained('kandinsky-community/kandinsky-2-2-prior',torch_dtype=torch.float16) + >>> pipe_prior = pipe_prior.to("cuda") + + >>> pipe = KandinskyV22ControlnetImg2ImgPipeline.from_pretrained('kandinsky-community/kandinsky-2-2-controlnet-depth', torch_dtype=torch.float16) + >>> pipe = pipe.to("cuda") + + >>> img = load_image( + ... "https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main" + ... "/kandinsky/cat.png" + ... ).resize((768, 768)) + + + >>> hint = make_hint(img, depth_estimator).unsqueeze(0).half().to('cuda') + + >>> prompt = 'A robot, 4k photo' + >>> negative_prior_prompt ='lowres, text, error, cropped, worst quality, low quality, jpeg artifacts, ugly, duplicate, morbid, mutilated, out of frame, extra fingers, mutated hands, poorly drawn hands, poorly drawn face, mutation, deformed, blurry, dehydrated, bad anatomy, bad proportions, extra limbs, cloned face, disfigured, gross proportions, malformed limbs, missing arms, missing legs, extra arms, extra legs, fused fingers, too many fingers, long neck, username, watermark, signature' + + >>> generator = torch.Generator(device='cuda').manual_seed(43) + + >>> img_emb = pipe_prior(prompt=prompt, image=img, strength=0.85, generator=generator) + >>> negative_emb = pipe_prior(prompt=negative_prior_prompt, image=img, strength=1, generator=generator) + + >>> images = pipe( + ... image=img, + ... strength=0.5, + ... image_embeds=img_emb.image_embeds, + ... negative_image_embeds=negative_emb.image_embeds, + ... hint=hint, + ... num_inference_steps=50, + ... generator=generator, + ... height=768, + ... width=768).images + + >>> images[0].save("robot_cat.png") + ``` +""" + def get_new_h_w(h, w, scale_factor=8): new_h = h // scale_factor**2 @@ -55,15 +113,17 @@ def prepare_image(pil_image, w=512, h=512): class KandinskyV22ControlnetImg2ImgPipeline(DiffusionPipeline): """ + Pipeline for image-to-image generation using Kandinsky + + This model inherits from [`DiffusionPipeline`]. Check the superclass documentation for the generic methods the + library implements for all the pipelines (such as downloading or saving, running on a particular device, etc.) + Args: - Pipeline for text-to-image generation using Kandinsky This model inherits from [`DiffusionPipeline`]. Check the - superclass documentation for the generic methods the library implements for all the pipelines (such as downloading - or saving, running on a particular device, etc.) scheduler ([`DDIMScheduler`]): A scheduler to be used in combination with `unet` to generate image latents. unet ([`UNet2DConditionModel`]): Conditional U-Net architecture to denoise the image embedding. - movq ([`VQModel`]): + vae ([`VQModel`]): MoVQ Decoder to generate the image from the latents. """ @@ -201,6 +261,7 @@ def _execution_device(self): return self.device @torch.no_grad() + @replace_example_docstring(EXAMPLE_DOC_STRING) def __call__( self, image_embeds: Union[torch.FloatTensor, List[torch.FloatTensor]], @@ -218,8 +279,10 @@ def __call__( return_dict: bool = True, ): """ - Args: Function invoked when calling the pipeline for generation. + + Args: + image_embeds (`torch.FloatTensor` or `List[torch.FloatTensor]`): The clip image embeddings for text prompt, that will be used to condition the image generation. 
image (`torch.FloatTensor`, `PIL.Image.Image`, `np.ndarray`, `List[torch.FloatTensor]`, `List[PIL.Image.Image]`, or `List[np.ndarray]`): @@ -259,9 +322,9 @@ def __call__( (`np.array`) or `"pt"` (`torch.Tensor`). return_dict (`bool`, *optional*, defaults to `True`): Whether or not to return a [`~pipelines.ImagePipelineOutput`] instead of a plain tuple. - - Examples: - + + Examples: + Returns: [`~pipelines.ImagePipelineOutput`] or `tuple` """ diff --git a/src/diffusers/pipelines/kandinsky2_2/pipeline_kandinsky2_2_decoder_img2img.py b/src/diffusers/pipelines/kandinsky2_2/pipeline_kandinsky2_2_decoder_img2img.py index 2d45223fabcf..39693b3da0cb 100644 --- a/src/diffusers/pipelines/kandinsky2_2/pipeline_kandinsky2_2_decoder_img2img.py +++ b/src/diffusers/pipelines/kandinsky2_2/pipeline_kandinsky2_2_decoder_img2img.py @@ -33,6 +33,45 @@ logger = logging.get_logger(__name__) # pylint: disable=invalid-name +EXAMPLE_DOC_STRING = """ + Examples: + ```py + >>> from diffusers import KandinskyV22Img2ImgPipeline, KandinskyV22PriorPipeline + >>> from diffusers.utils import load_image + >>> import torch + + >>> pipe_prior = KandinskyV22PriorPipeline.from_pretrained( + ... "kandinsky-community/kandinsky-2-2-prior", torch_dtype=torch.float16 + ... ) + >>> pipe_prior.to("cuda") + + >>> prompt = "A red cartoon frog, 4k" + >>> image_emb, zero_image_emb = pipe_prior(prompt, return_dict=False) + + >>> pipe = KandinskyV22Img2ImgPipeline.from_pretrained( + ... "kandinsky-community/kandinsky-2-2-decoder", torch_dtype=torch.float16 + ... ) + >>> pipe.to("cuda") + + >>> init_image = load_image( + ... "https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main" + ... "/kandinsky/frog.png" + ... ) + + >>> image = pipe( + ... image=init_image, + ... image_embeds=image_emb, + ... negative_image_embeds=zero_image_emb, + ... height=768, + ... width=768, + ... num_inference_steps=100, + ... strength=0.2, + ... ).images + + >>> image[0].save("red_frog.png") + ``` +""" + def get_new_h_w(h, w, scale_factor=8): new_h = h // scale_factor**2 @@ -55,15 +94,17 @@ def prepare_image(pil_image, w=512, h=512): class KandinskyV22Img2ImgPipeline(DiffusionPipeline): """ + Pipeline for image-to-image generation using Kandinsky + + This model inherits from [`DiffusionPipeline`]. Check the superclass documentation for the generic methods the + library implements for all the pipelines (such as downloading or saving, running on a particular device, etc.) + Args: - Pipeline for text-to-image generation using Kandinsky This model inherits from [`DiffusionPipeline`]. Check the - superclass documentation for the generic methods the library implements for all the pipelines (such as downloading - or saving, running on a particular device, etc.) scheduler ([`DDIMScheduler`]): A scheduler to be used in combination with `unet` to generate image latents. unet ([`UNet2DConditionModel`]): Conditional U-Net architecture to denoise the image embedding. - movq ([`VQModel`]): + vae ([`VQModel`]): MoVQ Decoder to generate the image from the latents. """ @@ -202,6 +243,7 @@ def _execution_device(self): return self.device @torch.no_grad() + @replace_example_docstring(EXAMPLE_DOC_STRING) def __call__( self, image_embeds: Union[torch.FloatTensor, List[torch.FloatTensor]], @@ -218,8 +260,10 @@ def __call__( return_dict: bool = True, ): """ - Args: Function invoked when calling the pipeline for generation. 
+ + Args: + image_embeds (`torch.FloatTensor` or `List[torch.FloatTensor]`): The clip image embeddings for text prompt, that will be used to condition the image generation. image (`torch.FloatTensor`, `PIL.Image.Image`, `np.ndarray`, `List[torch.FloatTensor]`, `List[PIL.Image.Image]`, or `List[np.ndarray]`): @@ -257,9 +301,9 @@ def __call__( (`np.array`) or `"pt"` (`torch.Tensor`). return_dict (`bool`, *optional*, defaults to `True`): Whether or not to return a [`~pipelines.ImagePipelineOutput`] instead of a plain tuple. - - Examples: - + + Examples: + Returns: [`~pipelines.ImagePipelineOutput`] or `tuple` """ diff --git a/src/diffusers/pipelines/kandinsky2_2/pipeline_kandinsky2_2_decoder_inpainting.py b/src/diffusers/pipelines/kandinsky2_2/pipeline_kandinsky2_2_decoder_inpainting.py index 8e88f469fdef..88130a52eb9b 100644 --- a/src/diffusers/pipelines/kandinsky2_2/pipeline_kandinsky2_2_decoder_inpainting.py +++ b/src/diffusers/pipelines/kandinsky2_2/pipeline_kandinsky2_2_decoder_inpainting.py @@ -35,6 +35,50 @@ logger = logging.get_logger(__name__) # pylint: disable=invalid-name +EXAMPLE_DOC_STRING = """ + Examples: + ```py + >>> from diffusers import KandinskyV22InpaintPipeline, KandinskyV22PriorPipeline + >>> from diffusers.utils import load_image + >>> import torch + >>> import numpy as np + + >>> pipe_prior = KandinskyV22PriorPipeline.from_pretrained( + ... "kandinsky-community/kandinsky-2-2-prior", torch_dtype=torch.float16 + ... ) + >>> pipe_prior.to("cuda") + + >>> prompt = "a hat" + >>> image_emb, zero_image_emb = pipe_prior(prompt, return_dict=False) + + >>> pipe = KandinskyV22InpaintPipeline.from_pretrained( + ... "kandinsky-community/kandinsky-2-2-decoder-inpaint", torch_dtype=torch.float16 + ... ) + >>> pipe.to("cuda") + + >>> init_image = load_image( + ... "https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main" + ... "/kandinsky/cat.png" + ... ) + + >>> mask = np.ones((768, 768), dtype=np.float32) + >>> mask[:250, 250:-250] = 0 + + >>> out = pipe( + ... image=init_image, + ... mask_image=mask, + ... image_embeds=image_emb, + ... negative_image_embeds=zero_image_emb, + ... height=768, + ... width=768, + ... num_inference_steps=50, + ... ) + + >>> image = out.images[0] + >>> image.save("cat_with_hat.png") + ``` +""" + def get_new_h_w(h, w, scale_factor=8): new_h = h // scale_factor**2 @@ -188,15 +232,18 @@ def prepare_mask_and_masked_image(image, mask, height, width): class KandinskyV22InpaintPipeline(DiffusionPipeline): """ + Pipeline for text-guided image inpainting using Kandinsky2.1 + + This model inherits from [`DiffusionPipeline`]. Check the superclass documentation for the generic methods the + library implements for all the pipelines (such as downloading or saving, running on a particular device, etc.) + Args: - Pipeline for text-to-image generation using Kandinsky This model inherits from [`DiffusionPipeline`]. Check the - superclass documentation for the generic methods the library implements for all the pipelines (such as downloading - or saving, running on a particular device, etc.) + scheduler ([`DDIMScheduler`]): A scheduler to be used in combination with `unet` to generate image latents. unet ([`UNet2DConditionModel`]): Conditional U-Net architecture to denoise the image embedding. - movq ([`VQModel`]): + vae ([`VQModel`]): MoVQ Decoder to generate the image from the latents. 
""" @@ -295,6 +342,7 @@ def _execution_device(self): return self.device @torch.no_grad() + @replace_example_docstring(EXAMPLE_DOC_STRING) def __call__( self, image_embeds: Union[torch.FloatTensor, List[torch.FloatTensor]], @@ -312,8 +360,8 @@ def __call__( return_dict: bool = True, ): """ - Args: Function invoked when calling the pipeline for generation. + Args: image_embeds (`torch.FloatTensor` or `List[torch.FloatTensor]`): The clip image embeddings for text prompt, that will be used to condition the image generation. image (`PIL.Image.Image`): diff --git a/src/diffusers/pipelines/kandinsky2_2/pipeline_kandinsky2_2_prior.py b/src/diffusers/pipelines/kandinsky2_2/pipeline_kandinsky2_2_prior.py index 9f5c9057e5f8..4aced813af6e 100644 --- a/src/diffusers/pipelines/kandinsky2_2/pipeline_kandinsky2_2_prior.py +++ b/src/diffusers/pipelines/kandinsky2_2/pipeline_kandinsky2_2_prior.py @@ -29,14 +29,13 @@ >>> pipe_prior = KandinskyV22PriorPipeline.from_pretrained("kandinsky-community/kandinsky-2-2-prior") >>> pipe_prior.to("cuda") >>> prompt = "red cat, 4k photo" - >>> out = pipe_prior(prompt) - >>> image_emb = out.image_embeds - >>> zero_image_emb = out.negative_image_embeds + >>> image_emb, negative_image_emb = pipe_prior(prompt).to_tuple() + >>> pipe = KandinskyV22Pipeline.from_pretrained("kandinsky-community/kandinsky-2-2-decoder") >>> pipe.to("cuda") >>> image = pipe( ... image_embeds=image_emb, - ... negative_image_embeds=zero_image_emb, + ... negative_image_embeds=negative_image_emb, ... height=768, ... width=768, ... num_inference_steps=50, @@ -101,11 +100,13 @@ class KandinskyPriorPipelineOutput(BaseOutput): class KandinskyV22PriorPipeline(DiffusionPipeline): - """ + """ + Pipeline for generating image prior for Kandinsky + + This model inherits from [`DiffusionPipeline`]. Check the superclass documentation for the generic methods the + library implements for all the pipelines (such as downloading or saving, running on a particular device, etc.) + Args: - Pipeline for generating image prior for Kandinsky This model inherits from [`DiffusionPipeline`]. Check the - superclass documentation for the generic methods the library implements for all the pipelines (such as downloading - or saving, running on a particular device, etc.) prior ([`PriorTransformer`]): The canonincal unCLIP prior to approximate the image embedding from the text embedding. image_encoder ([`CLIPVisionModelWithProjection`]): @@ -157,8 +158,8 @@ def interpolate( device=None, ): """ - Args: Function invoked when using the prior pipeline for interpolation. + Args: images_and_prompts (`List[Union[str, PIL.Image.Image, torch.FloatTensor]]`): list of prompts and images to guide the image generation. weights: (`List[float]`): @@ -187,9 +188,9 @@ def interpolate( Paper](https://arxiv.org/pdf/2205.11487.pdf). Guidance scale is enabled by setting `guidance_scale > 1`. Higher guidance scale encourages to generate images that are closely linked to the text `prompt`, usually at the expense of lower image quality. - - Examples: - + + Examples: + Returns: [`KandinskyPriorPipelineOutput`] or `tuple` """ @@ -418,8 +419,9 @@ def __call__( return_dict: bool = True, ): """ - Args: Function invoked when calling the pipeline for generation. + + Args: prompt (`str` or `List[str]`): The prompt or prompts to guide the image generation. negative_prompt (`str` or `List[str]`, *optional*): @@ -448,9 +450,9 @@ def __call__( (`torch.Tensor`). 
return_dict (`bool`, *optional*, defaults to `True`): Whether or not to return a [`~pipelines.ImagePipelineOutput`] instead of a plain tuple. - - Examples: - + + Examples: + Returns: [`KandinskyPriorPipelineOutput`] or `tuple` """ diff --git a/src/diffusers/pipelines/kandinsky2_2/pipeline_kandinsky2_2_prior_emb2emb.py b/src/diffusers/pipelines/kandinsky2_2/pipeline_kandinsky2_2_prior_emb2emb.py index bd2f2f71753f..1b60c88e0089 100644 --- a/src/diffusers/pipelines/kandinsky2_2/pipeline_kandinsky2_2_prior_emb2emb.py +++ b/src/diffusers/pipelines/kandinsky2_2/pipeline_kandinsky2_2_prior_emb2emb.py @@ -14,17 +14,94 @@ is_accelerate_available, logging, randn_tensor, + replace_example_docstring, ) logger = logging.get_logger(__name__) # pylint: disable=invalid-name +EXAMPLE_DOC_STRING = """ + Examples: + ```py + >>> from diffusers import KandinskyV22Pipeline, KandinskyV22PriorEmb2EmbPipeline + >>> import torch + + >>> pipe_prior = KandinskyPriorPipeline.from_pretrained("kandinsky-community/kandinsky-2-2-prior", torch_dtype=torch.float16) + >>> pipe_prior.to("cuda") + + >>> prompt = "red cat, 4k photo" + >>> img = load_image( + ... "https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main" + ... "/kandinsky/cat.png" + ... ) + >>> image_emb, nagative_image_emb = pipe_prior(prompt, image=img, strength=0.2).to_tuple() + + >>> pipe = KandinskyPipeline.from_pretrained("kandinsky-community/kandinsky-2-2-decoder, torch_dtype=torch.float16") + >>> pipe.to("cuda") + + >>> image = pipe( + ... image_embeds=image_emb, + ... negative_image_embeds=negative_image_emb, + ... height=768, + ... width=768, + ... num_inference_steps=100, + ... ).images + + >>> image[0].save("cat.png") + ``` +""" + +EXAMPLE_INTERPOLATE_DOC_STRING = """ + Examples: + ```py + >>> from diffusers import KandinskyV22PriorEmb2EmbPipeline, KandinskyV22Pipeline + >>> from diffusers.utils import load_image + >>> import PIL + + >>> import torch + >>> from torchvision import transforms + + >>> pipe_prior = KandinskyV22PriorPipeline.from_pretrained( + ... "kandinsky-community/kandinsky-2-2-prior", torch_dtype=torch.float16 + ... ) + >>> pipe_prior.to("cuda") + + >>> img1 = load_image( + ... "https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main" + ... "/kandinsky/cat.png" + ... ) + + >>> img2 = load_image( + ... "https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main" + ... "/kandinsky/starry_night.jpeg" + ... ) + + >>> images_texts = ["a cat", img1, img2] + >>> weights = [0.3, 0.3, 0.4] + >>> image_emb, zero_image_emb = pipe_prior.interpolate(images_texts, weights) + + >>> pipe = KandinskyV22Pipeline.from_pretrained("kandinsky-community/kandinsky-2-2-decoder", torch_dtype=torch.float16) + >>> pipe.to("cuda") + + >>> image = pipe( + ... image_embeds=image_emb, + ... negative_image_embeds=zero_image_emb, + ... height=768, + ... width=768, + ... num_inference_steps=150, + ... ).images[0] + + >>> image.save("starry_cat.png") + ``` +""" + @dataclass class KandinskyPriorPipelineOutput(BaseOutput): """ - Args: Output class for KandinskyPriorPipeline. + + Args: image_embeds (`torch.FloatTensor`) clip image embeddings for text prompt negative_image_embeds (`List[PIL.Image.Image]` or `np.ndarray`) @@ -37,10 +114,12 @@ class KandinskyPriorPipelineOutput(BaseOutput): class KandinskyV22PriorEmb2EmbPipeline(DiffusionPipeline): """ + Pipeline for generating image prior for Kandinsky + + This model inherits from [`DiffusionPipeline`]. 
Check the superclass documentation for the generic methods the + library implements for all the pipelines (such as downloading or saving, running on a particular device, etc.) + Args: - Pipeline for generating image prior for Kandinsky This model inherits from [`DiffusionPipeline`]. Check the - superclass documentation for the generic methods the library implements for all the pipelines (such as downloading - or saving, running on a particular device, etc.) prior ([`PriorTransformer`]): The canonincal unCLIP prior to approximate the image embedding from the text embedding. image_encoder ([`CLIPVisionModelWithProjection`]): @@ -84,6 +163,7 @@ def get_timesteps(self, num_inference_steps, strength, device): return timesteps, num_inference_steps - t_start @torch.no_grad() + @replace_example_docstring(EXAMPLE_INTERPOLATE_DOC_STRING) def interpolate( self, images_and_prompts: List[Union[str, PIL.Image.Image, torch.FloatTensor]], @@ -98,8 +178,9 @@ def interpolate( device=None, ): """ - Args: Function invoked when using the prior pipeline for interpolation. + + Args: images_and_prompts (`List[Union[str, PIL.Image.Image, torch.FloatTensor]]`): list of prompts and images to guide the image generation. weights: (`List[float]`): @@ -128,9 +209,9 @@ def interpolate( Paper](https://arxiv.org/pdf/2205.11487.pdf). Guidance scale is enabled by setting `guidance_scale > 1`. Higher guidance scale encourages to generate images that are closely linked to the text `prompt`, usually at the expense of lower image quality. - - Examples: - + + Examples: + Returns: [`KandinskyPriorPipelineOutput`] or `tuple` """ @@ -360,6 +441,7 @@ def _encode_prompt( return prompt_embeds, text_encoder_hidden_states, text_mask @torch.no_grad() + @replace_example_docstring(EXAMPLE_DOC_STRING) def __call__( self, prompt: Union[str, List[str]], @@ -375,8 +457,9 @@ def __call__( return_dict: bool = True, ): """ - Args: Function invoked when calling the pipeline for generation. + + Args: prompt (`str` or `List[str]`): The prompt or prompts to guide the image generation. strength (`float`, *optional*, defaults to 0.8): @@ -411,7 +494,10 @@ def __call__( (`torch.Tensor`). return_dict (`bool`, *optional*, defaults to `True`): Whether or not to return a [`~pipelines.ImagePipelineOutput`] instead of a plain tuple. - Examples: Returns: + + Examples: + + Returns: [`KandinskyPriorPipelineOutput`] or `tuple` """ From 4c8c3ca88254de70bff1f8ba94d4cdad5fcbd574 Mon Sep 17 00:00:00 2001 From: yiyixuxu Date: Wed, 5 Jul 2023 03:38:54 +0000 Subject: [PATCH 24/51] style --- .../pipeline_kandinsky2_2_decoder.py | 8 +-- ...ipeline_kandinsky2_2_decoder_controlnet.py | 59 +++++++++++-------- ...kandinsky2_2_decoder_controlnet_img2img.py | 59 +++++++++++-------- .../pipeline_kandinsky2_2_decoder_img2img.py | 2 +- ...ipeline_kandinsky2_2_decoder_inpainting.py | 10 ++-- .../pipeline_kandinsky2_2_prior.py | 3 +- .../pipeline_kandinsky2_2_prior_emb2emb.py | 12 +++- 7 files changed, 87 insertions(+), 66 deletions(-) diff --git a/src/diffusers/pipelines/kandinsky2_2/pipeline_kandinsky2_2_decoder.py b/src/diffusers/pipelines/kandinsky2_2/pipeline_kandinsky2_2_decoder.py index d128e699a01e..4df77fe565d9 100644 --- a/src/diffusers/pipelines/kandinsky2_2/pipeline_kandinsky2_2_decoder.py +++ b/src/diffusers/pipelines/kandinsky2_2/pipeline_kandinsky2_2_decoder.py @@ -194,8 +194,8 @@ def __call__( return_dict: bool = True, ): """ - Function invoked when calling the pipeline for generation. Args: + Function invoked when calling the pipeline for generation. 
image_embeds (`torch.FloatTensor` or `List[torch.FloatTensor]`): The clip image embeddings for text prompt, that will be used to condition the image generation. negative_image_embeds (`torch.FloatTensor` or `List[torch.FloatTensor]`): @@ -227,9 +227,9 @@ def __call__( (`np.array`) or `"pt"` (`torch.Tensor`). return_dict (`bool`, *optional*, defaults to `True`): Whether or not to return a [`~pipelines.ImagePipelineOutput`] instead of a plain tuple. - - Examples: - + + Examples: + Returns: [`~pipelines.ImagePipelineOutput`] or `tuple` """ diff --git a/src/diffusers/pipelines/kandinsky2_2/pipeline_kandinsky2_2_decoder_controlnet.py b/src/diffusers/pipelines/kandinsky2_2/pipeline_kandinsky2_2_decoder_controlnet.py index f6daee890e3a..e3c5317312f1 100644 --- a/src/diffusers/pipelines/kandinsky2_2/pipeline_kandinsky2_2_decoder_controlnet.py +++ b/src/diffusers/pipelines/kandinsky2_2/pipeline_kandinsky2_2_decoder_controlnet.py @@ -25,6 +25,7 @@ is_accelerate_version, logging, randn_tensor, + replace_example_docstring, ) @@ -40,49 +41,55 @@ >>> from transformers import pipeline >>> from diffusers.utils import load_image + >>> def make_hint(image, depth_estimator): - ... image = depth_estimator(image)['depth'] - ... image = np.array(image) - ... image = image[:, :, None] - ... image = np.concatenate([image, image, image], axis=2) - ... detected_map = torch.from_numpy(image).float() / 255.0 - ... hint = detected_map.permute(2, 0, 1) - ... return hint + ... image = depth_estimator(image)["depth"] + ... image = np.array(image) + ... image = image[:, :, None] + ... image = np.concatenate([image, image, image], axis=2) + ... detected_map = torch.from_numpy(image).float() / 255.0 + ... hint = detected_map.permute(2, 0, 1) + ... return hint + - >>> depth_estimator = pipeline('depth-estimation') + >>> depth_estimator = pipeline("depth-estimation") - >>> pipe_prior = KandinskyV22PriorPipeline.from_pretrained('kandinsky-community/kandinsky-2-2-prior',torch_dtype=torch.float16) + >>> pipe_prior = KandinskyV22PriorPipeline.from_pretrained( + ... "kandinsky-community/kandinsky-2-2-prior", torch_dtype=torch.float16 + ... ) >>> pipe_prior = pipe_prior.to("cuda") - >>> pipe = KandinskyV22ControlnetPipeline.from_pretrained('kandinsky-community/kandinsky-2-2-controlnet-depth', torch_dtype=torch.float16) + >>> pipe = KandinskyV22ControlnetPipeline.from_pretrained( + ... "kandinsky-community/kandinsky-2-2-controlnet-depth", torch_dtype=torch.float16 + ... ) >>> pipe = pipe.to("cuda") >>> img = load_image( - ... "https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main" - ... "/kandinsky/cat.png" - ... ).resize((768, 768)) + ... "https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main" + ... "/kandinsky/cat.png" + ... 
).resize((768, 768)) - >>> hint = make_hint(img, depth_estimator).unsqueeze(0).half().to('cuda') + >>> hint = make_hint(img, depth_estimator).unsqueeze(0).half().to("cuda") - >>> prompt = 'A robot, 4k photo' - >>> negative_prior_prompt ='lowres, text, error, cropped, worst quality, low quality, jpeg artifacts, ugly, duplicate, morbid, mutilated, out of frame, extra fingers, mutated hands, poorly drawn hands, poorly drawn face, mutation, deformed, blurry, dehydrated, bad anatomy, bad proportions, extra limbs, cloned face, disfigured, gross proportions, malformed limbs, missing arms, missing legs, extra arms, extra legs, fused fingers, too many fingers, long neck, username, watermark, signature' + >>> prompt = "A robot, 4k photo" + >>> negative_prior_prompt = "lowres, text, error, cropped, worst quality, low quality, jpeg artifacts, ugly, duplicate, morbid, mutilated, out of frame, extra fingers, mutated hands, poorly drawn hands, poorly drawn face, mutation, deformed, blurry, dehydrated, bad anatomy, bad proportions, extra limbs, cloned face, disfigured, gross proportions, malformed limbs, missing arms, missing legs, extra arms, extra legs, fused fingers, too many fingers, long neck, username, watermark, signature" - >>> generator = torch.Generator(device='cuda').manual_seed(43) + >>> generator = torch.Generator(device="cuda").manual_seed(43) >>> image_emb, zero_image_emb = pipe_prior( - ... prompt=prompt, - ... negative_prompt=negative_prior_prompt, - ... generator=generator).to_tuple() + ... prompt=prompt, negative_prompt=negative_prior_prompt, generator=generator + ... ).to_tuple() >>> images = pipe( - ... image_embeds=image_emb, - ... negative_image_embeds=zero_image_emb, - ... hint=hint, - ... num_inference_steps=50, + ... image_embeds=image_emb, + ... negative_image_embeds=zero_image_emb, + ... hint=hint, + ... num_inference_steps=50, ... generator=generator, - ... height=768, - ... width=768).images + ... height=768, + ... width=768, + ... ).images >>> images[0].save("robot_cat.png") ``` diff --git a/src/diffusers/pipelines/kandinsky2_2/pipeline_kandinsky2_2_decoder_controlnet_img2img.py b/src/diffusers/pipelines/kandinsky2_2/pipeline_kandinsky2_2_decoder_controlnet_img2img.py index 147daa9a6214..656f0bd1e634 100644 --- a/src/diffusers/pipelines/kandinsky2_2/pipeline_kandinsky2_2_decoder_controlnet_img2img.py +++ b/src/diffusers/pipelines/kandinsky2_2/pipeline_kandinsky2_2_decoder_controlnet_img2img.py @@ -28,6 +28,7 @@ is_accelerate_version, logging, randn_tensor, + replace_example_docstring, ) @@ -43,49 +44,56 @@ >>> from transformers import pipeline >>> from diffusers.utils import load_image + >>> def make_hint(image, depth_estimator): - ... image = depth_estimator(image)['depth'] - ... image = np.array(image) - ... image = image[:, :, None] - ... image = np.concatenate([image, image, image], axis=2) - ... detected_map = torch.from_numpy(image).float() / 255.0 - ... hint = detected_map.permute(2, 0, 1) - ... return hint + ... image = depth_estimator(image)["depth"] + ... image = np.array(image) + ... image = image[:, :, None] + ... image = np.concatenate([image, image, image], axis=2) + ... detected_map = torch.from_numpy(image).float() / 255.0 + ... hint = detected_map.permute(2, 0, 1) + ... 
return hint + - >>> depth_estimator = pipeline('depth-estimation') + >>> depth_estimator = pipeline("depth-estimation") - >>> pipe_prior = KandinskyV22PriorEmb2EmbPipeline.from_pretrained('kandinsky-community/kandinsky-2-2-prior',torch_dtype=torch.float16) + >>> pipe_prior = KandinskyV22PriorEmb2EmbPipeline.from_pretrained( + ... "kandinsky-community/kandinsky-2-2-prior", torch_dtype=torch.float16 + ... ) >>> pipe_prior = pipe_prior.to("cuda") - >>> pipe = KandinskyV22ControlnetImg2ImgPipeline.from_pretrained('kandinsky-community/kandinsky-2-2-controlnet-depth', torch_dtype=torch.float16) + >>> pipe = KandinskyV22ControlnetImg2ImgPipeline.from_pretrained( + ... "kandinsky-community/kandinsky-2-2-controlnet-depth", torch_dtype=torch.float16 + ... ) >>> pipe = pipe.to("cuda") >>> img = load_image( - ... "https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main" - ... "/kandinsky/cat.png" - ... ).resize((768, 768)) + ... "https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main" + ... "/kandinsky/cat.png" + ... ).resize((768, 768)) - >>> hint = make_hint(img, depth_estimator).unsqueeze(0).half().to('cuda') + >>> hint = make_hint(img, depth_estimator).unsqueeze(0).half().to("cuda") - >>> prompt = 'A robot, 4k photo' - >>> negative_prior_prompt ='lowres, text, error, cropped, worst quality, low quality, jpeg artifacts, ugly, duplicate, morbid, mutilated, out of frame, extra fingers, mutated hands, poorly drawn hands, poorly drawn face, mutation, deformed, blurry, dehydrated, bad anatomy, bad proportions, extra limbs, cloned face, disfigured, gross proportions, malformed limbs, missing arms, missing legs, extra arms, extra legs, fused fingers, too many fingers, long neck, username, watermark, signature' + >>> prompt = "A robot, 4k photo" + >>> negative_prior_prompt = "lowres, text, error, cropped, worst quality, low quality, jpeg artifacts, ugly, duplicate, morbid, mutilated, out of frame, extra fingers, mutated hands, poorly drawn hands, poorly drawn face, mutation, deformed, blurry, dehydrated, bad anatomy, bad proportions, extra limbs, cloned face, disfigured, gross proportions, malformed limbs, missing arms, missing legs, extra arms, extra legs, fused fingers, too many fingers, long neck, username, watermark, signature" - >>> generator = torch.Generator(device='cuda').manual_seed(43) + >>> generator = torch.Generator(device="cuda").manual_seed(43) >>> img_emb = pipe_prior(prompt=prompt, image=img, strength=0.85, generator=generator) >>> negative_emb = pipe_prior(prompt=negative_prior_prompt, image=img, strength=1, generator=generator) >>> images = pipe( - ... image=img, - ... strength=0.5, - ... image_embeds=img_emb.image_embeds, - ... negative_image_embeds=negative_emb.image_embeds, - ... hint=hint, - ... num_inference_steps=50, + ... image=img, + ... strength=0.5, + ... image_embeds=img_emb.image_embeds, + ... negative_image_embeds=negative_emb.image_embeds, + ... hint=hint, + ... num_inference_steps=50, ... generator=generator, - ... height=768, - ... width=768).images + ... height=768, + ... width=768, + ... ).images >>> images[0].save("robot_cat.png") ``` @@ -282,7 +290,6 @@ def __call__( Function invoked when calling the pipeline for generation. Args: - image_embeds (`torch.FloatTensor` or `List[torch.FloatTensor]`): The clip image embeddings for text prompt, that will be used to condition the image generation. 
image (`torch.FloatTensor`, `PIL.Image.Image`, `np.ndarray`, `List[torch.FloatTensor]`, `List[PIL.Image.Image]`, or `List[np.ndarray]`): diff --git a/src/diffusers/pipelines/kandinsky2_2/pipeline_kandinsky2_2_decoder_img2img.py b/src/diffusers/pipelines/kandinsky2_2/pipeline_kandinsky2_2_decoder_img2img.py index 39693b3da0cb..c0a8149f8a35 100644 --- a/src/diffusers/pipelines/kandinsky2_2/pipeline_kandinsky2_2_decoder_img2img.py +++ b/src/diffusers/pipelines/kandinsky2_2/pipeline_kandinsky2_2_decoder_img2img.py @@ -28,6 +28,7 @@ is_accelerate_version, logging, randn_tensor, + replace_example_docstring, ) @@ -263,7 +264,6 @@ def __call__( Function invoked when calling the pipeline for generation. Args: - image_embeds (`torch.FloatTensor` or `List[torch.FloatTensor]`): The clip image embeddings for text prompt, that will be used to condition the image generation. image (`torch.FloatTensor`, `PIL.Image.Image`, `np.ndarray`, `List[torch.FloatTensor]`, `List[PIL.Image.Image]`, or `List[np.ndarray]`): diff --git a/src/diffusers/pipelines/kandinsky2_2/pipeline_kandinsky2_2_decoder_inpainting.py b/src/diffusers/pipelines/kandinsky2_2/pipeline_kandinsky2_2_decoder_inpainting.py index 88130a52eb9b..881974d1fc64 100644 --- a/src/diffusers/pipelines/kandinsky2_2/pipeline_kandinsky2_2_decoder_inpainting.py +++ b/src/diffusers/pipelines/kandinsky2_2/pipeline_kandinsky2_2_decoder_inpainting.py @@ -30,6 +30,7 @@ is_accelerate_version, logging, randn_tensor, + replace_example_docstring, ) @@ -238,7 +239,6 @@ class KandinskyV22InpaintPipeline(DiffusionPipeline): library implements for all the pipelines (such as downloading or saving, running on a particular device, etc.) Args: - scheduler ([`DDIMScheduler`]): A scheduler to be used in combination with `unet` to generate image latents. unet ([`UNet2DConditionModel`]): @@ -360,8 +360,8 @@ def __call__( return_dict: bool = True, ): """ - Function invoked when calling the pipeline for generation. Args: + Function invoked when calling the pipeline for generation. image_embeds (`torch.FloatTensor` or `List[torch.FloatTensor]`): The clip image embeddings for text prompt, that will be used to condition the image generation. image (`PIL.Image.Image`): @@ -401,9 +401,9 @@ def __call__( (`np.array`) or `"pt"` (`torch.Tensor`). return_dict (`bool`, *optional*, defaults to `True`): Whether or not to return a [`~pipelines.ImagePipelineOutput`] instead of a plain tuple. - - Examples: - + + Examples: + Returns: [`~pipelines.ImagePipelineOutput`] or `tuple` """ diff --git a/src/diffusers/pipelines/kandinsky2_2/pipeline_kandinsky2_2_prior.py b/src/diffusers/pipelines/kandinsky2_2/pipeline_kandinsky2_2_prior.py index 4aced813af6e..8c92d6ae068a 100644 --- a/src/diffusers/pipelines/kandinsky2_2/pipeline_kandinsky2_2_prior.py +++ b/src/diffusers/pipelines/kandinsky2_2/pipeline_kandinsky2_2_prior.py @@ -100,7 +100,7 @@ class KandinskyPriorPipelineOutput(BaseOutput): class KandinskyV22PriorPipeline(DiffusionPipeline): - """ + """ Pipeline for generating image prior for Kandinsky This model inherits from [`DiffusionPipeline`]. Check the superclass documentation for the generic methods the @@ -159,6 +159,7 @@ def interpolate( ): """ Function invoked when using the prior pipeline for interpolation. + Args: images_and_prompts (`List[Union[str, PIL.Image.Image, torch.FloatTensor]]`): list of prompts and images to guide the image generation. 
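The `images_and_prompts` list and its companion `weights` documented for `interpolate` above drive the prior's embedding interpolation. The snippet below is a minimal sketch of the weighted blend this implies, not code from this patch; the helper name and the assumption that every condition has already been encoded into a CLIP embedding are illustrative only.

```python
from typing import List

import torch


def blend_conditions(embeddings: List[torch.Tensor], weights: List[float]) -> torch.Tensor:
    # Each entry in `embeddings` is the CLIP embedding of one prompt or image from
    # `images_and_prompts`; `weights` sets the relative contribution of each condition.
    if len(embeddings) != len(weights):
        raise ValueError("`images_and_prompts` and `weights` must have the same length")
    blended = torch.zeros_like(embeddings[0])
    for w, emb in zip(weights, embeddings):
        blended = blended + w * emb
    return blended
```
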
diff --git a/src/diffusers/pipelines/kandinsky2_2/pipeline_kandinsky2_2_prior_emb2emb.py b/src/diffusers/pipelines/kandinsky2_2/pipeline_kandinsky2_2_prior_emb2emb.py index 1b60c88e0089..aa930482d352 100644 --- a/src/diffusers/pipelines/kandinsky2_2/pipeline_kandinsky2_2_prior_emb2emb.py +++ b/src/diffusers/pipelines/kandinsky2_2/pipeline_kandinsky2_2_prior_emb2emb.py @@ -26,7 +26,9 @@ >>> from diffusers import KandinskyV22Pipeline, KandinskyV22PriorEmb2EmbPipeline >>> import torch - >>> pipe_prior = KandinskyPriorPipeline.from_pretrained("kandinsky-community/kandinsky-2-2-prior", torch_dtype=torch.float16) + >>> pipe_prior = KandinskyPriorPipeline.from_pretrained( + ... "kandinsky-community/kandinsky-2-2-prior", torch_dtype=torch.float16 + ... ) >>> pipe_prior.to("cuda") >>> prompt = "red cat, 4k photo" @@ -36,7 +38,9 @@ ... ) >>> image_emb, nagative_image_emb = pipe_prior(prompt, image=img, strength=0.2).to_tuple() - >>> pipe = KandinskyPipeline.from_pretrained("kandinsky-community/kandinsky-2-2-decoder, torch_dtype=torch.float16") + >>> pipe = KandinskyPipeline.from_pretrained( + ... "kandinsky-community/kandinsky-2-2-decoder, torch_dtype=torch.float16" + ... ) >>> pipe.to("cuda") >>> image = pipe( @@ -80,7 +84,9 @@ >>> weights = [0.3, 0.3, 0.4] >>> image_emb, zero_image_emb = pipe_prior.interpolate(images_texts, weights) - >>> pipe = KandinskyV22Pipeline.from_pretrained("kandinsky-community/kandinsky-2-2-decoder", torch_dtype=torch.float16) + >>> pipe = KandinskyV22Pipeline.from_pretrained( + ... "kandinsky-community/kandinsky-2-2-decoder", torch_dtype=torch.float16 + ... ) >>> pipe.to("cuda") >>> image = pipe( From e737939c3456710e8eb3e1fad1f9b0bfbb460ff3 Mon Sep 17 00:00:00 2001 From: yiyixuxu Date: Wed, 5 Jul 2023 05:23:19 +0000 Subject: [PATCH 25/51] update doc --- docs/source/en/api/pipelines/kandinsky.mdx | 195 ++++++++++++++++++++- 1 file changed, 190 insertions(+), 5 deletions(-) diff --git a/docs/source/en/api/pipelines/kandinsky.mdx b/docs/source/en/api/pipelines/kandinsky.mdx index bf551249ef05..11aba14b3df2 100644 --- a/docs/source/en/api/pipelines/kandinsky.mdx +++ b/docs/source/en/api/pipelines/kandinsky.mdx @@ -22,8 +22,13 @@ The Kandinsky model is created by [Arseniy Shakhmatov](https://github.com/cene55 | Pipeline | Tasks | |---|---| | [pipeline_kandinsky.py](https://github.com/huggingface/diffusers/blob/main/src/diffusers/pipelines/kandinsky/pipeline_kandinsky.py) | *Text-to-Image Generation* | +| [pipeline_kandinsky2_2_decoder.py](https://github.com/huggingface/diffusers/blob/main/src/diffusers/pipelines/kandinsky2_2/pipeline_kandinsky2_2_decoder.py) | *Text-to-Image Generation* | | [pipeline_kandinsky_inpaint.py](https://github.com/huggingface/diffusers/blob/main/src/diffusers/pipelines/kandinsky/pipeline_kandinsky_inpaint.py) | *Image-Guided Image Generation* | +| [pipeline_kandinsky2_2_decoder_inpaint.py](https://github.com/huggingface/diffusers/blob/main/src/diffusers/pipelines/kandinsky2_2/pipeline_kandinsky2_2_decoder_inpaint.py) | *Image-Guided Image Generation* | | [pipeline_kandinsky_img2img.py](https://github.com/huggingface/diffusers/blob/main/src/diffusers/pipelines/kandinsky/pipeline_kandinsky_img2img.py) | *Image-Guided Image Generation* | +| [pipeline_kandinsky2_2_decoder_img2img.py](https://github.com/huggingface/diffusers/blob/main/src/diffusers/pipelines/kandinsky2_2/pipeline_kandinsky2_2_decoder_img2img.py) | *Image-Guided Image Generation* | +| 
[pipeline_kandinsky2_2_decoder_controlnet.py](https://github.com/huggingface/diffusers/blob/main/src/diffusers/pipelines/kandinsky2_2/pipeline_kandinsky2_2_decoder_controlnet.py) | *Image-Guided Image Generation* | +| [pipeline_kandinsky2_2_decoder_controlnet_img2img.py](https://github.com/huggingface/diffusers/blob/main/src/diffusers/pipelines/kandinsky2_2/pipeline_kandinsky2_2_decoder_controlnet_img2img.py) | *Image-Guided Image Generation* | ## Usage example @@ -135,6 +140,7 @@ prompt = "birds eye view of a quilted paper style alien planet landscape, vibran ![img](https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/kandinsky-docs/alienplanet.png) + ### Text Guided Image-to-Image Generation The same Kandinsky model weights can be used for text-guided image-to-image translation. In this case, just make sure to load the weights using the [`KandinskyImg2ImgPipeline`] pipeline. @@ -283,6 +289,139 @@ image.save("starry_cat.png") ![img](https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/kandinsky-docs/starry_cat.png) +### Text-to-Image Generation with ControlNet Conditioning + +In the following, we give a simple example of how to use [`KandinskyV22ControlnetPipeline`] to add control to the text-to-image generation with a depth image. + +First, let's take an image and extract its depth map. + +```python +from diffusers.utils import load_image + +img = load_image( + "https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main" + "/kandinsky/cat.png" + ).resize((768, 768)) + +``` +![img](https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main/kandinskyv22/cat.png) + +We can use `depth-estimation` pipeline from transformers to process the image and retrieve its depth map + +```python +import torch +import numpy as np + +from transformers import pipeline +from diffusers.utils import load_image + +def make_hint(image, depth_estimator): + image = depth_estimator(image)['depth'] + image = np.array(image) + image = image[:, :, None] + image = np.concatenate([image, image, image], axis=2) + detected_map = torch.from_numpy(image).float() / 255.0 + hint = detected_map.permute(2, 0, 1) + return hint + +depth_estimator = pipeline('depth-estimation') +hint = make_hint(img, depth_estimator).unsqueeze(0).half().to('cuda') +``` +Now, we load the prior pipeline and the text-to-image controlnet pipeline + +```python +from diffusers import KandinskyV22PriorPipeline, KandinskyV22ControlnetPipeline + +pipe_prior = KandinskyV22PriorPipeline.from_pretrained('kandinsky-community/kandinsky-2-2-prior',torch_dtype=torch.float16) +pipe_prior = pipe_prior.to("cuda") + +pipe = KandinskyV22ControlnetPipeline.from_pretrained('kandinsky-community/kandinsky-2-2-controlnet-depth', torch_dtype=torch.float16) +pipe = pipe.to("cuda") +``` + +We pass the prompt and negative prompt through the prior to generate image embeddings + +```python +prompt = 'A robot, 4k photo' + +negative_prior_prompt ='lowres, text, error, cropped, worst quality, low quality, jpeg artifacts, ugly, duplicate, morbid, mutilated, out of frame, extra fingers, mutated hands, poorly drawn hands, poorly drawn face, mutation, deformed, blurry, dehydrated, bad anatomy, bad proportions, extra limbs, cloned face, disfigured, gross proportions, malformed limbs, missing arms, missing legs, extra arms, extra legs, fused fingers, too many fingers, long neck, username, watermark, signature' + +generator = torch.Generator(device='cuda').manual_seed(43) 
+image_emb, zero_image_emb = pipe_prior(
+    prompt=prompt,
+    negative_prompt=negative_prior_prompt,
+    generator=generator).to_tuple()
+```
+
+Now we can pass the image embeddings and the depth image we extracted to the controlnet pipeline. Remember that with Kandinsky 2.2, only prior pipelines accept `prompt` input. You do not need to pass the prompt to controlnet pipeline.
+
+```python
+images = pipe(
+    image_embeds=image_emb,
+    negative_image_embeds=zero_image_emb,
+    hint=hint,
+    num_inference_steps=50,
+    generator=generator,
+    height=768,
+    width=768).images
+
+images[0].save("robot_cat.png")
+```
+
+The output image looks as follows:
+![img](https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main/kandinskyv22/robot_cat_text2img.png)
+
+### Image-to-Image Generation with ControlNet Conditioning
+
+Kandinsky 2.2 also includes a [`KandinskyV22ControlnetImg2ImgPipeline`] that will allow you to add control to the image generation process with both the image and its depth map. This pipeline works really well with [`KandinskyV22PriorEmb2EmbPipeline`], which generates image embeddings based on both a text prompt and an image.
+
+For our robot cat example, we will pass the prompt and the cat image together to the prior pipeline to generate an image embedding. We will then use that image embedding and the depth map of the cat to further control the image generation process.
+
+We can use the same cat image and its depth map from the last example.
+
+```python
+import torch
+import numpy as np
+
+from diffusers import KandinskyV22PriorEmb2EmbPipeline, KandinskyV22ControlnetImg2ImgPipeline
+from transformers import pipeline
+
+pipe_prior = KandinskyV22PriorEmb2EmbPipeline.from_pretrained('kandinsky-community/kandinsky-2-2-prior',torch_dtype=torch.float16)
+pipe_prior = pipe_prior.to("cuda")
+
+pipe = KandinskyV22ControlnetImg2ImgPipeline.from_pretrained('kandinsky-community/kandinsky-2-2-controlnet-depth', torch_dtype=torch.float16)
+pipe = pipe.to("cuda")
+
+prompt = 'A robot, 4k photo'
+negative_prior_prompt ='lowres, text, error, cropped, worst quality, low quality, jpeg artifacts, ugly, duplicate, morbid, mutilated, out of frame, extra fingers, mutated hands, poorly drawn hands, poorly drawn face, mutation, deformed, blurry, dehydrated, bad anatomy, bad proportions, extra limbs, cloned face, disfigured, gross proportions, malformed limbs, missing arms, missing legs, extra arms, extra legs, fused fingers, too many fingers, long neck, username, watermark, signature'
+
+generator = torch.Generator(device='cuda').manual_seed(43)
+
+# run prior pipeline
+
+img_emb = pipe_prior(prompt=prompt, image=img, strength=0.85, generator=generator)
+negative_emb = pipe_prior(prompt=negative_prior_prompt, image=img, strength=1, generator=generator)
+
+# run controlnet img2img pipeline
+images = pipe(
+    image=img,
+    strength=0.5,
+    image_embeds=img_emb.image_embeds,
+    negative_image_embeds=negative_emb.image_embeds,
+    hint=hint,
+    num_inference_steps=50,
+    generator=generator,
+    height=768,
+    width=768).images
+
+images[0].save("robot_cat.png")
+```
+
+Here is the output. Compared with the output from our text-to-image controlnet example, it keeps a lot more of the cat's facial details from the original image and works them into the robot style we asked for.
+
+![img](https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main/kandinskyv22/robot_cat.png)
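The `strength` values used in the walkthrough above (0.85 and 1 for the prior, 0.5 for the decoder) control how much of the diffusion schedule is actually run. The snippet below is a small sketch of the `num_inference_steps * strength` truncation described by the `strength` docstrings and the `get_timesteps` helper touched earlier in this patch; the function name and printed values are illustrative only.

```python
def truncated_steps(num_inference_steps: int, strength: float):
    # strength == 1.0 runs the full schedule; smaller values skip the earliest,
    # noisiest steps so more of the input image (or embedding) is preserved.
    init_timestep = min(int(num_inference_steps * strength), num_inference_steps)
    t_start = max(num_inference_steps - init_timestep, 0)
    # A pipeline would then denoise over scheduler.timesteps[t_start:].
    return t_start, init_timestep


print(truncated_steps(50, 0.5))   # (25, 25): only the last 25 of 50 steps run
print(truncated_steps(50, 0.85))  # (8, 42): most of the schedule runs
```
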
+
+
 ## Optimization
 
 Running Kandinsky in inference requires running both a first prior pipeline: [`KandinskyPriorPipeline`]
@@ -337,29 +476,75 @@ feel free to have a look at [Our PyTorch 2.0 benchmark](https://huggingface.co/d
 
+## Kandinsky 2.1
 
-
-## KandinskyPriorPipeline
+### KandinskyPriorPipeline
 
 [[autodoc]] KandinskyPriorPipeline
     - all
     - __call__
     - interpolate
 
-## KandinskyPipeline
+### KandinskyPipeline
 
 [[autodoc]] KandinskyPipeline
     - all
     - __call__
 
-## KandinskyImg2ImgPipeline
+### KandinskyImg2ImgPipeline
 
 [[autodoc]] KandinskyImg2ImgPipeline
     - all
     - __call__
 
-## KandinskyInpaintPipeline
+### KandinskyInpaintPipeline
 
 [[autodoc]] KandinskyInpaintPipeline
     - all
     - __call__
+
+## Kandinsky 2.2
+
+### KandinskyV22Pipeline
+
+[[autodoc]] KandinskyV22Pipeline
+    - all
+    - __call__
+
+### KandinskyV22ControlnetPipeline
+
+[[autodoc]] KandinskyV22ControlnetPipeline
+    - all
+    - __call__
+
+### KandinskyV22ControlnetImg2ImgPipeline
+
+[[autodoc]] KandinskyV22ControlnetImg2ImgPipeline
+    - all
+    - __call__
+
+### KandinskyV22Img2ImgPipeline
+
+[[autodoc]] KandinskyV22Img2ImgPipeline
+    - all
+    - __call__
+
+### KandinskyV22InpaintPipeline
+
+[[autodoc]] KandinskyV22InpaintPipeline
+    - all
+    - __call__
+
+### KandinskyV22PriorPipeline
+
+[[autodoc]] KandinskyV22PriorPipeline
+    - all
+    - __call__
+    - interpolate
+
+### KandinskyV22PriorEmb2EmbPipeline
+
+[[autodoc]] KandinskyV22PriorEmb2EmbPipeline
+    - all
+    - __call__
+    - interpolate

From 883a852e0ae40f8c4b836fe2c6c4e567620029ef Mon Sep 17 00:00:00 2001
From: yiyixuxu
Date: Wed, 5 Jul 2023 05:23:54 +0000
Subject: [PATCH 26/51] style

---
 docs/source/en/api/pipelines/kandinsky.mdx | 93 ++++++++++++----------
 1 file changed, 51 insertions(+), 42 deletions(-)

diff --git a/docs/source/en/api/pipelines/kandinsky.mdx b/docs/source/en/api/pipelines/kandinsky.mdx
index 11aba14b3df2..6c30d2db8a05 100644
--- a/docs/source/en/api/pipelines/kandinsky.mdx
+++ b/docs/source/en/api/pipelines/kandinsky.mdx
@@ -299,10 +299,8 @@ First, let's take an image and extract its depth map.
 ```python
from diffusers.utils import load_image img = load_image( - "https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main" - "/kandinsky/cat.png" - ).resize((768, 768)) - + "https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main" "/kandinsky/cat.png" +).resize((768, 768)) ``` ![img](https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main/kandinskyv22/cat.png) @@ -315,55 +313,61 @@ import numpy as np from transformers import pipeline from diffusers.utils import load_image + def make_hint(image, depth_estimator): - image = depth_estimator(image)['depth'] - image = np.array(image) - image = image[:, :, None] - image = np.concatenate([image, image, image], axis=2) - detected_map = torch.from_numpy(image).float() / 255.0 - hint = detected_map.permute(2, 0, 1) - return hint - -depth_estimator = pipeline('depth-estimation') -hint = make_hint(img, depth_estimator).unsqueeze(0).half().to('cuda') + image = depth_estimator(image)["depth"] + image = np.array(image) + image = image[:, :, None] + image = np.concatenate([image, image, image], axis=2) + detected_map = torch.from_numpy(image).float() / 255.0 + hint = detected_map.permute(2, 0, 1) + return hint + + +depth_estimator = pipeline("depth-estimation") +hint = make_hint(img, depth_estimator).unsqueeze(0).half().to("cuda") ``` Now, we load the prior pipeline and the text-to-image controlnet pipeline ```python from diffusers import KandinskyV22PriorPipeline, KandinskyV22ControlnetPipeline -pipe_prior = KandinskyV22PriorPipeline.from_pretrained('kandinsky-community/kandinsky-2-2-prior',torch_dtype=torch.float16) +pipe_prior = KandinskyV22PriorPipeline.from_pretrained( + "kandinsky-community/kandinsky-2-2-prior", torch_dtype=torch.float16 +) pipe_prior = pipe_prior.to("cuda") -pipe = KandinskyV22ControlnetPipeline.from_pretrained('kandinsky-community/kandinsky-2-2-controlnet-depth', torch_dtype=torch.float16) +pipe = KandinskyV22ControlnetPipeline.from_pretrained( + "kandinsky-community/kandinsky-2-2-controlnet-depth", torch_dtype=torch.float16 +) pipe = pipe.to("cuda") ``` We pass the prompt and negative prompt through the prior to generate image embeddings ```python -prompt = 'A robot, 4k photo' +prompt = "A robot, 4k photo" -negative_prior_prompt ='lowres, text, error, cropped, worst quality, low quality, jpeg artifacts, ugly, duplicate, morbid, mutilated, out of frame, extra fingers, mutated hands, poorly drawn hands, poorly drawn face, mutation, deformed, blurry, dehydrated, bad anatomy, bad proportions, extra limbs, cloned face, disfigured, gross proportions, malformed limbs, missing arms, missing legs, extra arms, extra legs, fused fingers, too many fingers, long neck, username, watermark, signature' +negative_prior_prompt = "lowres, text, error, cropped, worst quality, low quality, jpeg artifacts, ugly, duplicate, morbid, mutilated, out of frame, extra fingers, mutated hands, poorly drawn hands, poorly drawn face, mutation, deformed, blurry, dehydrated, bad anatomy, bad proportions, extra limbs, cloned face, disfigured, gross proportions, malformed limbs, missing arms, missing legs, extra arms, extra legs, fused fingers, too many fingers, long neck, username, watermark, signature" -generator = torch.Generator(device='cuda').manual_seed(43) +generator = torch.Generator(device="cuda").manual_seed(43) image_emb, zero_image_emb = pipe_prior( - prompt=prompt, - negative_prompt=negative_prior_prompt, - generator=generator).to_tuple() + prompt=prompt, 
negative_prompt=negative_prior_prompt, generator=generator +).to_tuple() ``` Now we can pass the image embeddings and the depth image we extracted to the controlnet pipeline. Remember that with Kandinsky 2.2, only prior pipelines accept `prompt` input. You do not need to pass the prompt to controlnet pipeline. ```python images = pipe( - image_embeds=image_emb, - negative_image_embeds=zero_image_emb, - hint=hint, - num_inference_steps=50, + image_embeds=image_emb, + negative_image_embeds=zero_image_emb, + hint=hint, + num_inference_steps=50, generator=generator, - height=768, - width=768).images + height=768, + width=768, +).images images[0].save("robot_cat.png") ``` @@ -386,33 +390,38 @@ import numpy as np from diffusers import KandinskyV22PriorEmb2EmbPipeline, KandinskyV22ControlnetImg2ImgPipeline from transformers import pipeline -pipe_prior = KandinskyV22PriorEmb2EmbPipeline.from_pretrained('kandinsky-community/kandinsky-2-2-prior',torch_dtype=torch.float16) +pipe_prior = KandinskyV22PriorEmb2EmbPipeline.from_pretrained( + "kandinsky-community/kandinsky-2-2-prior", torch_dtype=torch.float16 +) pipe_prior = pipe_prior.to("cuda") -pipe = KandinskyV22ControlnetImg2ImgPipeline.from_pretrained('kandinsky-community/kandinsky-2-2-controlnet-depth', torch_dtype=torch.float16) +pipe = KandinskyV22ControlnetImg2ImgPipeline.from_pretrained( + "kandinsky-community/kandinsky-2-2-controlnet-depth", torch_dtype=torch.float16 +) pipe = pipe.to("cuda") -prompt = 'A robot, 4k photo' -negative_prior_prompt ='lowres, text, error, cropped, worst quality, low quality, jpeg artifacts, ugly, duplicate, morbid, mutilated, out of frame, extra fingers, mutated hands, poorly drawn hands, poorly drawn face, mutation, deformed, blurry, dehydrated, bad anatomy, bad proportions, extra limbs, cloned face, disfigured, gross proportions, malformed limbs, missing arms, missing legs, extra arms, extra legs, fused fingers, too many fingers, long neck, username, watermark, signature' +prompt = "A robot, 4k photo" +negative_prior_prompt = "lowres, text, error, cropped, worst quality, low quality, jpeg artifacts, ugly, duplicate, morbid, mutilated, out of frame, extra fingers, mutated hands, poorly drawn hands, poorly drawn face, mutation, deformed, blurry, dehydrated, bad anatomy, bad proportions, extra limbs, cloned face, disfigured, gross proportions, malformed limbs, missing arms, missing legs, extra arms, extra legs, fused fingers, too many fingers, long neck, username, watermark, signature" -generator = torch.Generator(device='cuda').manual_seed(43) +generator = torch.Generator(device="cuda").manual_seed(43) -# run prior pipeline +# run prior pipeline img_emb = pipe_prior(prompt=prompt, image=img, strength=0.85, generator=generator) negative_emb = pipe_prior(prompt=negative_prior_prompt, image=img, strength=1, generator=generator) # run controlnet img2img pipeline images = pipe( - image=img, - strength=0.5, - image_embeds=img_emb.image_embeds, - negative_image_embeds=negative_emb.image_embeds, - hint=hint, - num_inference_steps=50, + image=img, + strength=0.5, + image_embeds=img_emb.image_embeds, + negative_image_embeds=negative_emb.image_embeds, + hint=hint, + num_inference_steps=50, generator=generator, - height=768, - width=768).images + height=768, + width=768, +).images images[0].save("robot_cat.png") ``` From dda70dadc18d5ad14b0fa0ea682e0becf818da86 Mon Sep 17 00:00:00 2001 From: yiyixuxu Date: Wed, 5 Jul 2023 06:12:06 +0000 Subject: [PATCH 27/51] add a top about 2.2 --- docs/source/en/api/pipelines/kandinsky.mdx | 
25 +++++++++++++++++++++- 1 file changed, 24 insertions(+), 1 deletion(-) diff --git a/docs/source/en/api/pipelines/kandinsky.mdx b/docs/source/en/api/pipelines/kandinsky.mdx index 6c30d2db8a05..c85e7c2fa8a5 100644 --- a/docs/source/en/api/pipelines/kandinsky.mdx +++ b/docs/source/en/api/pipelines/kandinsky.mdx @@ -11,7 +11,7 @@ specific language governing permissions and limitations under the License. ## Overview -Kandinsky 2.1 inherits best practices from [DALL-E 2](https://arxiv.org/abs/2204.06125) and [Latent Diffusion](https://huggingface.co/docs/diffusers/api/pipelines/latent_diffusion), while introducing some new ideas. +Kandinsky inherits best practices from [DALL-E 2](https://arxiv.org/abs/2204.06125) and [Latent Diffusion](https://huggingface.co/docs/diffusers/api/pipelines/latent_diffusion), while introducing some new ideas. It uses [CLIP](https://huggingface.co/docs/transformers/model_doc/clip) for encoding images and text, and a diffusion image prior (mapping) between latent spaces of CLIP modalities. This approach enhances the visual performance of the model and unveils new horizons in blending images and text-guided image manipulation. @@ -117,6 +117,29 @@ One cheeseburger monster coming up! Enjoy! ![img](https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/kandinsky-docs/cheeseburger.png) + + +The Kandinsky 2.2 text-to-image pipeline no longer accept the `prompt` input and will condition the image generation process with only `image_embeds` and `negative_image_embeds`. To use Kandinsky2.2 text-to-image pipeline, you need to create your pipelines with Kandinsky 2.2 checkpoints + +```python +pipe_prior = DiffusionPipeline.from_pretrained("kandinsky-community/kandinsky-2-2-prior", torch_dtype=torch.float16) +pipe_prior.to("cuda") + +t2i_pipe = DiffusionPipeline.from_pretrained("kandinsky-community/kandinsky-2-2-decoder", torch_dtype=torch.float16) +t2i_pipe.to("cuda") +``` + +You can then use `pipe_prior` to generate image embeddings with exact same workflow as in 2.1, and pass these embeddings to the text-to-image pipeline. Remember that in 2.2 you don't need to pass the `prompt`. + +```python +image = t2i_pipe(image_embeds=image_embeds, negative_image_embeds=negative_image_embeds, height=768, width=768).images[ + 0 +] +image.save("cheeseburger_monster.png") +``` + + + The Kandinsky model works extremely well with creative prompts. Here is some of the amazing art that can be created using the exact same process but with different prompts. 
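The `image_embeds` and `negative_image_embeds` used by the 2.2 text-to-image call above come from the prior pipeline. As a minimal sketch (assuming the `pipe_prior` created in the snippet above and an arbitrary example prompt), the prior step would look roughly like this:

```python
# Hypothetical example prompt; any text prompt works here.
prompt = "red cat, 4k photo"

# The prior maps the prompt to CLIP image embeddings; calling it without a
# negative prompt also returns the unconditional (negative) embeddings.
image_embeds, negative_image_embeds = pipe_prior(prompt, guidance_scale=1.0).to_tuple()
```

These two tensors are the only conditioning the Kandinsky 2.2 decoder pipelines need.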
```python From 03cf7268a1aff6f4a91bffcc338487e2b614d4b7 Mon Sep 17 00:00:00 2001 From: YiYi Xu Date: Wed, 5 Jul 2023 15:26:34 -1000 Subject: [PATCH 28/51] Apply suggestions from code review Co-authored-by: Patrick von Platen --- src/diffusers/models/unet_2d_condition.py | 2 +- .../pipeline_kandinsky2_2_decoder.py | 26 +++++++++---------- 2 files changed, 14 insertions(+), 14 deletions(-) diff --git a/src/diffusers/models/unet_2d_condition.py b/src/diffusers/models/unet_2d_condition.py index be3d4da8b46b..85935c08dfeb 100644 --- a/src/diffusers/models/unet_2d_condition.py +++ b/src/diffusers/models/unet_2d_condition.py @@ -361,7 +361,7 @@ def __init__( # Kadinsky 2.2 self.add_embedding = ImageTimeEmbedding(image_embed_dim=encoder_hid_dim, time_embed_dim=time_embed_dim) elif addition_embed_type == "image_hint": - # Kadinsky 2.2 + # Kadinsky 2.2 ControlNet self.add_embedding = ImageHintTimeEmbedding(image_embed_dim=encoder_hid_dim, time_embed_dim=time_embed_dim) elif addition_embed_type is not None: raise ValueError(f"addition_embed_type: {addition_embed_type} must be None, 'text' or 'text_image'.") diff --git a/src/diffusers/pipelines/kandinsky2_2/pipeline_kandinsky2_2_decoder.py b/src/diffusers/pipelines/kandinsky2_2/pipeline_kandinsky2_2_decoder.py index 4df77fe565d9..1a74d667407a 100644 --- a/src/diffusers/pipelines/kandinsky2_2/pipeline_kandinsky2_2_decoder.py +++ b/src/diffusers/pipelines/kandinsky2_2/pipeline_kandinsky2_2_decoder.py @@ -57,14 +57,14 @@ """ -def get_new_h_w(h, w, scale_factor=8): - new_h = h // scale_factor**2 - if h % scale_factor**2 != 0: - new_h += 1 - new_w = w // scale_factor**2 - if w % scale_factor**2 != 0: - new_w += 1 - return new_h * scale_factor, new_w * scale_factor +def downscale_height_and_width(height, width, scale_factor=8): + new_height = height // scale_factor**2 + if height % scale_factor**2 != 0: + new_height += 1 + new_width = width // scale_factor**2 + if width % scale_factor**2 != 0: + new_width += 1 + return new_height * scale_factor, new_width * scale_factor class KandinskyV22Pipeline(DiffusionPipeline): @@ -79,7 +79,7 @@ class KandinskyV22Pipeline(DiffusionPipeline): A scheduler to be used in combination with `unet` to generate image latents. unet ([`UNet2DConditionModel`]): Conditional U-Net architecture to denoise the image embedding. - vae ([`VQModel`]): + movq ([`VQModel`]): MoVQ Decoder to generate the image from the latents. 
""" @@ -87,14 +87,14 @@ def __init__( self, unet: UNet2DConditionModel, scheduler: DDPMScheduler, - vae: VQModel, + movq: VQModel, ): super().__init__() self.register_modules( unet=unet, scheduler=scheduler, - vae=vae, + movq=movq, ) self.vae_scale_factor = 2 ** (len(self.vae.config.block_out_channels) - 1) @@ -124,7 +124,7 @@ def enable_sequential_cpu_offload(self, gpu_id=0): models = [ self.unet, - self.vae, + self.movq, ] for cpu_offloaded_model in models: if cpu_offloaded_model is not None: @@ -254,7 +254,7 @@ def __call__( num_channels_latents = self.unet.config.in_channels - height, width = get_new_h_w(height, width, self.vae_scale_factor) + height, width = downscale_height_and_width(height, width, self.vae_scale_factor) # create initial latent latents = self.prepare_latents( From 73d0f0bc16c5669fda402b17321245482940bbfb Mon Sep 17 00:00:00 2001 From: yiyixuxu Date: Thu, 6 Jul 2023 01:48:03 +0000 Subject: [PATCH 29/51] vae -> movq --- .../kandinsky2_2/pipeline_kandinsky2_2_decoder.py | 8 ++++---- tests/pipelines/kandinsky_v22/test_kandinsky.py | 2 +- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/src/diffusers/pipelines/kandinsky2_2/pipeline_kandinsky2_2_decoder.py b/src/diffusers/pipelines/kandinsky2_2/pipeline_kandinsky2_2_decoder.py index 1a74d667407a..663216e4607f 100644 --- a/src/diffusers/pipelines/kandinsky2_2/pipeline_kandinsky2_2_decoder.py +++ b/src/diffusers/pipelines/kandinsky2_2/pipeline_kandinsky2_2_decoder.py @@ -96,7 +96,7 @@ def __init__( scheduler=scheduler, movq=movq, ) - self.vae_scale_factor = 2 ** (len(self.vae.config.block_out_channels) - 1) + self.movq_scale_factor = 2 ** (len(self.movq.config.block_out_channels) - 1) def prepare_latents(self, shape, dtype, device, generator, latents, scheduler): if latents is None: @@ -149,7 +149,7 @@ def enable_model_cpu_offload(self, gpu_id=0): torch.cuda.empty_cache() # otherwise we don't see the memory savings (but they probably exist) hook = None - for cpu_offloaded_model in [self.unet, self.vae]: + for cpu_offloaded_model in [self.unet, self.movq]: _, hook = cpu_offload_with_hook(cpu_offloaded_model, device, prev_module_hook=hook) if self.safety_checker is not None: @@ -254,7 +254,7 @@ def __call__( num_channels_latents = self.unet.config.in_channels - height, width = downscale_height_and_width(height, width, self.vae_scale_factor) + height, width = downscale_height_and_width(height, width, self.movq_scale_factor) # create initial latent latents = self.prepare_latents( @@ -300,7 +300,7 @@ def __call__( generator=generator, )[0] # post-processing - image = self.vae.decode(latents, force_not_quantize=True)["sample"] + image = self.movq.decode(latents, force_not_quantize=True)["sample"] if output_type not in ["pt", "np", "pil"]: raise ValueError(f"Only the output types `pt`, `pil` and `np` are supported not output_type={output_type}") diff --git a/tests/pipelines/kandinsky_v22/test_kandinsky.py b/tests/pipelines/kandinsky_v22/test_kandinsky.py index ac970455b41b..162c96d4b3e2 100644 --- a/tests/pipelines/kandinsky_v22/test_kandinsky.py +++ b/tests/pipelines/kandinsky_v22/test_kandinsky.py @@ -141,7 +141,7 @@ def get_dummy_components(self): components = { "unet": unet, "scheduler": scheduler, - "vae": movq, + "movq": movq, } return components From 19e957489dcdaa8510bdcc9bb76c1de99d719919 Mon Sep 17 00:00:00 2001 From: yiyixuxu Date: Thu, 6 Jul 2023 02:42:06 +0000 Subject: [PATCH 30/51] vae -> movq --- ...ipeline_kandinsky2_2_decoder_controlnet.py | 33 ++++++++------- 
...kandinsky2_2_decoder_controlnet_img2img.py | 41 +++++++++--------- .../pipeline_kandinsky2_2_decoder_img2img.py | 42 +++++++++---------- ...ipeline_kandinsky2_2_decoder_inpainting.py | 38 ++++++++--------- .../pipelines/kandinsky_v22/test_kandinsky.py | 4 +- .../test_kandinsky_controlnet.py | 6 ++- .../test_kandinsky_controlnet_img2img.py | 6 ++- .../kandinsky_v22/test_kandinsky_img2img.py | 6 ++- .../kandinsky_v22/test_kandinsky_inpaint.py | 6 ++- 9 files changed, 97 insertions(+), 85 deletions(-) diff --git a/src/diffusers/pipelines/kandinsky2_2/pipeline_kandinsky2_2_decoder_controlnet.py b/src/diffusers/pipelines/kandinsky2_2/pipeline_kandinsky2_2_decoder_controlnet.py index e3c5317312f1..17395a875ac8 100644 --- a/src/diffusers/pipelines/kandinsky2_2/pipeline_kandinsky2_2_decoder_controlnet.py +++ b/src/diffusers/pipelines/kandinsky2_2/pipeline_kandinsky2_2_decoder_controlnet.py @@ -96,14 +96,15 @@ """ -def get_new_h_w(h, w, scale_factor=8): - new_h = h // scale_factor**2 - if h % scale_factor**2 != 0: - new_h += 1 - new_w = w // scale_factor**2 - if w % scale_factor**2 != 0: - new_w += 1 - return new_h * scale_factor, new_w * scale_factor +# Copied from diffusers.pipelines.kandinsky2_2.pipelines_kandinsky_2_2_decoder.KandinskyV22Pipeline.downscale_height_and_width +def downscale_height_and_width(height, width, scale_factor=8): + new_height = height // scale_factor**2 + if height % scale_factor**2 != 0: + new_height += 1 + new_width = width // scale_factor**2 + if width % scale_factor**2 != 0: + new_width += 1 + return new_height * scale_factor, new_width * scale_factor class KandinskyV22ControlnetPipeline(DiffusionPipeline): @@ -126,16 +127,16 @@ def __init__( self, unet: UNet2DConditionModel, scheduler: DDPMScheduler, - vae: VQModel, + movq: VQModel, ): super().__init__() self.register_modules( unet=unet, scheduler=scheduler, - vae=vae, + movq=movq, ) - self.vae_scale_factor = 2 ** (len(self.vae.config.block_out_channels) - 1) + self.movq_scale_factor = 2 ** (len(self.movq.config.block_out_channels) - 1) def prepare_latents(self, shape, dtype, device, generator, latents, scheduler): if latents is None: @@ -163,7 +164,7 @@ def enable_sequential_cpu_offload(self, gpu_id=0): models = [ self.unet, - self.vae, + self.movq, ] for cpu_offloaded_model in models: if cpu_offloaded_model is not None: @@ -188,7 +189,7 @@ def enable_model_cpu_offload(self, gpu_id=0): torch.cuda.empty_cache() # otherwise we don't see the memory savings (but they probably exist) hook = None - for cpu_offloaded_model in [self.unet, self.vae]: + for cpu_offloaded_model in [self.unet, self.movq]: _, hook = cpu_offload_with_hook(cpu_offloaded_model, device, prev_module_hook=hook) if self.safety_checker is not None: @@ -304,9 +305,9 @@ def __call__( self.scheduler.set_timesteps(num_inference_steps, device=device) timesteps_tensor = self.scheduler.timesteps - num_channels_latents = self.vae.config.latent_channels + num_channels_latents = self.movq.config.latent_channels - height, width = get_new_h_w(height, width, self.vae_scale_factor) + height, width = downscale_height_and_width(height, width, self.movq_scale_factor) # create initial latent latents = self.prepare_latents( @@ -352,7 +353,7 @@ def __call__( generator=generator, )[0] # post-processing - image = self.vae.decode(latents, force_not_quantize=True)["sample"] + image = self.movq.decode(latents, force_not_quantize=True)["sample"] if output_type not in ["pt", "np", "pil"]: raise ValueError(f"Only the output types `pt`, `pil` and `np` are supported not 
output_type={output_type}") diff --git a/src/diffusers/pipelines/kandinsky2_2/pipeline_kandinsky2_2_decoder_controlnet_img2img.py b/src/diffusers/pipelines/kandinsky2_2/pipeline_kandinsky2_2_decoder_controlnet_img2img.py index 656f0bd1e634..6ee48226bfea 100644 --- a/src/diffusers/pipelines/kandinsky2_2/pipeline_kandinsky2_2_decoder_controlnet_img2img.py +++ b/src/diffusers/pipelines/kandinsky2_2/pipeline_kandinsky2_2_decoder_controlnet_img2img.py @@ -100,14 +100,15 @@ """ -def get_new_h_w(h, w, scale_factor=8): - new_h = h // scale_factor**2 - if h % scale_factor**2 != 0: - new_h += 1 - new_w = w // scale_factor**2 - if w % scale_factor**2 != 0: - new_w += 1 - return new_h * scale_factor, new_w * scale_factor +# Copied from diffusers.pipelines.kandinsky2_2.pipelines_kandinsky_2_2_decoder.KandinskyV22Pipeline.downscale_height_and_width +def downscale_height_and_width(height, width, scale_factor=8): + new_height = height // scale_factor**2 + if height % scale_factor**2 != 0: + new_height += 1 + new_width = width // scale_factor**2 + if width % scale_factor**2 != 0: + new_width += 1 + return new_height * scale_factor, new_width * scale_factor def prepare_image(pil_image, w=512, h=512): @@ -131,7 +132,7 @@ class KandinskyV22ControlnetImg2ImgPipeline(DiffusionPipeline): A scheduler to be used in combination with `unet` to generate image latents. unet ([`UNet2DConditionModel`]): Conditional U-Net architecture to denoise the image embedding. - vae ([`VQModel`]): + movq ([`VQModel`]): MoVQ Decoder to generate the image from the latents. """ @@ -139,16 +140,16 @@ def __init__( self, unet: UNet2DConditionModel, scheduler: DDPMScheduler, - vae: VQModel, + movq: VQModel, ): super().__init__() self.register_modules( unet=unet, scheduler=scheduler, - vae=vae, + movq=movq, ) - self.vae_scale_factor = 2 ** (len(self.vae.config.block_out_channels) - 1) + self.movq_scale_factor = 2 ** (len(self.movq.config.block_out_channels) - 1) def get_timesteps(self, num_inference_steps, strength, device): # get the original timestep using init_timestep @@ -181,13 +182,13 @@ def prepare_latents(self, image, timestep, batch_size, num_images_per_prompt, dt elif isinstance(generator, list): init_latents = [ - self.vae.encode(image[i : i + 1]).latent_dist.sample(generator[i]) for i in range(batch_size) + self.movq.encode(image[i : i + 1]).latent_dist.sample(generator[i]) for i in range(batch_size) ] init_latents = torch.cat(init_latents, dim=0) else: - init_latents = self.vae.encode(image).latent_dist.sample(generator) + init_latents = self.movq.encode(image).latent_dist.sample(generator) - init_latents = self.vae.config.scaling_factor * init_latents + init_latents = self.movq.config.scaling_factor * init_latents init_latents = torch.cat([init_latents], dim=0) @@ -215,7 +216,7 @@ def enable_sequential_cpu_offload(self, gpu_id=0): models = [ self.unet, - self.vae, + self.movq, ] for cpu_offloaded_model in models: if cpu_offloaded_model is not None: @@ -240,7 +241,7 @@ def enable_model_cpu_offload(self, gpu_id=0): torch.cuda.empty_cache() # otherwise we don't see the memory savings (but they probably exist) hook = None - for cpu_offloaded_model in [self.unet, self.vae]: + for cpu_offloaded_model in [self.unet, self.movq]: _, hook = cpu_offload_with_hook(cpu_offloaded_model, device, prev_module_hook=hook) if self.safety_checker is not None: @@ -366,12 +367,12 @@ def __call__( image = torch.cat([prepare_image(i, width, height) for i in image], dim=0) image = image.to(dtype=image_embeds.dtype, device=device) - latents = 
self.vae.encode(image)["latents"] + latents = self.movq.encode(image)["latents"] latents = latents.repeat_interleave(num_images_per_prompt, dim=0) self.scheduler.set_timesteps(num_inference_steps, device=device) timesteps, num_inference_steps = self.get_timesteps(num_inference_steps, strength, device) latent_timestep = timesteps[:1].repeat(batch_size * num_images_per_prompt) - height, width = get_new_h_w(height, width, self.vae_scale_factor) + height, width = downscale_height_and_width(height, width, self.movq_scale_factor) latents = self.prepare_latents( latents, latent_timestep, batch_size, num_images_per_prompt, image_embeds.dtype, device, generator ) @@ -411,7 +412,7 @@ def __call__( )[0] # post-processing - image = self.vae.decode(latents, force_not_quantize=True)["sample"] + image = self.movq.decode(latents, force_not_quantize=True)["sample"] if output_type not in ["pt", "np", "pil"]: raise ValueError(f"Only the output types `pt`, `pil` and `np` are supported not output_type={output_type}") diff --git a/src/diffusers/pipelines/kandinsky2_2/pipeline_kandinsky2_2_decoder_img2img.py b/src/diffusers/pipelines/kandinsky2_2/pipeline_kandinsky2_2_decoder_img2img.py index c0a8149f8a35..694016611f60 100644 --- a/src/diffusers/pipelines/kandinsky2_2/pipeline_kandinsky2_2_decoder_img2img.py +++ b/src/diffusers/pipelines/kandinsky2_2/pipeline_kandinsky2_2_decoder_img2img.py @@ -73,15 +73,15 @@ ``` """ - -def get_new_h_w(h, w, scale_factor=8): - new_h = h // scale_factor**2 - if h % scale_factor**2 != 0: - new_h += 1 - new_w = w // scale_factor**2 - if w % scale_factor**2 != 0: - new_w += 1 - return new_h * scale_factor, new_w * scale_factor +# Copied from diffusers.pipelines.kandinsky2_2.pipelines_kandinsky_2_2_decoder.KandinskyV22Pipeline.downscale_height_and_width +def downscale_height_and_width(height, width, scale_factor=8): + new_height = height // scale_factor**2 + if height % scale_factor**2 != 0: + new_height += 1 + new_width = width // scale_factor**2 + if width % scale_factor**2 != 0: + new_width += 1 + return new_height * scale_factor, new_width * scale_factor def prepare_image(pil_image, w=512, h=512): @@ -105,7 +105,7 @@ class KandinskyV22Img2ImgPipeline(DiffusionPipeline): A scheduler to be used in combination with `unet` to generate image latents. unet ([`UNet2DConditionModel`]): Conditional U-Net architecture to denoise the image embedding. - vae ([`VQModel`]): + movq ([`VQModel`]): MoVQ Decoder to generate the image from the latents. 
""" @@ -113,16 +113,16 @@ def __init__( self, unet: UNet2DConditionModel, scheduler: DDPMScheduler, - vae: VQModel, + movq: VQModel, ): super().__init__() self.register_modules( unet=unet, scheduler=scheduler, - vae=vae, + movq=movq, ) - self.vae_scale_factor = 2 ** (len(self.vae.config.block_out_channels) - 1) + self.movq_scale_factor = 2 ** (len(self.movq.config.block_out_channels) - 1) def get_timesteps(self, num_inference_steps, strength, device): # get the original timestep using init_timestep @@ -155,13 +155,13 @@ def prepare_latents(self, image, timestep, batch_size, num_images_per_prompt, dt elif isinstance(generator, list): init_latents = [ - self.vae.encode(image[i : i + 1]).latent_dist.sample(generator[i]) for i in range(batch_size) + self.movq.encode(image[i : i + 1]).latent_dist.sample(generator[i]) for i in range(batch_size) ] init_latents = torch.cat(init_latents, dim=0) else: - init_latents = self.vae.encode(image).latent_dist.sample(generator) + init_latents = self.movq.encode(image).latent_dist.sample(generator) - init_latents = self.vae.config.scaling_factor * init_latents + init_latents = self.movq.config.scaling_factor * init_latents init_latents = torch.cat([init_latents], dim=0) @@ -190,7 +190,7 @@ def enable_sequential_cpu_offload(self, gpu_id=0): models = [ self.unet, - self.vae, + self.movq, ] for cpu_offloaded_model in models: if cpu_offloaded_model is not None: @@ -215,7 +215,7 @@ def enable_model_cpu_offload(self, gpu_id=0): torch.cuda.empty_cache() # otherwise we don't see the memory savings (but they probably exist) hook = None - for cpu_offloaded_model in [self.unet, self.vae]: + for cpu_offloaded_model in [self.unet, self.movq]: _, hook = cpu_offload_with_hook(cpu_offloaded_model, device, prev_module_hook=hook) if self.safety_checker is not None: @@ -333,12 +333,12 @@ def __call__( image = torch.cat([prepare_image(i, width, height) for i in image], dim=0) image = image.to(dtype=image_embeds.dtype, device=device) - latents = self.vae.encode(image)["latents"] + latents = self.movq.encode(image)["latents"] latents = latents.repeat_interleave(num_images_per_prompt, dim=0) self.scheduler.set_timesteps(num_inference_steps, device=device) timesteps, num_inference_steps = self.get_timesteps(num_inference_steps, strength, device) latent_timestep = timesteps[:1].repeat(batch_size * num_images_per_prompt) - height, width = get_new_h_w(height, width, self.vae_scale_factor) + height, width = downscale_height_and_width(height, width, self.movq_scale_factor) latents = self.prepare_latents( latents, latent_timestep, batch_size, num_images_per_prompt, image_embeds.dtype, device, generator ) @@ -377,7 +377,7 @@ def __call__( )[0] # post-processing - image = self.vae.decode(latents, force_not_quantize=True)["sample"] + image = self.movq.decode(latents, force_not_quantize=True)["sample"] if output_type not in ["pt", "np", "pil"]: raise ValueError(f"Only the output types `pt`, `pil` and `np` are supported not output_type={output_type}") diff --git a/src/diffusers/pipelines/kandinsky2_2/pipeline_kandinsky2_2_decoder_inpainting.py b/src/diffusers/pipelines/kandinsky2_2/pipeline_kandinsky2_2_decoder_inpainting.py index 881974d1fc64..3fcb574e3d77 100644 --- a/src/diffusers/pipelines/kandinsky2_2/pipeline_kandinsky2_2_decoder_inpainting.py +++ b/src/diffusers/pipelines/kandinsky2_2/pipeline_kandinsky2_2_decoder_inpainting.py @@ -80,15 +80,15 @@ ``` """ - -def get_new_h_w(h, w, scale_factor=8): - new_h = h // scale_factor**2 - if h % scale_factor**2 != 0: - new_h += 1 - new_w = w // 
scale_factor**2 - if w % scale_factor**2 != 0: - new_w += 1 - return new_h * scale_factor, new_w * scale_factor +# Copied from diffusers.pipelines.kandinsky2_2.pipelines_kandinsky_2_2_decoder.KandinskyV22Pipeline.downscale_height_and_width +def downscale_height_and_width(height, width, scale_factor=8): + new_height = height // scale_factor**2 + if height % scale_factor**2 != 0: + new_height += 1 + new_width = width // scale_factor**2 + if width % scale_factor**2 != 0: + new_width += 1 + return new_height * scale_factor, new_width * scale_factor def prepare_mask(masks): @@ -243,7 +243,7 @@ class KandinskyV22InpaintPipeline(DiffusionPipeline): A scheduler to be used in combination with `unet` to generate image latents. unet ([`UNet2DConditionModel`]): Conditional U-Net architecture to denoise the image embedding. - vae ([`VQModel`]): + movq ([`VQModel`]): MoVQ Decoder to generate the image from the latents. """ @@ -251,16 +251,16 @@ def __init__( self, unet: UNet2DConditionModel, scheduler: DDPMScheduler, - vae: VQModel, + movq: VQModel, ): super().__init__() self.register_modules( unet=unet, scheduler=scheduler, - vae=vae, + movq=movq, ) - self.vae_scale_factor = 2 ** (len(self.vae.config.block_out_channels) - 1) + self.movq_scale_factor = 2 ** (len(self.movq.config.block_out_channels) - 1) def prepare_latents(self, shape, dtype, device, generator, latents, scheduler): if latents is None: @@ -288,7 +288,7 @@ def enable_sequential_cpu_offload(self, gpu_id=0): models = [ self.unet, - self.vae, + self.movq, ] for cpu_offloaded_model in models: if cpu_offloaded_model is not None: @@ -313,7 +313,7 @@ def enable_model_cpu_offload(self, gpu_id=0): torch.cuda.empty_cache() # otherwise we don't see the memory savings (but they probably exist) hook = None - for cpu_offloaded_model in [self.unet, self.vae]: + for cpu_offloaded_model in [self.unet, self.movq]: _, hook = cpu_offload_with_hook(cpu_offloaded_model, device, prev_module_hook=hook) if self.safety_checker is not None: @@ -430,7 +430,7 @@ def __call__( mask_image, image = prepare_mask_and_masked_image(image, mask_image, height, width) image = image.to(dtype=image_embeds.dtype, device=device) - image = self.vae.encode(image)["latents"] + image = self.movq.encode(image)["latents"] mask_image = mask_image.to(dtype=image_embeds.dtype, device=device) @@ -449,9 +449,9 @@ def __call__( mask_image = mask_image.repeat(2, 1, 1, 1) masked_image = masked_image.repeat(2, 1, 1, 1) - num_channels_latents = self.vae.config.latent_channels + num_channels_latents = self.movq.config.latent_channels - height, width = get_new_h_w(height, width, self.vae_scale_factor) + height, width = downscale_height_and_width(height, width, self.movq_scale_factor) # create initial latent latents = self.prepare_latents( @@ -509,7 +509,7 @@ def __call__( latents = init_mask * init_latents_proper + (1 - init_mask) * latents # post-processing latents = mask_image[:1] * image[:1] + (1 - mask_image[:1]) * latents - image = self.vae.decode(latents, force_not_quantize=True)["sample"] + image = self.movq.decode(latents, force_not_quantize=True)["sample"] if output_type not in ["pt", "np", "pil"]: raise ValueError(f"Only the output types `pt`, `pil` and `np` are supported not output_type={output_type}") diff --git a/tests/pipelines/kandinsky_v22/test_kandinsky.py b/tests/pipelines/kandinsky_v22/test_kandinsky.py index 162c96d4b3e2..bc79f5071857 100644 --- a/tests/pipelines/kandinsky_v22/test_kandinsky.py +++ b/tests/pipelines/kandinsky_v22/test_kandinsky.py @@ -223,7 +223,9 @@ def 
test_kandinsky_text2img(self): pipe_prior.to(torch_device) pipeline = KandinskyV22Pipeline.from_pretrained( - "kandinsky-community/kandinsky-2-2-decoder", torch_dtype=torch.float16 + # YiYi notes: for testing only, remember to send a PR to kandinsky repo and remove this before merge + "/home/yiyi_huggingface_co/model_repo/kandinsky-2-2-decoder", torch_dtype=torch.float16 + #"kandinsky-community/kandinsky-2-2-decoder", torch_dtype=torch.float16 ) pipeline = pipeline.to(torch_device) pipeline.set_progress_bar_config(disable=None) diff --git a/tests/pipelines/kandinsky_v22/test_kandinsky_controlnet.py b/tests/pipelines/kandinsky_v22/test_kandinsky_controlnet.py index 97e57de34c85..3fbec07f471e 100644 --- a/tests/pipelines/kandinsky_v22/test_kandinsky_controlnet.py +++ b/tests/pipelines/kandinsky_v22/test_kandinsky_controlnet.py @@ -146,7 +146,7 @@ def get_dummy_components(self): components = { "unet": unet, "scheduler": scheduler, - "vae": movq, + "movq": movq, } return components @@ -240,7 +240,9 @@ def test_kandinsky_controlnet(self): pipe_prior.to(torch_device) pipeline = KandinskyV22ControlnetPipeline.from_pretrained( - "kandinsky-community/kandinsky-2-2-controlnet-depth", torch_dtype=torch.float16 + # YiYi notes: for testing only, remember to send a PR to kandinsky repo and remove this before merge + "/home/yiyi_huggingface_co/model_repo/kandinsky-2-2-controlnet-depth", torch_dtype=torch.float16 + #"kandinsky-community/kandinsky-2-2-controlnet-depth", torch_dtype=torch.float16 ) pipeline = pipeline.to(torch_device) pipeline.set_progress_bar_config(disable=None) diff --git a/tests/pipelines/kandinsky_v22/test_kandinsky_controlnet_img2img.py b/tests/pipelines/kandinsky_v22/test_kandinsky_controlnet_img2img.py index 3b717c68a3d0..26ff4c2ad9ed 100644 --- a/tests/pipelines/kandinsky_v22/test_kandinsky_controlnet_img2img.py +++ b/tests/pipelines/kandinsky_v22/test_kandinsky_controlnet_img2img.py @@ -149,7 +149,7 @@ def get_dummy_components(self): components = { "unet": unet, "scheduler": scheduler, - "vae": movq, + "movq": movq, } return components @@ -254,7 +254,9 @@ def test_kandinsky_controlnet_img2img(self): pipe_prior.to(torch_device) pipeline = KandinskyV22ControlnetImg2ImgPipeline.from_pretrained( - "kandinsky-community/kandinsky-2-2-decoder", torch_dtype=torch.float16 + # YiYi notes: for testing only, remember to send a PR to kandinsky repo and remove this before merge + "/home/yiyi_huggingface_co/model_repo/kandinsky-2-2-controlnet-depth", torch_dtype=torch.float16 + #"kandinsky-community/kandinsky-2-2-controlnet-depth", torch_dtype=torch.float16 ) pipeline = pipeline.to(torch_device) diff --git a/tests/pipelines/kandinsky_v22/test_kandinsky_img2img.py b/tests/pipelines/kandinsky_v22/test_kandinsky_img2img.py index 4261b80dd20b..60f1604ecaf3 100644 --- a/tests/pipelines/kandinsky_v22/test_kandinsky_img2img.py +++ b/tests/pipelines/kandinsky_v22/test_kandinsky_img2img.py @@ -151,7 +151,7 @@ def get_dummy_components(self): components = { "unet": unet, "scheduler": scheduler, - "vae": movq, + "movq": movq, } return components @@ -244,7 +244,9 @@ def test_kandinsky_img2img(self): pipe_prior.to(torch_device) pipeline = KandinskyV22Img2ImgPipeline.from_pretrained( - "kandinsky-community/kandinsky-2-2-decoder", torch_dtype=torch.float16 + # YiYi notes: for testing only, remember to send a PR to kandinsky repo and remove this before merge + "/home/yiyi_huggingface_co/model_repo/kandinsky-2-2-decoder", torch_dtype=torch.float16 + #"kandinsky-community/kandinsky-2-2-decoder", 
torch_dtype=torch.float16 ) pipeline = pipeline.to(torch_device) diff --git a/tests/pipelines/kandinsky_v22/test_kandinsky_inpaint.py b/tests/pipelines/kandinsky_v22/test_kandinsky_inpaint.py index 01692dac0bb4..c2051c670ae2 100644 --- a/tests/pipelines/kandinsky_v22/test_kandinsky_inpaint.py +++ b/tests/pipelines/kandinsky_v22/test_kandinsky_inpaint.py @@ -150,7 +150,7 @@ def get_dummy_components(self): components = { "unet": unet, "scheduler": scheduler, - "vae": movq, + "movq": movq, } return components @@ -255,7 +255,9 @@ def test_kandinsky_inpaint(self): pipe_prior.to(torch_device) pipeline = KandinskyV22InpaintPipeline.from_pretrained( - "kandinsky-community/kandinsky-2-2-decoder-inpaint", torch_dtype=torch.float16 + # YiYi notes: for testing only, remember to send a PR to kandinsky repo and remove this before merge + "/home/yiyi_huggingface_co/model_repo/kandinsky-2-2-decoder-inpaint", torch_dtype=torch.float16 + #"kandinsky-community/kandinsky-2-2-decoder-inpaint", torch_dtype=torch.float16 ) pipeline = pipeline.to(torch_device) pipeline.set_progress_bar_config(disable=None) From ce3f7c2c2eb4975df3ca6d06b902c553a77662c8 Mon Sep 17 00:00:00 2001 From: yiyixuxu Date: Thu, 6 Jul 2023 02:43:14 +0000 Subject: [PATCH 31/51] style --- .../kandinsky2_2/pipeline_kandinsky2_2_decoder_img2img.py | 1 + .../pipeline_kandinsky2_2_decoder_inpainting.py | 1 + tests/pipelines/kandinsky_v22/test_kandinsky.py | 7 ++++--- tests/pipelines/kandinsky_v22/test_kandinsky_controlnet.py | 7 ++++--- .../kandinsky_v22/test_kandinsky_controlnet_img2img.py | 7 ++++--- tests/pipelines/kandinsky_v22/test_kandinsky_img2img.py | 7 ++++--- tests/pipelines/kandinsky_v22/test_kandinsky_inpaint.py | 7 ++++--- 7 files changed, 22 insertions(+), 15 deletions(-) diff --git a/src/diffusers/pipelines/kandinsky2_2/pipeline_kandinsky2_2_decoder_img2img.py b/src/diffusers/pipelines/kandinsky2_2/pipeline_kandinsky2_2_decoder_img2img.py index 694016611f60..3ff1de1b9807 100644 --- a/src/diffusers/pipelines/kandinsky2_2/pipeline_kandinsky2_2_decoder_img2img.py +++ b/src/diffusers/pipelines/kandinsky2_2/pipeline_kandinsky2_2_decoder_img2img.py @@ -73,6 +73,7 @@ ``` """ + # Copied from diffusers.pipelines.kandinsky2_2.pipelines_kandinsky_2_2_decoder.KandinskyV22Pipeline.downscale_height_and_width def downscale_height_and_width(height, width, scale_factor=8): new_height = height // scale_factor**2 diff --git a/src/diffusers/pipelines/kandinsky2_2/pipeline_kandinsky2_2_decoder_inpainting.py b/src/diffusers/pipelines/kandinsky2_2/pipeline_kandinsky2_2_decoder_inpainting.py index 3fcb574e3d77..1a45b806459a 100644 --- a/src/diffusers/pipelines/kandinsky2_2/pipeline_kandinsky2_2_decoder_inpainting.py +++ b/src/diffusers/pipelines/kandinsky2_2/pipeline_kandinsky2_2_decoder_inpainting.py @@ -80,6 +80,7 @@ ``` """ + # Copied from diffusers.pipelines.kandinsky2_2.pipelines_kandinsky_2_2_decoder.KandinskyV22Pipeline.downscale_height_and_width def downscale_height_and_width(height, width, scale_factor=8): new_height = height // scale_factor**2 diff --git a/tests/pipelines/kandinsky_v22/test_kandinsky.py b/tests/pipelines/kandinsky_v22/test_kandinsky.py index bc79f5071857..fcecf15fa567 100644 --- a/tests/pipelines/kandinsky_v22/test_kandinsky.py +++ b/tests/pipelines/kandinsky_v22/test_kandinsky.py @@ -223,9 +223,10 @@ def test_kandinsky_text2img(self): pipe_prior.to(torch_device) pipeline = KandinskyV22Pipeline.from_pretrained( - # YiYi notes: for testing only, remember to send a PR to kandinsky repo and remove this before merge - 
"/home/yiyi_huggingface_co/model_repo/kandinsky-2-2-decoder", torch_dtype=torch.float16 - #"kandinsky-community/kandinsky-2-2-decoder", torch_dtype=torch.float16 + # YiYi notes: for testing only, remember to send a PR to kandinsky repo and remove this before merge + "/home/yiyi_huggingface_co/model_repo/kandinsky-2-2-decoder", + torch_dtype=torch.float16 + # "kandinsky-community/kandinsky-2-2-decoder", torch_dtype=torch.float16 ) pipeline = pipeline.to(torch_device) pipeline.set_progress_bar_config(disable=None) diff --git a/tests/pipelines/kandinsky_v22/test_kandinsky_controlnet.py b/tests/pipelines/kandinsky_v22/test_kandinsky_controlnet.py index 3fbec07f471e..b30210697bb1 100644 --- a/tests/pipelines/kandinsky_v22/test_kandinsky_controlnet.py +++ b/tests/pipelines/kandinsky_v22/test_kandinsky_controlnet.py @@ -240,9 +240,10 @@ def test_kandinsky_controlnet(self): pipe_prior.to(torch_device) pipeline = KandinskyV22ControlnetPipeline.from_pretrained( - # YiYi notes: for testing only, remember to send a PR to kandinsky repo and remove this before merge - "/home/yiyi_huggingface_co/model_repo/kandinsky-2-2-controlnet-depth", torch_dtype=torch.float16 - #"kandinsky-community/kandinsky-2-2-controlnet-depth", torch_dtype=torch.float16 + # YiYi notes: for testing only, remember to send a PR to kandinsky repo and remove this before merge + "/home/yiyi_huggingface_co/model_repo/kandinsky-2-2-controlnet-depth", + torch_dtype=torch.float16 + # "kandinsky-community/kandinsky-2-2-controlnet-depth", torch_dtype=torch.float16 ) pipeline = pipeline.to(torch_device) pipeline.set_progress_bar_config(disable=None) diff --git a/tests/pipelines/kandinsky_v22/test_kandinsky_controlnet_img2img.py b/tests/pipelines/kandinsky_v22/test_kandinsky_controlnet_img2img.py index 26ff4c2ad9ed..a917e3dbf901 100644 --- a/tests/pipelines/kandinsky_v22/test_kandinsky_controlnet_img2img.py +++ b/tests/pipelines/kandinsky_v22/test_kandinsky_controlnet_img2img.py @@ -254,9 +254,10 @@ def test_kandinsky_controlnet_img2img(self): pipe_prior.to(torch_device) pipeline = KandinskyV22ControlnetImg2ImgPipeline.from_pretrained( - # YiYi notes: for testing only, remember to send a PR to kandinsky repo and remove this before merge - "/home/yiyi_huggingface_co/model_repo/kandinsky-2-2-controlnet-depth", torch_dtype=torch.float16 - #"kandinsky-community/kandinsky-2-2-controlnet-depth", torch_dtype=torch.float16 + # YiYi notes: for testing only, remember to send a PR to kandinsky repo and remove this before merge + "/home/yiyi_huggingface_co/model_repo/kandinsky-2-2-controlnet-depth", + torch_dtype=torch.float16 + # "kandinsky-community/kandinsky-2-2-controlnet-depth", torch_dtype=torch.float16 ) pipeline = pipeline.to(torch_device) diff --git a/tests/pipelines/kandinsky_v22/test_kandinsky_img2img.py b/tests/pipelines/kandinsky_v22/test_kandinsky_img2img.py index 60f1604ecaf3..9fc40fe7bc48 100644 --- a/tests/pipelines/kandinsky_v22/test_kandinsky_img2img.py +++ b/tests/pipelines/kandinsky_v22/test_kandinsky_img2img.py @@ -244,9 +244,10 @@ def test_kandinsky_img2img(self): pipe_prior.to(torch_device) pipeline = KandinskyV22Img2ImgPipeline.from_pretrained( - # YiYi notes: for testing only, remember to send a PR to kandinsky repo and remove this before merge - "/home/yiyi_huggingface_co/model_repo/kandinsky-2-2-decoder", torch_dtype=torch.float16 - #"kandinsky-community/kandinsky-2-2-decoder", torch_dtype=torch.float16 + # YiYi notes: for testing only, remember to send a PR to kandinsky repo and remove this before merge + 
"/home/yiyi_huggingface_co/model_repo/kandinsky-2-2-decoder", + torch_dtype=torch.float16 + # "kandinsky-community/kandinsky-2-2-decoder", torch_dtype=torch.float16 ) pipeline = pipeline.to(torch_device) diff --git a/tests/pipelines/kandinsky_v22/test_kandinsky_inpaint.py b/tests/pipelines/kandinsky_v22/test_kandinsky_inpaint.py index c2051c670ae2..fe32f44a4a39 100644 --- a/tests/pipelines/kandinsky_v22/test_kandinsky_inpaint.py +++ b/tests/pipelines/kandinsky_v22/test_kandinsky_inpaint.py @@ -255,9 +255,10 @@ def test_kandinsky_inpaint(self): pipe_prior.to(torch_device) pipeline = KandinskyV22InpaintPipeline.from_pretrained( - # YiYi notes: for testing only, remember to send a PR to kandinsky repo and remove this before merge - "/home/yiyi_huggingface_co/model_repo/kandinsky-2-2-decoder-inpaint", torch_dtype=torch.float16 - #"kandinsky-community/kandinsky-2-2-decoder-inpaint", torch_dtype=torch.float16 + # YiYi notes: for testing only, remember to send a PR to kandinsky repo and remove this before merge + "/home/yiyi_huggingface_co/model_repo/kandinsky-2-2-decoder-inpaint", + torch_dtype=torch.float16 + # "kandinsky-community/kandinsky-2-2-decoder-inpaint", torch_dtype=torch.float16 ) pipeline = pipeline.to(torch_device) pipeline.set_progress_bar_config(disable=None) From 552ce7baeb7d8f5d64426980194d789b452fdea1 Mon Sep 17 00:00:00 2001 From: yiyixuxu Date: Thu, 6 Jul 2023 02:52:23 +0000 Subject: [PATCH 32/51] fix the #copied from --- .../kandinsky2_2/pipeline_kandinsky2_2_decoder_controlnet.py | 2 +- .../pipeline_kandinsky2_2_decoder_controlnet_img2img.py | 2 +- .../kandinsky2_2/pipeline_kandinsky2_2_decoder_img2img.py | 2 +- .../kandinsky2_2/pipeline_kandinsky2_2_decoder_inpainting.py | 2 +- .../pipelines/versatile_diffusion/modeling_text_unet.py | 2 +- 5 files changed, 5 insertions(+), 5 deletions(-) diff --git a/src/diffusers/pipelines/kandinsky2_2/pipeline_kandinsky2_2_decoder_controlnet.py b/src/diffusers/pipelines/kandinsky2_2/pipeline_kandinsky2_2_decoder_controlnet.py index 17395a875ac8..3e2d1f751e60 100644 --- a/src/diffusers/pipelines/kandinsky2_2/pipeline_kandinsky2_2_decoder_controlnet.py +++ b/src/diffusers/pipelines/kandinsky2_2/pipeline_kandinsky2_2_decoder_controlnet.py @@ -96,7 +96,7 @@ """ -# Copied from diffusers.pipelines.kandinsky2_2.pipelines_kandinsky_2_2_decoder.KandinskyV22Pipeline.downscale_height_and_width +# Copied from diffusers.pipelines.kandinsky2_2.pipeline_kandinsky2_2_decoder.downscale_height_and_width def downscale_height_and_width(height, width, scale_factor=8): new_height = height // scale_factor**2 if height % scale_factor**2 != 0: diff --git a/src/diffusers/pipelines/kandinsky2_2/pipeline_kandinsky2_2_decoder_controlnet_img2img.py b/src/diffusers/pipelines/kandinsky2_2/pipeline_kandinsky2_2_decoder_controlnet_img2img.py index 6ee48226bfea..4d1acd562665 100644 --- a/src/diffusers/pipelines/kandinsky2_2/pipeline_kandinsky2_2_decoder_controlnet_img2img.py +++ b/src/diffusers/pipelines/kandinsky2_2/pipeline_kandinsky2_2_decoder_controlnet_img2img.py @@ -100,7 +100,7 @@ """ -# Copied from diffusers.pipelines.kandinsky2_2.pipelines_kandinsky_2_2_decoder.KandinskyV22Pipeline.downscale_height_and_width +# Copied from diffusers.pipelines.kandinsky2_2.pipeline_kandinsky2_2_decoder.downscale_height_and_width def downscale_height_and_width(height, width, scale_factor=8): new_height = height // scale_factor**2 if height % scale_factor**2 != 0: diff --git a/src/diffusers/pipelines/kandinsky2_2/pipeline_kandinsky2_2_decoder_img2img.py 
b/src/diffusers/pipelines/kandinsky2_2/pipeline_kandinsky2_2_decoder_img2img.py index 3ff1de1b9807..cf05ef00cfbd 100644 --- a/src/diffusers/pipelines/kandinsky2_2/pipeline_kandinsky2_2_decoder_img2img.py +++ b/src/diffusers/pipelines/kandinsky2_2/pipeline_kandinsky2_2_decoder_img2img.py @@ -74,7 +74,7 @@ """ -# Copied from diffusers.pipelines.kandinsky2_2.pipelines_kandinsky_2_2_decoder.KandinskyV22Pipeline.downscale_height_and_width +# Copied from diffusers.pipelines.kandinsky2_2.pipeline_kandinsky2_2_decoder.downscale_height_and_width def downscale_height_and_width(height, width, scale_factor=8): new_height = height // scale_factor**2 if height % scale_factor**2 != 0: diff --git a/src/diffusers/pipelines/kandinsky2_2/pipeline_kandinsky2_2_decoder_inpainting.py b/src/diffusers/pipelines/kandinsky2_2/pipeline_kandinsky2_2_decoder_inpainting.py index 1a45b806459a..2f9f5d5c746e 100644 --- a/src/diffusers/pipelines/kandinsky2_2/pipeline_kandinsky2_2_decoder_inpainting.py +++ b/src/diffusers/pipelines/kandinsky2_2/pipeline_kandinsky2_2_decoder_inpainting.py @@ -81,7 +81,7 @@ """ -# Copied from diffusers.pipelines.kandinsky2_2.pipelines_kandinsky_2_2_decoder.KandinskyV22Pipeline.downscale_height_and_width +# Copied from diffusers.pipelines.kandinsky2_2.pipeline_kandinsky2_2_decoder.downscale_height_and_width def downscale_height_and_width(height, width, scale_factor=8): new_height = height // scale_factor**2 if height % scale_factor**2 != 0: diff --git a/src/diffusers/pipelines/versatile_diffusion/modeling_text_unet.py b/src/diffusers/pipelines/versatile_diffusion/modeling_text_unet.py index 87d803e739b0..8dc7d5556b3a 100644 --- a/src/diffusers/pipelines/versatile_diffusion/modeling_text_unet.py +++ b/src/diffusers/pipelines/versatile_diffusion/modeling_text_unet.py @@ -466,7 +466,7 @@ def __init__( # Kadinsky 2.2 self.add_embedding = ImageTimeEmbedding(image_embed_dim=encoder_hid_dim, time_embed_dim=time_embed_dim) elif addition_embed_type == "image_hint": - # Kadinsky 2.2 + # Kadinsky 2.2 ControlNet self.add_embedding = ImageHintTimeEmbedding(image_embed_dim=encoder_hid_dim, time_embed_dim=time_embed_dim) elif addition_embed_type is not None: raise ValueError(f"addition_embed_type: {addition_embed_type} must be None, 'text' or 'text_image'.") From 69df159e79cdd4586e037190c3bd47e2965ac6b5 Mon Sep 17 00:00:00 2001 From: yiyixuxu Date: Thu, 6 Jul 2023 03:02:54 +0000 Subject: [PATCH 33/51] remove decoder from file name --- docs/source/en/api/pipelines/kandinsky.mdx | 10 +++++----- src/diffusers/pipelines/kandinsky2_2/__init__.py | 10 +++++----- ...andinsky2_2_decoder.py => pipeline_kandinsky2_2.py} | 0 ...ntrolnet.py => pipeline_kandinsky2_2_controlnet.py} | 2 +- ....py => pipeline_kandinsky2_2_controlnet_img2img.py} | 2 +- ...der_img2img.py => pipeline_kandinsky2_2_img2img.py} | 2 +- ...painting.py => pipeline_kandinsky2_2_inpainting.py} | 2 +- 7 files changed, 14 insertions(+), 14 deletions(-) rename src/diffusers/pipelines/kandinsky2_2/{pipeline_kandinsky2_2_decoder.py => pipeline_kandinsky2_2.py} (100%) rename src/diffusers/pipelines/kandinsky2_2/{pipeline_kandinsky2_2_decoder_controlnet.py => pipeline_kandinsky2_2_controlnet.py} (99%) rename src/diffusers/pipelines/kandinsky2_2/{pipeline_kandinsky2_2_decoder_controlnet_img2img.py => pipeline_kandinsky2_2_controlnet_img2img.py} (99%) rename src/diffusers/pipelines/kandinsky2_2/{pipeline_kandinsky2_2_decoder_img2img.py => pipeline_kandinsky2_2_img2img.py} (99%) rename 
src/diffusers/pipelines/kandinsky2_2/{pipeline_kandinsky2_2_decoder_inpainting.py => pipeline_kandinsky2_2_inpainting.py} (99%) diff --git a/docs/source/en/api/pipelines/kandinsky.mdx b/docs/source/en/api/pipelines/kandinsky.mdx index c85e7c2fa8a5..e05dfca521b5 100644 --- a/docs/source/en/api/pipelines/kandinsky.mdx +++ b/docs/source/en/api/pipelines/kandinsky.mdx @@ -22,13 +22,13 @@ The Kandinsky model is created by [Arseniy Shakhmatov](https://github.com/cene55 | Pipeline | Tasks | |---|---| | [pipeline_kandinsky.py](https://github.com/huggingface/diffusers/blob/main/src/diffusers/pipelines/kandinsky/pipeline_kandinsky.py) | *Text-to-Image Generation* | -| [pipeline_kandinsky2_2_decoder.py](https://github.com/huggingface/diffusers/blob/main/src/diffusers/pipelines/kandinsky2_2/pipeline_kandinsky2_2_decoder.py) | *Text-to-Image Generation* | +| [pipeline_kandinsky2_2.py](https://github.com/huggingface/diffusers/blob/main/src/diffusers/pipelines/kandinsky2_2/pipeline_kandinsky2_2.py) | *Text-to-Image Generation* | | [pipeline_kandinsky_inpaint.py](https://github.com/huggingface/diffusers/blob/main/src/diffusers/pipelines/kandinsky/pipeline_kandinsky_inpaint.py) | *Image-Guided Image Generation* | -| [pipeline_kandinsky2_2_decoder_inpaint.py](https://github.com/huggingface/diffusers/blob/main/src/diffusers/pipelines/kandinsky2_2/pipeline_kandinsky2_2_decoder_inpaint.py) | *Image-Guided Image Generation* | +| [pipeline_kandinsky2_2_inpaint.py](https://github.com/huggingface/diffusers/blob/main/src/diffusers/pipelines/kandinsky2_2/pipeline_kandinsky2_2_inpaint.py) | *Image-Guided Image Generation* | | [pipeline_kandinsky_img2img.py](https://github.com/huggingface/diffusers/blob/main/src/diffusers/pipelines/kandinsky/pipeline_kandinsky_img2img.py) | *Image-Guided Image Generation* | -| [pipeline_kandinsky2_2_decoder_img2img.py](https://github.com/huggingface/diffusers/blob/main/src/diffusers/pipelines/kandinsky2_2/pipeline_kandinsky2_2_decoder_img2img.py) | *Image-Guided Image Generation* | -| [pipeline_kandinsky2_2_decoder_controlnet.py](https://github.com/huggingface/diffusers/blob/main/src/diffusers/pipelines/kandinsky2_2/pipeline_kandinsky2_2_decoder_controlnet.py) | *Image-Guided Image Generation* | -| [pipeline_kandinsky2_2_decoder_controlnet_img2img.py](https://github.com/huggingface/diffusers/blob/main/src/diffusers/pipelines/kandinsky2_2/pipeline_kandinsky2_2_decoder_controlnet_img2img.py) | *Image-Guided Image Generation* | +| [pipeline_kandinsky2_2_img2img.py](https://github.com/huggingface/diffusers/blob/main/src/diffusers/pipelines/kandinsky2_2/pipeline_kandinsky2_2_img2img.py) | *Image-Guided Image Generation* | +| [pipeline_kandinsky2_2_controlnet.py](https://github.com/huggingface/diffusers/blob/main/src/diffusers/pipelines/kandinsky2_2/pipeline_kandinsky2_2_controlnet.py) | *Image-Guided Image Generation* | +| [pipeline_kandinsky2_2_controlnet_img2img.py](https://github.com/huggingface/diffusers/blob/main/src/diffusers/pipelines/kandinsky2_2/pipeline_kandinsky2_2_controlnet_img2img.py) | *Image-Guided Image Generation* | ## Usage example diff --git a/src/diffusers/pipelines/kandinsky2_2/__init__.py b/src/diffusers/pipelines/kandinsky2_2/__init__.py index c94725d4ae62..648164b9f1ba 100644 --- a/src/diffusers/pipelines/kandinsky2_2/__init__.py +++ b/src/diffusers/pipelines/kandinsky2_2/__init__.py @@ -1,7 +1,7 @@ -from .pipeline_kandinsky2_2_decoder import KandinskyV22Pipeline -from .pipeline_kandinsky2_2_decoder_controlnet import KandinskyV22ControlnetPipeline -from 
.pipeline_kandinsky2_2_decoder_controlnet_img2img import KandinskyV22ControlnetImg2ImgPipeline -from .pipeline_kandinsky2_2_decoder_img2img import KandinskyV22Img2ImgPipeline -from .pipeline_kandinsky2_2_decoder_inpainting import KandinskyV22InpaintPipeline +from .pipeline_kandinsky2_2 import KandinskyV22Pipeline +from .pipeline_kandinsky2_2_controlnet import KandinskyV22ControlnetPipeline +from .pipeline_kandinsky2_2_controlnet_img2img import KandinskyV22ControlnetImg2ImgPipeline +from .pipeline_kandinsky2_2_img2img import KandinskyV22Img2ImgPipeline +from .pipeline_kandinsky2_2_inpainting import KandinskyV22InpaintPipeline from .pipeline_kandinsky2_2_prior import KandinskyV22PriorPipeline from .pipeline_kandinsky2_2_prior_emb2emb import KandinskyV22PriorEmb2EmbPipeline diff --git a/src/diffusers/pipelines/kandinsky2_2/pipeline_kandinsky2_2_decoder.py b/src/diffusers/pipelines/kandinsky2_2/pipeline_kandinsky2_2.py similarity index 100% rename from src/diffusers/pipelines/kandinsky2_2/pipeline_kandinsky2_2_decoder.py rename to src/diffusers/pipelines/kandinsky2_2/pipeline_kandinsky2_2.py diff --git a/src/diffusers/pipelines/kandinsky2_2/pipeline_kandinsky2_2_decoder_controlnet.py b/src/diffusers/pipelines/kandinsky2_2/pipeline_kandinsky2_2_controlnet.py similarity index 99% rename from src/diffusers/pipelines/kandinsky2_2/pipeline_kandinsky2_2_decoder_controlnet.py rename to src/diffusers/pipelines/kandinsky2_2/pipeline_kandinsky2_2_controlnet.py index 3e2d1f751e60..aeef2a7a93fc 100644 --- a/src/diffusers/pipelines/kandinsky2_2/pipeline_kandinsky2_2_decoder_controlnet.py +++ b/src/diffusers/pipelines/kandinsky2_2/pipeline_kandinsky2_2_controlnet.py @@ -96,7 +96,7 @@ """ -# Copied from diffusers.pipelines.kandinsky2_2.pipeline_kandinsky2_2_decoder.downscale_height_and_width +# Copied from diffusers.pipelines.kandinsky2_2.pipeline_kandinsky2_2.downscale_height_and_width def downscale_height_and_width(height, width, scale_factor=8): new_height = height // scale_factor**2 if height % scale_factor**2 != 0: diff --git a/src/diffusers/pipelines/kandinsky2_2/pipeline_kandinsky2_2_decoder_controlnet_img2img.py b/src/diffusers/pipelines/kandinsky2_2/pipeline_kandinsky2_2_controlnet_img2img.py similarity index 99% rename from src/diffusers/pipelines/kandinsky2_2/pipeline_kandinsky2_2_decoder_controlnet_img2img.py rename to src/diffusers/pipelines/kandinsky2_2/pipeline_kandinsky2_2_controlnet_img2img.py index 4d1acd562665..da412a414e6f 100644 --- a/src/diffusers/pipelines/kandinsky2_2/pipeline_kandinsky2_2_decoder_controlnet_img2img.py +++ b/src/diffusers/pipelines/kandinsky2_2/pipeline_kandinsky2_2_controlnet_img2img.py @@ -100,7 +100,7 @@ """ -# Copied from diffusers.pipelines.kandinsky2_2.pipeline_kandinsky2_2_decoder.downscale_height_and_width +# Copied from diffusers.pipelines.kandinsky2_2.pipeline_kandinsky2_2.downscale_height_and_width def downscale_height_and_width(height, width, scale_factor=8): new_height = height // scale_factor**2 if height % scale_factor**2 != 0: diff --git a/src/diffusers/pipelines/kandinsky2_2/pipeline_kandinsky2_2_decoder_img2img.py b/src/diffusers/pipelines/kandinsky2_2/pipeline_kandinsky2_2_img2img.py similarity index 99% rename from src/diffusers/pipelines/kandinsky2_2/pipeline_kandinsky2_2_decoder_img2img.py rename to src/diffusers/pipelines/kandinsky2_2/pipeline_kandinsky2_2_img2img.py index cf05ef00cfbd..f52ec2c66ef1 100644 --- a/src/diffusers/pipelines/kandinsky2_2/pipeline_kandinsky2_2_decoder_img2img.py +++ 
b/src/diffusers/pipelines/kandinsky2_2/pipeline_kandinsky2_2_img2img.py @@ -74,7 +74,7 @@ """ -# Copied from diffusers.pipelines.kandinsky2_2.pipeline_kandinsky2_2_decoder.downscale_height_and_width +# Copied from diffusers.pipelines.kandinsky2_2.pipeline_kandinsky2_2.downscale_height_and_width def downscale_height_and_width(height, width, scale_factor=8): new_height = height // scale_factor**2 if height % scale_factor**2 != 0: diff --git a/src/diffusers/pipelines/kandinsky2_2/pipeline_kandinsky2_2_decoder_inpainting.py b/src/diffusers/pipelines/kandinsky2_2/pipeline_kandinsky2_2_inpainting.py similarity index 99% rename from src/diffusers/pipelines/kandinsky2_2/pipeline_kandinsky2_2_decoder_inpainting.py rename to src/diffusers/pipelines/kandinsky2_2/pipeline_kandinsky2_2_inpainting.py index 2f9f5d5c746e..9a7e00c380d6 100644 --- a/src/diffusers/pipelines/kandinsky2_2/pipeline_kandinsky2_2_decoder_inpainting.py +++ b/src/diffusers/pipelines/kandinsky2_2/pipeline_kandinsky2_2_inpainting.py @@ -81,7 +81,7 @@ """ -# Copied from diffusers.pipelines.kandinsky2_2.pipeline_kandinsky2_2_decoder.downscale_height_and_width +# Copied from diffusers.pipelines.kandinsky2_2.pipeline_kandinsky2_2.downscale_height_and_width def downscale_height_and_width(height, width, scale_factor=8): new_height = height // scale_factor**2 if height % scale_factor**2 != 0: From 30c0c9f848ad36fa4fccede93454ca94b71a91ed Mon Sep 17 00:00:00 2001 From: yiyixuxu Date: Thu, 6 Jul 2023 05:01:20 +0000 Subject: [PATCH 34/51] update doc: add a section for kandinsky 2.2 --- docs/source/en/api/pipelines/kandinsky.mdx | 88 +++++++++++++--------- 1 file changed, 51 insertions(+), 37 deletions(-) diff --git a/docs/source/en/api/pipelines/kandinsky.mdx b/docs/source/en/api/pipelines/kandinsky.mdx index e05dfca521b5..55d25c69998f 100644 --- a/docs/source/en/api/pipelines/kandinsky.mdx +++ b/docs/source/en/api/pipelines/kandinsky.mdx @@ -17,18 +17,6 @@ It uses [CLIP](https://huggingface.co/docs/transformers/model_doc/clip) for enco The Kandinsky model is created by [Arseniy Shakhmatov](https://github.com/cene555), [Anton Razzhigaev](https://github.com/razzant), [Aleksandr Nikolich](https://github.com/AlexWortega), [Igor Pavlov](https://github.com/boomb0om), [Andrey Kuznetsov](https://github.com/kuznetsoffandrey) and [Denis Dimitrov](https://github.com/denndimitrov) and the original codebase can be found [here](https://github.com/ai-forever/Kandinsky-2) -## Available Pipelines: - -| Pipeline | Tasks | -|---|---| -| [pipeline_kandinsky.py](https://github.com/huggingface/diffusers/blob/main/src/diffusers/pipelines/kandinsky/pipeline_kandinsky.py) | *Text-to-Image Generation* | -| [pipeline_kandinsky2_2.py](https://github.com/huggingface/diffusers/blob/main/src/diffusers/pipelines/kandinsky2_2/pipeline_kandinsky2_2.py) | *Text-to-Image Generation* | -| [pipeline_kandinsky_inpaint.py](https://github.com/huggingface/diffusers/blob/main/src/diffusers/pipelines/kandinsky/pipeline_kandinsky_inpaint.py) | *Image-Guided Image Generation* | -| [pipeline_kandinsky2_2_inpaint.py](https://github.com/huggingface/diffusers/blob/main/src/diffusers/pipelines/kandinsky2_2/pipeline_kandinsky2_2_inpaint.py) | *Image-Guided Image Generation* | -| [pipeline_kandinsky_img2img.py](https://github.com/huggingface/diffusers/blob/main/src/diffusers/pipelines/kandinsky/pipeline_kandinsky_img2img.py) | *Image-Guided Image Generation* | -| 
[pipeline_kandinsky2_2_img2img.py](https://github.com/huggingface/diffusers/blob/main/src/diffusers/pipelines/kandinsky2_2/pipeline_kandinsky2_2_img2img.py) | *Image-Guided Image Generation* | -| [pipeline_kandinsky2_2_controlnet.py](https://github.com/huggingface/diffusers/blob/main/src/diffusers/pipelines/kandinsky2_2/pipeline_kandinsky2_2_controlnet.py) | *Image-Guided Image Generation* | -| [pipeline_kandinsky2_2_controlnet_img2img.py](https://github.com/huggingface/diffusers/blob/main/src/diffusers/pipelines/kandinsky2_2/pipeline_kandinsky2_2_controlnet_img2img.py) | *Image-Guided Image Generation* | ## Usage example @@ -117,29 +105,6 @@ One cheeseburger monster coming up! Enjoy! ![img](https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/kandinsky-docs/cheeseburger.png) - - -The Kandinsky 2.2 text-to-image pipeline no longer accept the `prompt` input and will condition the image generation process with only `image_embeds` and `negative_image_embeds`. To use Kandinsky2.2 text-to-image pipeline, you need to create your pipelines with Kandinsky 2.2 checkpoints - -```python -pipe_prior = DiffusionPipeline.from_pretrained("kandinsky-community/kandinsky-2-2-prior", torch_dtype=torch.float16) -pipe_prior.to("cuda") - -t2i_pipe = DiffusionPipeline.from_pretrained("kandinsky-community/kandinsky-2-2-decoder", torch_dtype=torch.float16) -t2i_pipe.to("cuda") -``` - -You can then use `pipe_prior` to generate image embeddings with exact same workflow as in 2.1, and pass these embeddings to the text-to-image pipeline. Remember that in 2.2 you don't need to pass the `prompt`. - -```python -image = t2i_pipe(image_embeds=image_embeds, negative_image_embeds=negative_image_embeds, height=768, width=768).images[ - 0 -] -image.save("cheeseburger_monster.png") -``` - - - The Kandinsky model works extremely well with creative prompts. Here is some of the amazing art that can be created using the exact same process but with different prompts. ```python @@ -453,6 +418,46 @@ Here is the output. Compared with the output from our text-to-image controlnet e ![img](https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main/kandinskyv22/robot_cat.png) +## Kandinsky 2.2 + +The Kandinsky 2.2 release includes robust new text-to-image models that support text-to-image generation, image-to-image generation, image interpolation, and text-guided image inpainting. The general workflow to perform these tasks using Kandinsky 2.2 is the same as Kandinsky 2.1. First, you will need to use a prior pipeline to generate image embedding based on your text prompt and then use one of the image decoding pipelines to generate the image. The only difference is that in Kandinsky 2.2, all of the decoding pipelines no longer accept the `prompt` input, and the image generation process is conditioned with only `image_embeds` and `negative_image_embeds`. + +Let's look at an example of how to perform text-to-image generation using Kandinsky 2.2. + +First, let's create the prior pipeline and text-to-image pipeline with Kandinsky 2.2 checkpoints. 
+ +```python +from diffusers import DiffusionPipeline +import torch + +pipe_prior = DiffusionPipeline.from_pretrained("kandinsky-community/kandinsky-2-2-prior", torch_dtype=torch.float16) +pipe_prior.to("cuda") + +t2i_pipe = DiffusionPipeline.from_pretrained("kandinsky-community/kandinsky-2-2-decoder", torch_dtype=torch.float16) +t2i_pipe.to("cuda") +``` + +You can then use `pipe_prior` to generate image embeddings the same way as you would with kandinsky 2.1 + +```python +prompt = "portrait of a women, blue eyes, cinematic" +negative_prompt = "low quality, bad quality" + +image_embeds, negative_image_embeds = pipe_prior(prompt, guidance_scale=1.0).to_tuple() +``` + +Now you can pass these embeddings to the text-to-image pipeline. Remember that in 2.2 you don't need to pass the `prompt`. + +``` +image = t2i_pipe(image_embeds=image_embeds, negative_image_embeds=negative_image_embeds, height=768, width=768).images[ + 0 +] +image.save("portrait.png") +``` +![img](https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main/kandinskyv22/%20blue%20eyes.png) + +We used the text-to-image pipeline as an example, but the same process applies to all decoding pipelines in Kandinsky 2.2. For more information, please refer to our API section for each pipeline. + ## Optimization @@ -506,9 +511,19 @@ t2i_pipe.unet = torch.compile(t2i_pipe.unet, mode="reduce-overhead", fullgraph=T After compilation you should see a very fast inference time. For more information, feel free to have a look at [Our PyTorch 2.0 benchmark](https://huggingface.co/docs/diffusers/main/en/optimization/torch2.0). +## Available Pipelines: +| Pipeline | Tasks | +|---|---| +| [pipeline_kandinsky.py](https://github.com/huggingface/diffusers/blob/main/src/diffusers/pipelines/kandinsky/pipeline_kandinsky.py) | *Text-to-Image Generation* | +| [pipeline_kandinsky2_2.py](https://github.com/huggingface/diffusers/blob/main/src/diffusers/pipelines/kandinsky2_2/pipeline_kandinsky2_2.py) | *Text-to-Image Generation* | +| [pipeline_kandinsky_inpaint.py](https://github.com/huggingface/diffusers/blob/main/src/diffusers/pipelines/kandinsky/pipeline_kandinsky_inpaint.py) | *Image-Guided Image Generation* | +| [pipeline_kandinsky2_2_inpaint.py](https://github.com/huggingface/diffusers/blob/main/src/diffusers/pipelines/kandinsky2_2/pipeline_kandinsky2_2_inpaint.py) | *Image-Guided Image Generation* | +| [pipeline_kandinsky_img2img.py](https://github.com/huggingface/diffusers/blob/main/src/diffusers/pipelines/kandinsky/pipeline_kandinsky_img2img.py) | *Image-Guided Image Generation* | +| [pipeline_kandinsky2_2_img2img.py](https://github.com/huggingface/diffusers/blob/main/src/diffusers/pipelines/kandinsky2_2/pipeline_kandinsky2_2_img2img.py) | *Image-Guided Image Generation* | +| [pipeline_kandinsky2_2_controlnet.py](https://github.com/huggingface/diffusers/blob/main/src/diffusers/pipelines/kandinsky2_2/pipeline_kandinsky2_2_controlnet.py) | *Image-Guided Image Generation* | +| [pipeline_kandinsky2_2_controlnet_img2img.py](https://github.com/huggingface/diffusers/blob/main/src/diffusers/pipelines/kandinsky2_2/pipeline_kandinsky2_2_controlnet_img2img.py) | *Image-Guided Image Generation* | -## Kandinsky 2.1 ### KandinskyPriorPipeline @@ -535,7 +550,6 @@ feel free to have a look at [Our PyTorch 2.0 benchmark](https://huggingface.co/d - all - __call__ -## Kandinsky 2.2 ### KandinskyV22Pipeline From 307de02492dc6d892288c30abcf3eb7af660c3d1 Mon Sep 17 00:00:00 2001 From: yiyixuxu Date: Thu, 6 Jul 2023 05:30:06 +0000 Subject: [PATCH 35/51] 
fix --- docs/source/en/api/pipelines/kandinsky.mdx | 2 +- src/diffusers/models/unet_2d_condition.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/docs/source/en/api/pipelines/kandinsky.mdx b/docs/source/en/api/pipelines/kandinsky.mdx index 55d25c69998f..91210fd78b59 100644 --- a/docs/source/en/api/pipelines/kandinsky.mdx +++ b/docs/source/en/api/pipelines/kandinsky.mdx @@ -360,7 +360,7 @@ images = pipe( images[0].save("robot_cat.png") ``` -the output image looks as follow: +The output image looks as follow: ![img](https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main/kandinskyv22/robot_cat_text2img.png) ### Image-to-Image Generation with ControlNet Conditioning diff --git a/src/diffusers/models/unet_2d_condition.py b/src/diffusers/models/unet_2d_condition.py index 85935c08dfeb..8c761cec3d36 100644 --- a/src/diffusers/models/unet_2d_condition.py +++ b/src/diffusers/models/unet_2d_condition.py @@ -819,7 +819,7 @@ def forward( # Kadinsky 2.2 - style if "image_embeds" not in added_cond_kwargs or "hint" not in added_cond_kwargs: raise ValueError( - f"{self.__class__} has the config param `addition_embed_type` set to 'image' which requires the keyword argument `image_embeds` to be passed in `added_cond_kwargs`" + f"{self.__class__} has the config param `addition_embed_type` set to 'image_hint' which requires the keyword argument `image_embeds` to be passed in `added_cond_kwargs`" ) image_embs = added_cond_kwargs.get("image_embeds") hint = added_cond_kwargs.get("hint") From 80d85d59457ab9e88a63780832c013a7734316d1 Mon Sep 17 00:00:00 2001 From: yiyixuxu Date: Thu, 6 Jul 2023 05:33:41 +0000 Subject: [PATCH 36/51] fix-copies --- .../pipelines/versatile_diffusion/modeling_text_unet.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/diffusers/pipelines/versatile_diffusion/modeling_text_unet.py b/src/diffusers/pipelines/versatile_diffusion/modeling_text_unet.py index 8dc7d5556b3a..660fa7855fb5 100644 --- a/src/diffusers/pipelines/versatile_diffusion/modeling_text_unet.py +++ b/src/diffusers/pipelines/versatile_diffusion/modeling_text_unet.py @@ -926,8 +926,8 @@ def forward( # Kadinsky 2.2 - style if "image_embeds" not in added_cond_kwargs or "hint" not in added_cond_kwargs: raise ValueError( - f"{self.__class__} has the config param `addition_embed_type` set to 'image' which requires the" - " keyword argument `image_embeds` to be passed in `added_cond_kwargs`" + f"{self.__class__} has the config param `addition_embed_type` set to 'image_hint' which requires" + " the keyword argument `image_embeds` to be passed in `added_cond_kwargs`" ) image_embs = added_cond_kwargs.get("image_embeds") hint = added_cond_kwargs.get("hint") From 453fed28d57d08c3f0270f9759979ff615f2f771 Mon Sep 17 00:00:00 2001 From: yiyixuxu Date: Thu, 6 Jul 2023 05:47:04 +0000 Subject: [PATCH 37/51] add coped from --- src/diffusers/pipelines/kandinsky/pipeline_kandinsky.py | 1 + src/diffusers/pipelines/kandinsky2_2/pipeline_kandinsky2_2.py | 1 + src/diffusers/schedulers/scheduling_unclip.py | 1 + 3 files changed, 3 insertions(+) diff --git a/src/diffusers/pipelines/kandinsky/pipeline_kandinsky.py b/src/diffusers/pipelines/kandinsky/pipeline_kandinsky.py index 7b3537ea6895..489f59b3ad9a 100644 --- a/src/diffusers/pipelines/kandinsky/pipeline_kandinsky.py +++ b/src/diffusers/pipelines/kandinsky/pipeline_kandinsky.py @@ -115,6 +115,7 @@ def __init__( ) self.movq_scale_factor = 2 ** (len(self.movq.config.block_out_channels) - 1) + # Copied from 
diffusers.pipelines.unclip.pipeline_unclip.UnCLIPPipeline.prepare_latents def prepare_latents(self, shape, dtype, device, generator, latents, scheduler): if latents is None: latents = randn_tensor(shape, generator=generator, device=device, dtype=dtype) diff --git a/src/diffusers/pipelines/kandinsky2_2/pipeline_kandinsky2_2.py b/src/diffusers/pipelines/kandinsky2_2/pipeline_kandinsky2_2.py index 663216e4607f..59a2e7ccc837 100644 --- a/src/diffusers/pipelines/kandinsky2_2/pipeline_kandinsky2_2.py +++ b/src/diffusers/pipelines/kandinsky2_2/pipeline_kandinsky2_2.py @@ -98,6 +98,7 @@ def __init__( ) self.movq_scale_factor = 2 ** (len(self.movq.config.block_out_channels) - 1) + # Copied from diffusers.pipelines.unclip.pipeline_unclip.UnCLIPPipeline.prepare_latents def prepare_latents(self, shape, dtype, device, generator, latents, scheduler): if latents is None: latents = randn_tensor(shape, generator=generator, device=device, dtype=dtype) diff --git a/src/diffusers/schedulers/scheduling_unclip.py b/src/diffusers/schedulers/scheduling_unclip.py index 726c05a872e4..d7f927658c8a 100644 --- a/src/diffusers/schedulers/scheduling_unclip.py +++ b/src/diffusers/schedulers/scheduling_unclip.py @@ -308,6 +308,7 @@ def step( return UnCLIPSchedulerOutput(prev_sample=pred_prev_sample, pred_original_sample=pred_original_sample) + # Copied from diffusers.schedulers.scheduling_ddpm.DDPMScheduler.add_noise def add_noise( self, original_samples: torch.FloatTensor, From 6959e60f5d37c55dce39d0211e060da794fbd237 Mon Sep 17 00:00:00 2001 From: yiyixuxu Date: Thu, 6 Jul 2023 05:55:11 +0000 Subject: [PATCH 38/51] add copies from for prior --- .../pipelines/kandinsky/pipeline_kandinsky_prior.py | 1 + .../pipelines/kandinsky2_2/pipeline_kandinsky2_2_prior.py | 5 +++++ 2 files changed, 6 insertions(+) diff --git a/src/diffusers/pipelines/kandinsky/pipeline_kandinsky_prior.py b/src/diffusers/pipelines/kandinsky/pipeline_kandinsky_prior.py index a0208d5858b1..a35f406e0a05 100644 --- a/src/diffusers/pipelines/kandinsky/pipeline_kandinsky_prior.py +++ b/src/diffusers/pipelines/kandinsky/pipeline_kandinsky_prior.py @@ -274,6 +274,7 @@ def interpolate( return KandinskyPriorPipelineOutput(image_embeds=image_emb, negative_image_embeds=zero_image_emb) + # Copied from diffusers.pipelines.unclip.pipeline_unclip.UnCLIPPipeline.prepare_latents def prepare_latents(self, shape, dtype, device, generator, latents, scheduler): if latents is None: latents = randn_tensor(shape, generator=generator, device=device, dtype=dtype) diff --git a/src/diffusers/pipelines/kandinsky2_2/pipeline_kandinsky2_2_prior.py b/src/diffusers/pipelines/kandinsky2_2/pipeline_kandinsky2_2_prior.py index 8c92d6ae068a..b2e8758bc170 100644 --- a/src/diffusers/pipelines/kandinsky2_2/pipeline_kandinsky2_2_prior.py +++ b/src/diffusers/pipelines/kandinsky2_2/pipeline_kandinsky2_2_prior.py @@ -249,6 +249,7 @@ def interpolate( return KandinskyPriorPipelineOutput(image_embeds=image_emb, negative_image_embeds=zero_image_emb) + # Copied from diffusers.pipelines.unclip.pipeline_unclip.UnCLIPPipeline.prepare_latents def prepare_latents(self, shape, dtype, device, generator, latents, scheduler): if latents is None: latents = randn_tensor(shape, generator=generator, device=device, dtype=dtype) @@ -260,6 +261,7 @@ def prepare_latents(self, shape, dtype, device, generator, latents, scheduler): latents = latents * scheduler.init_noise_sigma return latents + # Copied from diffusers.pipelines.kandinsky.pipeline_kandinsky_prior.KandinskyPriorPipeline.get_zero_embed def 
get_zero_embed(self, batch_size=1, device=None): device = device or self.device zero_img = torch.zeros(1, 3, self.image_encoder.config.image_size, self.image_encoder.config.image_size).to( @@ -269,6 +271,7 @@ def get_zero_embed(self, batch_size=1, device=None): zero_image_emb = zero_image_emb.repeat(batch_size, 1) return zero_image_emb + # Copied from diffusers.pipelines.kandinsky.pipeline_kandinsky_prior.KandinskyPriorPipeline.enable_sequential_cpu_offload def enable_sequential_cpu_offload(self, gpu_id=0): r""" Offloads all models to CPU using accelerate, significantly reducing memory usage. When called, the pipeline's @@ -291,6 +294,7 @@ def enable_sequential_cpu_offload(self, gpu_id=0): cpu_offload(cpu_offloaded_model, device) @property + # Copied from diffusers.pipelines.kandinsky.pipeline_kandinsky_prior.KandinskyPriorPipeline._execution_device def _execution_device(self): r""" Returns the device on which the pipeline's models will be executed. After calling @@ -308,6 +312,7 @@ def _execution_device(self): return torch.device(module._hf_hook.execution_device) return self.device + # Copied from diffusers.pipelines.kandinsky.pipeline_kandinsky_prior.KandinskyPriorPipeline._encode_prompt def _encode_prompt( self, prompt, From 7bfe3e783de9781de0dacded466689902cfc31fc Mon Sep 17 00:00:00 2001 From: yiyixuxu Date: Thu, 6 Jul 2023 05:59:17 +0000 Subject: [PATCH 39/51] add copies from for prior emb2emb --- .../kandinsky2_2/pipeline_kandinsky2_2_prior_emb2emb.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/src/diffusers/pipelines/kandinsky2_2/pipeline_kandinsky2_2_prior_emb2emb.py b/src/diffusers/pipelines/kandinsky2_2/pipeline_kandinsky2_2_prior_emb2emb.py index aa930482d352..e442289b6fc5 100644 --- a/src/diffusers/pipelines/kandinsky2_2/pipeline_kandinsky2_2_prior_emb2emb.py +++ b/src/diffusers/pipelines/kandinsky2_2/pipeline_kandinsky2_2_prior_emb2emb.py @@ -301,6 +301,7 @@ def prepare_latents(self, emb, timestep, batch_size, num_images_per_prompt, dtyp return latents + # Copied from diffusers.pipelines.kandinsky.pipeline_kandinsky_prior.KandinskyPriorPipeline.get_zero_embed def get_zero_embed(self, batch_size=1, device=None): device = device or self.device zero_img = torch.zeros(1, 3, self.image_encoder.config.image_size, self.image_encoder.config.image_size).to( @@ -310,6 +311,7 @@ def get_zero_embed(self, batch_size=1, device=None): zero_image_emb = zero_image_emb.repeat(batch_size, 1) return zero_image_emb + # Copied from diffusers.pipelines.kandinsky.pipeline_kandinsky_prior.KandinskyPriorPipeline.enable_sequential_cpu_offload def enable_sequential_cpu_offload(self, gpu_id=0): r""" Offloads all models to CPU using accelerate, significantly reducing memory usage. When called, the pipeline's @@ -332,6 +334,7 @@ def enable_sequential_cpu_offload(self, gpu_id=0): cpu_offload(cpu_offloaded_model, device) @property + # Copied from diffusers.pipelines.kandinsky.pipeline_kandinsky_prior.KandinskyPriorPipeline._execution_device def _execution_device(self): r""" Returns the device on which the pipeline's models will be executed. 
After calling @@ -349,6 +352,7 @@ def _execution_device(self): return torch.device(module._hf_hook.execution_device) return self.device + # Copied from diffusers.pipelines.kandinsky.pipeline_kandinsky_prior.KandinskyPriorPipeline._encode_prompt def _encode_prompt( self, prompt, From 81c5c7751fb967ad7ef9be56709543a9b7c1fdf3 Mon Sep 17 00:00:00 2001 From: yiyixuxu Date: Thu, 6 Jul 2023 06:17:58 +0000 Subject: [PATCH 40/51] copy from for img2img --- .../pipelines/kandinsky2_2/pipeline_kandinsky2_2_img2img.py | 1 + 1 file changed, 1 insertion(+) diff --git a/src/diffusers/pipelines/kandinsky2_2/pipeline_kandinsky2_2_img2img.py b/src/diffusers/pipelines/kandinsky2_2/pipeline_kandinsky2_2_img2img.py index f52ec2c66ef1..96ab9c2ed0f1 100644 --- a/src/diffusers/pipelines/kandinsky2_2/pipeline_kandinsky2_2_img2img.py +++ b/src/diffusers/pipelines/kandinsky2_2/pipeline_kandinsky2_2_img2img.py @@ -85,6 +85,7 @@ def downscale_height_and_width(height, width, scale_factor=8): return new_height * scale_factor, new_width * scale_factor +# Copied from diffusers.pipelines.kandinsky.pipeline_kandinsky_img2img.prepare_image def prepare_image(pil_image, w=512, h=512): pil_image = pil_image.resize((w, h), resample=Image.BICUBIC, reducing_gap=1) arr = np.array(pil_image.convert("RGB")) From 39a49db7a5fc90958505b8b9cda101114f28a490 Mon Sep 17 00:00:00 2001 From: yiyixuxu Date: Thu, 6 Jul 2023 06:28:38 +0000 Subject: [PATCH 41/51] copied from for inpaint --- .../pipelines/kandinsky/pipeline_kandinsky_inpaint.py | 1 + .../kandinsky2_2/pipeline_kandinsky2_2_inpainting.py | 5 ++++- 2 files changed, 5 insertions(+), 1 deletion(-) diff --git a/src/diffusers/pipelines/kandinsky/pipeline_kandinsky_inpaint.py b/src/diffusers/pipelines/kandinsky/pipeline_kandinsky_inpaint.py index 04810ddb6e0a..75e58386cff0 100644 --- a/src/diffusers/pipelines/kandinsky/pipeline_kandinsky_inpaint.py +++ b/src/diffusers/pipelines/kandinsky/pipeline_kandinsky_inpaint.py @@ -275,6 +275,7 @@ def __init__( ) self.movq_scale_factor = 2 ** (len(self.movq.config.block_out_channels) - 1) + # Copied from diffusers.pipelines.unclip.pipeline_unclip.UnCLIPPipeline.prepare_latents def prepare_latents(self, shape, dtype, device, generator, latents, scheduler): if latents is None: latents = randn_tensor(shape, generator=generator, device=device, dtype=dtype) diff --git a/src/diffusers/pipelines/kandinsky2_2/pipeline_kandinsky2_2_inpainting.py b/src/diffusers/pipelines/kandinsky2_2/pipeline_kandinsky2_2_inpainting.py index 9a7e00c380d6..ef6d353dd665 100644 --- a/src/diffusers/pipelines/kandinsky2_2/pipeline_kandinsky2_2_inpainting.py +++ b/src/diffusers/pipelines/kandinsky2_2/pipeline_kandinsky2_2_inpainting.py @@ -92,6 +92,7 @@ def downscale_height_and_width(height, width, scale_factor=8): return new_height * scale_factor, new_width * scale_factor +# Copied from diffusers.pipelines.kandinsky.pipeline_kandinsky_inpaint.prepare_masks def prepare_mask(masks): prepared_masks = [] for mask in masks: @@ -116,6 +117,7 @@ def prepare_mask(masks): return torch.stack(prepared_masks, dim=0) +# Copied from diffusers.pipelines.kandinsky.pipeline_kandinsky_inpaint.prepare_mask_and_masked_image def prepare_mask_and_masked_image(image, mask, height, width): r""" Prepares a pair (mask, image) to be consumed by the Kandinsky inpaint pipeline. 
This means that those inputs will @@ -262,7 +264,8 @@ def __init__( movq=movq, ) self.movq_scale_factor = 2 ** (len(self.movq.config.block_out_channels) - 1) - + + # Copied from diffusers.pipelines.unclip.pipeline_unclip.UnCLIPPipeline.prepare_latents def prepare_latents(self, shape, dtype, device, generator, latents, scheduler): if latents is None: latents = randn_tensor(shape, generator=generator, device=device, dtype=dtype) From 9586192a41e2cf13543c99051c0226a36f16142d Mon Sep 17 00:00:00 2001 From: yiyixuxu Date: Thu, 6 Jul 2023 06:44:40 +0000 Subject: [PATCH 42/51] more copied from --- .../kandinsky2_2/pipeline_kandinsky2_2_controlnet.py | 1 + .../kandinsky2_2/pipeline_kandinsky2_2_controlnet_img2img.py | 4 ++++ .../pipelines/kandinsky2_2/pipeline_kandinsky2_2_img2img.py | 1 + .../kandinsky2_2/pipeline_kandinsky2_2_inpainting.py | 4 ++-- 4 files changed, 8 insertions(+), 2 deletions(-) diff --git a/src/diffusers/pipelines/kandinsky2_2/pipeline_kandinsky2_2_controlnet.py b/src/diffusers/pipelines/kandinsky2_2/pipeline_kandinsky2_2_controlnet.py index aeef2a7a93fc..2c5f3019d662 100644 --- a/src/diffusers/pipelines/kandinsky2_2/pipeline_kandinsky2_2_controlnet.py +++ b/src/diffusers/pipelines/kandinsky2_2/pipeline_kandinsky2_2_controlnet.py @@ -138,6 +138,7 @@ def __init__( ) self.movq_scale_factor = 2 ** (len(self.movq.config.block_out_channels) - 1) + # Copied from diffusers.pipelines.unclip.pipeline_unclip.UnCLIPPipeline.prepare_latents def prepare_latents(self, shape, dtype, device, generator, latents, scheduler): if latents is None: latents = randn_tensor(shape, generator=generator, device=device, dtype=dtype) diff --git a/src/diffusers/pipelines/kandinsky2_2/pipeline_kandinsky2_2_controlnet_img2img.py b/src/diffusers/pipelines/kandinsky2_2/pipeline_kandinsky2_2_controlnet_img2img.py index da412a414e6f..849560420770 100644 --- a/src/diffusers/pipelines/kandinsky2_2/pipeline_kandinsky2_2_controlnet_img2img.py +++ b/src/diffusers/pipelines/kandinsky2_2/pipeline_kandinsky2_2_controlnet_img2img.py @@ -111,6 +111,7 @@ def downscale_height_and_width(height, width, scale_factor=8): return new_height * scale_factor, new_width * scale_factor +# Copied from diffusers.pipelines.kandinsky.pipeline_kandinsky_img2img.prepare_image def prepare_image(pil_image, w=512, h=512): pil_image = pil_image.resize((w, h), resample=Image.BICUBIC, reducing_gap=1) arr = np.array(pil_image.convert("RGB")) @@ -151,6 +152,7 @@ def __init__( ) self.movq_scale_factor = 2 ** (len(self.movq.config.block_out_channels) - 1) + # Copied from diffusers.pipelines.kandinsky.pipeline_kandinsky_img2img.KandinskyImg2ImgPipeline.get_timesteps def get_timesteps(self, num_inference_steps, strength, device): # get the original timestep using init_timestep init_timestep = min(int(num_inference_steps * strength), num_inference_steps) @@ -160,6 +162,7 @@ def get_timesteps(self, num_inference_steps, strength, device): return timesteps, num_inference_steps - t_start + # Copied from diffusers.pipelines.kandinsky2_2.pipeline_kandinsky2_2_img2img.KandinskyV22Img2ImgPipeline.prepare_latents def prepare_latents(self, image, timestep, batch_size, num_images_per_prompt, dtype, device, generator=None): if not isinstance(image, (torch.Tensor, PIL.Image.Image, list)): raise ValueError( @@ -197,6 +200,7 @@ def prepare_latents(self, image, timestep, batch_size, num_images_per_prompt, dt # get latents init_latents = self.scheduler.add_noise(init_latents, noise, timestep) + latents = init_latents return latents diff --git 
a/src/diffusers/pipelines/kandinsky2_2/pipeline_kandinsky2_2_img2img.py b/src/diffusers/pipelines/kandinsky2_2/pipeline_kandinsky2_2_img2img.py index 96ab9c2ed0f1..95a4b89a34cc 100644 --- a/src/diffusers/pipelines/kandinsky2_2/pipeline_kandinsky2_2_img2img.py +++ b/src/diffusers/pipelines/kandinsky2_2/pipeline_kandinsky2_2_img2img.py @@ -126,6 +126,7 @@ def __init__( ) self.movq_scale_factor = 2 ** (len(self.movq.config.block_out_channels) - 1) + # Copied from diffusers.pipelines.kandinsky.pipeline_kandinsky_img2img.KandinskyImg2ImgPipeline.get_timesteps def get_timesteps(self, num_inference_steps, strength, device): # get the original timestep using init_timestep init_timestep = min(int(num_inference_steps * strength), num_inference_steps) diff --git a/src/diffusers/pipelines/kandinsky2_2/pipeline_kandinsky2_2_inpainting.py b/src/diffusers/pipelines/kandinsky2_2/pipeline_kandinsky2_2_inpainting.py index ef6d353dd665..8cd8df4de61d 100644 --- a/src/diffusers/pipelines/kandinsky2_2/pipeline_kandinsky2_2_inpainting.py +++ b/src/diffusers/pipelines/kandinsky2_2/pipeline_kandinsky2_2_inpainting.py @@ -92,7 +92,7 @@ def downscale_height_and_width(height, width, scale_factor=8): return new_height * scale_factor, new_width * scale_factor -# Copied from diffusers.pipelines.kandinsky.pipeline_kandinsky_inpaint.prepare_masks +# Copied from diffusers.pipelines.kandinsky.pipeline_kandinsky_inpaint.prepare_mask def prepare_mask(masks): prepared_masks = [] for mask in masks: @@ -264,7 +264,7 @@ def __init__( movq=movq, ) self.movq_scale_factor = 2 ** (len(self.movq.config.block_out_channels) - 1) - + # Copied from diffusers.pipelines.unclip.pipeline_unclip.UnCLIPPipeline.prepare_latents def prepare_latents(self, shape, dtype, device, generator, latents, scheduler): if latents is None: From 16440d846f30ec02c643a76f3180f328dfaaeb00 Mon Sep 17 00:00:00 2001 From: yiyixuxu Date: Thu, 6 Jul 2023 06:49:00 +0000 Subject: [PATCH 43/51] more copies from --- .../pipelines/kandinsky2_2/pipeline_kandinsky2_2_controlnet.py | 1 + .../kandinsky2_2/pipeline_kandinsky2_2_controlnet_img2img.py | 1 + .../pipelines/kandinsky2_2/pipeline_kandinsky2_2_img2img.py | 1 + .../pipelines/kandinsky2_2/pipeline_kandinsky2_2_inpainting.py | 1 + 4 files changed, 4 insertions(+) diff --git a/src/diffusers/pipelines/kandinsky2_2/pipeline_kandinsky2_2_controlnet.py b/src/diffusers/pipelines/kandinsky2_2/pipeline_kandinsky2_2_controlnet.py index 2c5f3019d662..0da05ada003d 100644 --- a/src/diffusers/pipelines/kandinsky2_2/pipeline_kandinsky2_2_controlnet.py +++ b/src/diffusers/pipelines/kandinsky2_2/pipeline_kandinsky2_2_controlnet.py @@ -150,6 +150,7 @@ def prepare_latents(self, shape, dtype, device, generator, latents, scheduler): latents = latents * scheduler.init_noise_sigma return latents + # Copied from diffusers.pipelines.kandinsky2_2.pipeline_kandinsky2_2.KandinskyV22Pipeline.enable_sequential_cpu_offload def enable_sequential_cpu_offload(self, gpu_id=0): r""" Offloads all models to CPU using accelerate, significantly reducing memory usage. 
When called, the pipeline's diff --git a/src/diffusers/pipelines/kandinsky2_2/pipeline_kandinsky2_2_controlnet_img2img.py b/src/diffusers/pipelines/kandinsky2_2/pipeline_kandinsky2_2_controlnet_img2img.py index 849560420770..f9b3103f1205 100644 --- a/src/diffusers/pipelines/kandinsky2_2/pipeline_kandinsky2_2_controlnet_img2img.py +++ b/src/diffusers/pipelines/kandinsky2_2/pipeline_kandinsky2_2_controlnet_img2img.py @@ -205,6 +205,7 @@ def prepare_latents(self, image, timestep, batch_size, num_images_per_prompt, dt return latents + # Copied from diffusers.pipelines.kandinsky2_2.pipeline_kandinsky2_2.KandinskyV22Pipeline.enable_sequential_cpu_offload def enable_sequential_cpu_offload(self, gpu_id=0): r""" Offloads all models to CPU using accelerate, significantly reducing memory usage. When called, the pipeline's diff --git a/src/diffusers/pipelines/kandinsky2_2/pipeline_kandinsky2_2_img2img.py b/src/diffusers/pipelines/kandinsky2_2/pipeline_kandinsky2_2_img2img.py index 95a4b89a34cc..ae7c2a92fa7d 100644 --- a/src/diffusers/pipelines/kandinsky2_2/pipeline_kandinsky2_2_img2img.py +++ b/src/diffusers/pipelines/kandinsky2_2/pipeline_kandinsky2_2_img2img.py @@ -178,6 +178,7 @@ def prepare_latents(self, image, timestep, batch_size, num_images_per_prompt, dt return latents + # Copied from diffusers.pipelines.kandinsky2_2.pipeline_kandinsky2_2.KandinskyV22Pipeline.enable_sequential_cpu_offload def enable_sequential_cpu_offload(self, gpu_id=0): r""" Offloads all models to CPU using accelerate, significantly reducing memory usage. When called, the pipeline's diff --git a/src/diffusers/pipelines/kandinsky2_2/pipeline_kandinsky2_2_inpainting.py b/src/diffusers/pipelines/kandinsky2_2/pipeline_kandinsky2_2_inpainting.py index 8cd8df4de61d..d169dadccaa9 100644 --- a/src/diffusers/pipelines/kandinsky2_2/pipeline_kandinsky2_2_inpainting.py +++ b/src/diffusers/pipelines/kandinsky2_2/pipeline_kandinsky2_2_inpainting.py @@ -277,6 +277,7 @@ def prepare_latents(self, shape, dtype, device, generator, latents, scheduler): latents = latents * scheduler.init_noise_sigma return latents + # Copied from diffusers.pipelines.kandinsky2_2.pipeline_kandinsky2_2.KandinskyV22Pipeline.enable_sequential_cpu_offload def enable_sequential_cpu_offload(self, gpu_id=0): r""" Offloads all models to CPU using accelerate, significantly reducing memory usage. 
When called, the pipeline's From 145ef684b331d20d3ac62f6c4d4cbf8165ad5c93 Mon Sep 17 00:00:00 2001 From: yiyixuxu Date: Thu, 6 Jul 2023 06:54:14 +0000 Subject: [PATCH 44/51] more copies --- .../pipelines/kandinsky2_2/pipeline_kandinsky2_2_controlnet.py | 1 + .../kandinsky2_2/pipeline_kandinsky2_2_controlnet_img2img.py | 1 + .../pipelines/kandinsky2_2/pipeline_kandinsky2_2_img2img.py | 1 + .../pipelines/kandinsky2_2/pipeline_kandinsky2_2_inpainting.py | 1 + 4 files changed, 4 insertions(+) diff --git a/src/diffusers/pipelines/kandinsky2_2/pipeline_kandinsky2_2_controlnet.py b/src/diffusers/pipelines/kandinsky2_2/pipeline_kandinsky2_2_controlnet.py index 0da05ada003d..65210fc2ce1c 100644 --- a/src/diffusers/pipelines/kandinsky2_2/pipeline_kandinsky2_2_controlnet.py +++ b/src/diffusers/pipelines/kandinsky2_2/pipeline_kandinsky2_2_controlnet.py @@ -172,6 +172,7 @@ def enable_sequential_cpu_offload(self, gpu_id=0): if cpu_offloaded_model is not None: cpu_offload(cpu_offloaded_model, device) + # Copied from diffusers.pipelines.kandinsky2_2.pipeline_kandinsky2_2.KandinskyV22Pipeline.enable_model_cpu_offload def enable_model_cpu_offload(self, gpu_id=0): r""" Offloads all models to CPU using accelerate, reducing memory usage with a low impact on performance. Compared diff --git a/src/diffusers/pipelines/kandinsky2_2/pipeline_kandinsky2_2_controlnet_img2img.py b/src/diffusers/pipelines/kandinsky2_2/pipeline_kandinsky2_2_controlnet_img2img.py index f9b3103f1205..353935926fb0 100644 --- a/src/diffusers/pipelines/kandinsky2_2/pipeline_kandinsky2_2_controlnet_img2img.py +++ b/src/diffusers/pipelines/kandinsky2_2/pipeline_kandinsky2_2_controlnet_img2img.py @@ -227,6 +227,7 @@ def enable_sequential_cpu_offload(self, gpu_id=0): if cpu_offloaded_model is not None: cpu_offload(cpu_offloaded_model, device) + # Copied from diffusers.pipelines.kandinsky2_2.pipeline_kandinsky2_2.KandinskyV22Pipeline.enable_model_cpu_offload def enable_model_cpu_offload(self, gpu_id=0): r""" Offloads all models to CPU using accelerate, reducing memory usage with a low impact on performance. Compared diff --git a/src/diffusers/pipelines/kandinsky2_2/pipeline_kandinsky2_2_img2img.py b/src/diffusers/pipelines/kandinsky2_2/pipeline_kandinsky2_2_img2img.py index ae7c2a92fa7d..6274f57b544d 100644 --- a/src/diffusers/pipelines/kandinsky2_2/pipeline_kandinsky2_2_img2img.py +++ b/src/diffusers/pipelines/kandinsky2_2/pipeline_kandinsky2_2_img2img.py @@ -200,6 +200,7 @@ def enable_sequential_cpu_offload(self, gpu_id=0): if cpu_offloaded_model is not None: cpu_offload(cpu_offloaded_model, device) + # Copied from diffusers.pipelines.kandinsky2_2.pipeline_kandinsky2_2.KandinskyV22Pipeline.enable_model_cpu_offload def enable_model_cpu_offload(self, gpu_id=0): r""" Offloads all models to CPU using accelerate, reducing memory usage with a low impact on performance. 
Compared diff --git a/src/diffusers/pipelines/kandinsky2_2/pipeline_kandinsky2_2_inpainting.py b/src/diffusers/pipelines/kandinsky2_2/pipeline_kandinsky2_2_inpainting.py index d169dadccaa9..3f07c5a27491 100644 --- a/src/diffusers/pipelines/kandinsky2_2/pipeline_kandinsky2_2_inpainting.py +++ b/src/diffusers/pipelines/kandinsky2_2/pipeline_kandinsky2_2_inpainting.py @@ -299,6 +299,7 @@ def enable_sequential_cpu_offload(self, gpu_id=0): if cpu_offloaded_model is not None: cpu_offload(cpu_offloaded_model, device) + # Copied from diffusers.pipelines.kandinsky2_2.pipeline_kandinsky2_2.KandinskyV22Pipeline.enable_model_cpu_offload def enable_model_cpu_offload(self, gpu_id=0): r""" Offloads all models to CPU using accelerate, reducing memory usage with a low impact on performance. Compared From ff1a2048091d9798c1d8c4c8394fde9c147b70cb Mon Sep 17 00:00:00 2001 From: yiyixuxu Date: Thu, 6 Jul 2023 08:23:29 +0000 Subject: [PATCH 45/51] remove the yiyi comments --- tests/pipelines/kandinsky_v22/test_kandinsky.py | 5 +---- tests/pipelines/kandinsky_v22/test_kandinsky_controlnet.py | 5 +---- .../kandinsky_v22/test_kandinsky_controlnet_img2img.py | 5 +---- tests/pipelines/kandinsky_v22/test_kandinsky_img2img.py | 5 +---- tests/pipelines/kandinsky_v22/test_kandinsky_inpaint.py | 5 +---- 5 files changed, 5 insertions(+), 20 deletions(-) diff --git a/tests/pipelines/kandinsky_v22/test_kandinsky.py b/tests/pipelines/kandinsky_v22/test_kandinsky.py index fcecf15fa567..162c96d4b3e2 100644 --- a/tests/pipelines/kandinsky_v22/test_kandinsky.py +++ b/tests/pipelines/kandinsky_v22/test_kandinsky.py @@ -223,10 +223,7 @@ def test_kandinsky_text2img(self): pipe_prior.to(torch_device) pipeline = KandinskyV22Pipeline.from_pretrained( - # YiYi notes: for testing only, remember to send a PR to kandinsky repo and remove this before merge - "/home/yiyi_huggingface_co/model_repo/kandinsky-2-2-decoder", - torch_dtype=torch.float16 - # "kandinsky-community/kandinsky-2-2-decoder", torch_dtype=torch.float16 + "kandinsky-community/kandinsky-2-2-decoder", torch_dtype=torch.float16 ) pipeline = pipeline.to(torch_device) pipeline.set_progress_bar_config(disable=None) diff --git a/tests/pipelines/kandinsky_v22/test_kandinsky_controlnet.py b/tests/pipelines/kandinsky_v22/test_kandinsky_controlnet.py index b30210697bb1..a50bdb50a47b 100644 --- a/tests/pipelines/kandinsky_v22/test_kandinsky_controlnet.py +++ b/tests/pipelines/kandinsky_v22/test_kandinsky_controlnet.py @@ -240,10 +240,7 @@ def test_kandinsky_controlnet(self): pipe_prior.to(torch_device) pipeline = KandinskyV22ControlnetPipeline.from_pretrained( - # YiYi notes: for testing only, remember to send a PR to kandinsky repo and remove this before merge - "/home/yiyi_huggingface_co/model_repo/kandinsky-2-2-controlnet-depth", - torch_dtype=torch.float16 - # "kandinsky-community/kandinsky-2-2-controlnet-depth", torch_dtype=torch.float16 + "kandinsky-community/kandinsky-2-2-controlnet-depth", torch_dtype=torch.float16 ) pipeline = pipeline.to(torch_device) pipeline.set_progress_bar_config(disable=None) diff --git a/tests/pipelines/kandinsky_v22/test_kandinsky_controlnet_img2img.py b/tests/pipelines/kandinsky_v22/test_kandinsky_controlnet_img2img.py index a917e3dbf901..9ff2936cbd72 100644 --- a/tests/pipelines/kandinsky_v22/test_kandinsky_controlnet_img2img.py +++ b/tests/pipelines/kandinsky_v22/test_kandinsky_controlnet_img2img.py @@ -254,10 +254,7 @@ def test_kandinsky_controlnet_img2img(self): pipe_prior.to(torch_device) pipeline = 
KandinskyV22ControlnetImg2ImgPipeline.from_pretrained( - # YiYi notes: for testing only, remember to send a PR to kandinsky repo and remove this before merge - "/home/yiyi_huggingface_co/model_repo/kandinsky-2-2-controlnet-depth", - torch_dtype=torch.float16 - # "kandinsky-community/kandinsky-2-2-controlnet-depth", torch_dtype=torch.float16 + "kandinsky-community/kandinsky-2-2-controlnet-depth", torch_dtype=torch.float16 ) pipeline = pipeline.to(torch_device) diff --git a/tests/pipelines/kandinsky_v22/test_kandinsky_img2img.py b/tests/pipelines/kandinsky_v22/test_kandinsky_img2img.py index 9fc40fe7bc48..069854325fd4 100644 --- a/tests/pipelines/kandinsky_v22/test_kandinsky_img2img.py +++ b/tests/pipelines/kandinsky_v22/test_kandinsky_img2img.py @@ -244,10 +244,7 @@ def test_kandinsky_img2img(self): pipe_prior.to(torch_device) pipeline = KandinskyV22Img2ImgPipeline.from_pretrained( - # YiYi notes: for testing only, remember to send a PR to kandinsky repo and remove this before merge - "/home/yiyi_huggingface_co/model_repo/kandinsky-2-2-decoder", - torch_dtype=torch.float16 - # "kandinsky-community/kandinsky-2-2-decoder", torch_dtype=torch.float16 + "kandinsky-community/kandinsky-2-2-decoder", torch_dtype=torch.float16 ) pipeline = pipeline.to(torch_device) diff --git a/tests/pipelines/kandinsky_v22/test_kandinsky_inpaint.py b/tests/pipelines/kandinsky_v22/test_kandinsky_inpaint.py index fe32f44a4a39..9be3993acc6f 100644 --- a/tests/pipelines/kandinsky_v22/test_kandinsky_inpaint.py +++ b/tests/pipelines/kandinsky_v22/test_kandinsky_inpaint.py @@ -255,10 +255,7 @@ def test_kandinsky_inpaint(self): pipe_prior.to(torch_device) pipeline = KandinskyV22InpaintPipeline.from_pretrained( - # YiYi notes: for testing only, remember to send a PR to kandinsky repo and remove this before merge - "/home/yiyi_huggingface_co/model_repo/kandinsky-2-2-decoder-inpaint", - torch_dtype=torch.float16 - # "kandinsky-community/kandinsky-2-2-decoder-inpaint", torch_dtype=torch.float16 + "kandinsky-community/kandinsky-2-2-decoder-inpaint", torch_dtype=torch.float16 ) pipeline = pipeline.to(torch_device) pipeline.set_progress_bar_config(disable=None) From 060488e7904eaaa207865b2e8309eab35be594cc Mon Sep 17 00:00:00 2001 From: Pedro Cuenca Date: Thu, 6 Jul 2023 13:28:00 +0200 Subject: [PATCH 46/51] Apply suggestions from code review --- docs/source/en/api/pipelines/kandinsky.mdx | 22 +++++++++---------- src/diffusers/models/unet_2d_condition.py | 14 ++++++------ .../kandinsky2_2/pipeline_kandinsky2_2.py | 3 --- .../pipeline_kandinsky2_2_controlnet.py | 3 --- ...ipeline_kandinsky2_2_controlnet_img2img.py | 3 --- .../pipeline_kandinsky2_2_img2img.py | 3 --- .../pipeline_kandinsky2_2_inpainting.py | 3 --- .../versatile_diffusion/modeling_text_unet.py | 14 ++++++------ 8 files changed, 25 insertions(+), 40 deletions(-) diff --git a/docs/source/en/api/pipelines/kandinsky.mdx b/docs/source/en/api/pipelines/kandinsky.mdx index 91210fd78b59..a722a4ba36c6 100644 --- a/docs/source/en/api/pipelines/kandinsky.mdx +++ b/docs/source/en/api/pipelines/kandinsky.mdx @@ -11,11 +11,11 @@ specific language governing permissions and limitations under the License. ## Overview -Kandinsky inherits best practices from [DALL-E 2](https://arxiv.org/abs/2204.06125) and [Latent Diffusion](https://huggingface.co/docs/diffusers/api/pipelines/latent_diffusion), while introducing some new ideas. 
+Kandinsky inherits best practices from [DALL-E 2](https://huggingface.co/papers/2204.06125) and [Latent Diffusion](https://huggingface.co/docs/diffusers/api/pipelines/latent_diffusion), while introducing some new ideas. It uses [CLIP](https://huggingface.co/docs/transformers/model_doc/clip) for encoding images and text, and a diffusion image prior (mapping) between latent spaces of CLIP modalities. This approach enhances the visual performance of the model and unveils new horizons in blending images and text-guided image manipulation. -The Kandinsky model is created by [Arseniy Shakhmatov](https://github.com/cene555), [Anton Razzhigaev](https://github.com/razzant), [Aleksandr Nikolich](https://github.com/AlexWortega), [Igor Pavlov](https://github.com/boomb0om), [Andrey Kuznetsov](https://github.com/kuznetsoffandrey) and [Denis Dimitrov](https://github.com/denndimitrov) and the original codebase can be found [here](https://github.com/ai-forever/Kandinsky-2) +The Kandinsky model is created by [Arseniy Shakhmatov](https://github.com/cene555), [Anton Razzhigaev](https://github.com/razzant), [Aleksandr Nikolich](https://github.com/AlexWortega), [Igor Pavlov](https://github.com/boomb0om), [Andrey Kuznetsov](https://github.com/kuznetsoffandrey) and [Denis Dimitrov](https://github.com/denndimitrov). The original codebase can be found [here](https://github.com/ai-forever/Kandinsky-2) ## Usage example @@ -287,12 +287,12 @@ First, let's take an image and extract its depth map. from diffusers.utils import load_image img = load_image( - "https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main" "/kandinsky/cat.png" + "https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main/kandinskyv22/cat.png" ).resize((768, 768)) ``` ![img](https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main/kandinskyv22/cat.png) -We can use `depth-estimation` pipeline from transformers to process the image and retrieve its depth map +We can use the `depth-estimation` pipeline from transformers to process the image and retrieve its depth map. ```python import torch @@ -344,7 +344,7 @@ image_emb, zero_image_emb = pipe_prior( ).to_tuple() ``` -Now we can pass the image embeddings and the depth image we extracted to the controlnet pipeline. Remember that with Kandinsky 2.2, only prior pipelines accept `prompt` input. You do not need to pass the prompt to controlnet pipeline. +Now we can pass the image embeddings and the depth image we extracted to the controlnet pipeline. With Kandinsky 2.2, only prior pipelines accept `prompt` input. You do not need to pass the prompt to the controlnet pipeline. ```python images = pipe( @@ -365,9 +365,9 @@ The output image looks as follow: ### Image-to-Image Generation with ControlNet Conditioning -Kandinsky 2.2 also includes an [`KandinskyV22ControlnetImg2ImgPipeline`] that will allow you to add control to the image generation process with both the image and its depth map. This pipeline works really well with [`KandinskyV22PriorEmb2EmbPipeline`], which generates image embeddings based on both a text prompt and an image. +Kandinsky 2.2 also includes a [`KandinskyV22ControlnetImg2ImgPipeline`] that will allow you to add control to the image generation process with both the image and its depth map. This pipeline works really well with [`KandinskyV22PriorEmb2EmbPipeline`], which generates image embeddings based on both a text prompt and an image. 
-For our robot cat example, we will pass the prompt and cat image together to prior pipeline to generate an image embedding. We will then use that image embedding and the depth map of the cat to further control the image generation process. +For our robot cat example, we will pass the prompt and cat image together to the prior pipeline to generate an image embedding. We will then use that image embedding and the depth map of the cat to further control the image generation process. We can use the same cat image and its depth map from the last example. @@ -414,13 +414,13 @@ images = pipe( images[0].save("robot_cat.png") ``` -Here is the output. Compared with the output from our text-to-image controlnet example, it kept a lot more cat facial details from the original images and worked into the robot style we asked for. +Here is the output. Compared with the output from our text-to-image controlnet example, it kept a lot more cat facial details from the original image and worked into the robot style we asked for. ![img](https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main/kandinskyv22/robot_cat.png) ## Kandinsky 2.2 -The Kandinsky 2.2 release includes robust new text-to-image models that support text-to-image generation, image-to-image generation, image interpolation, and text-guided image inpainting. The general workflow to perform these tasks using Kandinsky 2.2 is the same as Kandinsky 2.1. First, you will need to use a prior pipeline to generate image embedding based on your text prompt and then use one of the image decoding pipelines to generate the image. The only difference is that in Kandinsky 2.2, all of the decoding pipelines no longer accept the `prompt` input, and the image generation process is conditioned with only `image_embeds` and `negative_image_embeds`. +The Kandinsky 2.2 release includes robust new text-to-image models that support text-to-image generation, image-to-image generation, image interpolation, and text-guided image inpainting. The general workflow to perform these tasks using Kandinsky 2.2 is the same as in Kandinsky 2.1. First, you will need to use a prior pipeline to generate image embeddings based on your text prompt, and then use one of the image decoding pipelines to generate the output image. The only difference is that in Kandinsky 2.2, all of the decoding pipelines no longer accept the `prompt` input, and the image generation process is conditioned with only `image_embeds` and `negative_image_embeds`. Let's look at an example of how to perform text-to-image generation using Kandinsky 2.2. @@ -437,7 +437,7 @@ t2i_pipe = DiffusionPipeline.from_pretrained("kandinsky-community/kandinsky-2-2- t2i_pipe.to("cuda") ``` -You can then use `pipe_prior` to generate image embeddings the same way as you would with kandinsky 2.1 +You can then use `pipe_prior` to generate image embeddings. ```python prompt = "portrait of a women, blue eyes, cinematic" @@ -446,7 +446,7 @@ negative_prompt = "low quality, bad quality" image_embeds, negative_image_embeds = pipe_prior(prompt, guidance_scale=1.0).to_tuple() ``` -Now you can pass these embeddings to the text-to-image pipeline. Remember that in 2.2 you don't need to pass the `prompt`. +Now you can pass these embeddings to the text-to-image pipeline. When using Kandinsky 2.2 you don't need to pass the `prompt` (but you do with the previous version, Kandinsky 2.1). 
``` image = t2i_pipe(image_embeds=image_embeds, negative_image_embeds=negative_image_embeds, height=768, width=768).images[ diff --git a/src/diffusers/models/unet_2d_condition.py b/src/diffusers/models/unet_2d_condition.py index 8c761cec3d36..2a25b6933bd3 100644 --- a/src/diffusers/models/unet_2d_condition.py +++ b/src/diffusers/models/unet_2d_condition.py @@ -300,7 +300,7 @@ def __init__( cross_attention_dim=cross_attention_dim, ) elif encoder_hid_dim_type == "image_proj": - # Kadinsky 2.2 + # Kandinsky 2.2 self.encoder_hid_proj = ImageProjection( image_embed_dim=encoder_hid_dim, cross_attention_dim=cross_attention_dim, @@ -358,10 +358,10 @@ def __init__( text_embed_dim=cross_attention_dim, image_embed_dim=cross_attention_dim, time_embed_dim=time_embed_dim ) elif addition_embed_type == "image": - # Kadinsky 2.2 + # Kandinsky 2.2 self.add_embedding = ImageTimeEmbedding(image_embed_dim=encoder_hid_dim, time_embed_dim=time_embed_dim) elif addition_embed_type == "image_hint": - # Kadinsky 2.2 ControlNet + # Kandinsky 2.2 ControlNet self.add_embedding = ImageHintTimeEmbedding(image_embed_dim=encoder_hid_dim, time_embed_dim=time_embed_dim) elif addition_embed_type is not None: raise ValueError(f"addition_embed_type: {addition_embed_type} must be None, 'text' or 'text_image'.") @@ -806,7 +806,7 @@ def forward( aug_emb = self.add_embedding(text_embs, image_embs) emb = emb + aug_emb elif self.config.addition_embed_type == "image": - # Kadinsky 2.2 - style + # Kandinsky 2.2 - style if "image_embeds" not in added_cond_kwargs: raise ValueError( f"{self.__class__} has the config param `addition_embed_type` set to 'image' which requires the keyword argument `image_embeds` to be passed in `added_cond_kwargs`" @@ -816,10 +816,10 @@ def forward( aug_emb = self.add_embedding(image_embs) emb = emb + aug_emb elif self.config.addition_embed_type == "image_hint": - # Kadinsky 2.2 - style + # Kandinsky 2.2 - style if "image_embeds" not in added_cond_kwargs or "hint" not in added_cond_kwargs: raise ValueError( - f"{self.__class__} has the config param `addition_embed_type` set to 'image_hint' which requires the keyword argument `image_embeds` to be passed in `added_cond_kwargs`" + f"{self.__class__} has the config param `addition_embed_type` set to 'image_hint' which requires the keyword arguments `image_embeds` and `hint` to be passed in `added_cond_kwargs`" ) image_embs = added_cond_kwargs.get("image_embeds") hint = added_cond_kwargs.get("hint") @@ -841,7 +841,7 @@ def forward( image_embeds = added_cond_kwargs.get("image_embeds") encoder_hidden_states = self.encoder_hid_proj(encoder_hidden_states, image_embeds) elif self.encoder_hid_proj is not None and self.config.encoder_hid_dim_type == "image_proj": - # Kadinsky 2.2 - style + # Kandinsky 2.2 - style if "image_embeds" not in added_cond_kwargs: raise ValueError( f"{self.__class__} has the config param `encoder_hid_dim_type` set to 'image_proj' which requires the keyword argument `image_embeds` to be passed in `added_conditions`" diff --git a/src/diffusers/pipelines/kandinsky2_2/pipeline_kandinsky2_2.py b/src/diffusers/pipelines/kandinsky2_2/pipeline_kandinsky2_2.py index 59a2e7ccc837..4a116e1e600b 100644 --- a/src/diffusers/pipelines/kandinsky2_2/pipeline_kandinsky2_2.py +++ b/src/diffusers/pipelines/kandinsky2_2/pipeline_kandinsky2_2.py @@ -153,9 +153,6 @@ def enable_model_cpu_offload(self, gpu_id=0): for cpu_offloaded_model in [self.unet, self.movq]: _, hook = cpu_offload_with_hook(cpu_offloaded_model, device, prev_module_hook=hook) - if self.safety_checker 
is not None: - _, hook = cpu_offload_with_hook(self.safety_checker, device, prev_module_hook=hook) - # We'll offload the last model manually. self.final_offload_hook = hook diff --git a/src/diffusers/pipelines/kandinsky2_2/pipeline_kandinsky2_2_controlnet.py b/src/diffusers/pipelines/kandinsky2_2/pipeline_kandinsky2_2_controlnet.py index 65210fc2ce1c..73fc20b5e0f2 100644 --- a/src/diffusers/pipelines/kandinsky2_2/pipeline_kandinsky2_2_controlnet.py +++ b/src/diffusers/pipelines/kandinsky2_2/pipeline_kandinsky2_2_controlnet.py @@ -195,9 +195,6 @@ def enable_model_cpu_offload(self, gpu_id=0): for cpu_offloaded_model in [self.unet, self.movq]: _, hook = cpu_offload_with_hook(cpu_offloaded_model, device, prev_module_hook=hook) - if self.safety_checker is not None: - _, hook = cpu_offload_with_hook(self.safety_checker, device, prev_module_hook=hook) - # We'll offload the last model manually. self.final_offload_hook = hook diff --git a/src/diffusers/pipelines/kandinsky2_2/pipeline_kandinsky2_2_controlnet_img2img.py b/src/diffusers/pipelines/kandinsky2_2/pipeline_kandinsky2_2_controlnet_img2img.py index 353935926fb0..3e001e89e490 100644 --- a/src/diffusers/pipelines/kandinsky2_2/pipeline_kandinsky2_2_controlnet_img2img.py +++ b/src/diffusers/pipelines/kandinsky2_2/pipeline_kandinsky2_2_controlnet_img2img.py @@ -250,9 +250,6 @@ def enable_model_cpu_offload(self, gpu_id=0): for cpu_offloaded_model in [self.unet, self.movq]: _, hook = cpu_offload_with_hook(cpu_offloaded_model, device, prev_module_hook=hook) - if self.safety_checker is not None: - _, hook = cpu_offload_with_hook(self.safety_checker, device, prev_module_hook=hook) - # We'll offload the last model manually. self.final_offload_hook = hook diff --git a/src/diffusers/pipelines/kandinsky2_2/pipeline_kandinsky2_2_img2img.py b/src/diffusers/pipelines/kandinsky2_2/pipeline_kandinsky2_2_img2img.py index 6274f57b544d..0a5f77b0ff2d 100644 --- a/src/diffusers/pipelines/kandinsky2_2/pipeline_kandinsky2_2_img2img.py +++ b/src/diffusers/pipelines/kandinsky2_2/pipeline_kandinsky2_2_img2img.py @@ -223,9 +223,6 @@ def enable_model_cpu_offload(self, gpu_id=0): for cpu_offloaded_model in [self.unet, self.movq]: _, hook = cpu_offload_with_hook(cpu_offloaded_model, device, prev_module_hook=hook) - if self.safety_checker is not None: - _, hook = cpu_offload_with_hook(self.safety_checker, device, prev_module_hook=hook) - # We'll offload the last model manually. self.final_offload_hook = hook diff --git a/src/diffusers/pipelines/kandinsky2_2/pipeline_kandinsky2_2_inpainting.py b/src/diffusers/pipelines/kandinsky2_2/pipeline_kandinsky2_2_inpainting.py index 3f07c5a27491..151312979f81 100644 --- a/src/diffusers/pipelines/kandinsky2_2/pipeline_kandinsky2_2_inpainting.py +++ b/src/diffusers/pipelines/kandinsky2_2/pipeline_kandinsky2_2_inpainting.py @@ -322,9 +322,6 @@ def enable_model_cpu_offload(self, gpu_id=0): for cpu_offloaded_model in [self.unet, self.movq]: _, hook = cpu_offload_with_hook(cpu_offloaded_model, device, prev_module_hook=hook) - if self.safety_checker is not None: - _, hook = cpu_offload_with_hook(self.safety_checker, device, prev_module_hook=hook) - # We'll offload the last model manually. 
self.final_offload_hook = hook diff --git a/src/diffusers/pipelines/versatile_diffusion/modeling_text_unet.py b/src/diffusers/pipelines/versatile_diffusion/modeling_text_unet.py index 660fa7855fb5..577c609be332 100644 --- a/src/diffusers/pipelines/versatile_diffusion/modeling_text_unet.py +++ b/src/diffusers/pipelines/versatile_diffusion/modeling_text_unet.py @@ -405,7 +405,7 @@ def __init__( cross_attention_dim=cross_attention_dim, ) elif encoder_hid_dim_type == "image_proj": - # Kadinsky 2.2 + # Kandinsky 2.2 self.encoder_hid_proj = ImageProjection( image_embed_dim=encoder_hid_dim, cross_attention_dim=cross_attention_dim, @@ -463,10 +463,10 @@ def __init__( text_embed_dim=cross_attention_dim, image_embed_dim=cross_attention_dim, time_embed_dim=time_embed_dim ) elif addition_embed_type == "image": - # Kadinsky 2.2 + # Kandinsky 2.2 self.add_embedding = ImageTimeEmbedding(image_embed_dim=encoder_hid_dim, time_embed_dim=time_embed_dim) elif addition_embed_type == "image_hint": - # Kadinsky 2.2 ControlNet + # Kandinsky 2.2 ControlNet self.add_embedding = ImageHintTimeEmbedding(image_embed_dim=encoder_hid_dim, time_embed_dim=time_embed_dim) elif addition_embed_type is not None: raise ValueError(f"addition_embed_type: {addition_embed_type} must be None, 'text' or 'text_image'.") @@ -912,7 +912,7 @@ def forward( aug_emb = self.add_embedding(text_embs, image_embs) emb = emb + aug_emb elif self.config.addition_embed_type == "image": - # Kadinsky 2.2 - style + # Kandinsky 2.2 - style if "image_embeds" not in added_cond_kwargs: raise ValueError( f"{self.__class__} has the config param `addition_embed_type` set to 'image' which requires the" @@ -923,11 +923,11 @@ def forward( aug_emb = self.add_embedding(image_embs) emb = emb + aug_emb elif self.config.addition_embed_type == "image_hint": - # Kadinsky 2.2 - style + # Kandinsky 2.2 - style if "image_embeds" not in added_cond_kwargs or "hint" not in added_cond_kwargs: raise ValueError( f"{self.__class__} has the config param `addition_embed_type` set to 'image_hint' which requires" - " the keyword argument `image_embeds` to be passed in `added_cond_kwargs`" + " the keyword arguments `image_embeds` and `hint` to be passed in `added_cond_kwargs`" ) image_embs = added_cond_kwargs.get("image_embeds") hint = added_cond_kwargs.get("hint") @@ -950,7 +950,7 @@ def forward( image_embeds = added_cond_kwargs.get("image_embeds") encoder_hidden_states = self.encoder_hid_proj(encoder_hidden_states, image_embeds) elif self.encoder_hid_proj is not None and self.config.encoder_hid_dim_type == "image_proj": - # Kadinsky 2.2 - style + # Kandinsky 2.2 - style if "image_embeds" not in added_cond_kwargs: raise ValueError( f"{self.__class__} has the config param `encoder_hid_dim_type` set to 'image_proj' which requires" From fb3d0bb7c4d93de5ae5599e0dd32868808907fd7 Mon Sep 17 00:00:00 2001 From: Pedro Cuenca Date: Thu, 6 Jul 2023 13:59:07 +0200 Subject: [PATCH 47/51] Self-contained example, pipeline order --- docs/source/en/api/pipelines/kandinsky.mdx | 75 +++++++++++++--------- 1 file changed, 46 insertions(+), 29 deletions(-) diff --git a/docs/source/en/api/pipelines/kandinsky.mdx b/docs/source/en/api/pipelines/kandinsky.mdx index a722a4ba36c6..85aa212f2327 100644 --- a/docs/source/en/api/pipelines/kandinsky.mdx +++ b/docs/source/en/api/pipelines/kandinsky.mdx @@ -376,8 +376,26 @@ import torch import numpy as np from diffusers import KandinskyV22PriorEmb2EmbPipeline, KandinskyV22ControlnetImg2ImgPipeline +from diffusers.utils import load_image from transformers import 
pipeline +img = load_image( + "https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main" + "/kandinskyv22/cat.png" +).resize((768, 768)) + +def make_hint(image, depth_estimator): + image = depth_estimator(image)["depth"] + image = np.array(image) + image = image[:, :, None] + image = np.concatenate([image, image, image], axis=2) + detected_map = torch.from_numpy(image).float() / 255.0 + hint = detected_map.permute(2, 0, 1) + return hint + +depth_estimator = pipeline("depth-estimation") +hint = make_hint(img, depth_estimator).unsqueeze(0).half().to("cuda") + pipe_prior = KandinskyV22PriorEmb2EmbPipeline.from_pretrained( "kandinsky-community/kandinsky-2-2-prior", torch_dtype=torch.float16 ) @@ -515,42 +533,16 @@ feel free to have a look at [Our PyTorch 2.0 benchmark](https://huggingface.co/d | Pipeline | Tasks | |---|---| -| [pipeline_kandinsky.py](https://github.com/huggingface/diffusers/blob/main/src/diffusers/pipelines/kandinsky/pipeline_kandinsky.py) | *Text-to-Image Generation* | | [pipeline_kandinsky2_2.py](https://github.com/huggingface/diffusers/blob/main/src/diffusers/pipelines/kandinsky2_2/pipeline_kandinsky2_2.py) | *Text-to-Image Generation* | -| [pipeline_kandinsky_inpaint.py](https://github.com/huggingface/diffusers/blob/main/src/diffusers/pipelines/kandinsky/pipeline_kandinsky_inpaint.py) | *Image-Guided Image Generation* | +| [pipeline_kandinsky.py](https://github.com/huggingface/diffusers/blob/main/src/diffusers/pipelines/kandinsky/pipeline_kandinsky.py) | *Text-to-Image Generation* | | [pipeline_kandinsky2_2_inpaint.py](https://github.com/huggingface/diffusers/blob/main/src/diffusers/pipelines/kandinsky2_2/pipeline_kandinsky2_2_inpaint.py) | *Image-Guided Image Generation* | -| [pipeline_kandinsky_img2img.py](https://github.com/huggingface/diffusers/blob/main/src/diffusers/pipelines/kandinsky/pipeline_kandinsky_img2img.py) | *Image-Guided Image Generation* | +| [pipeline_kandinsky_inpaint.py](https://github.com/huggingface/diffusers/blob/main/src/diffusers/pipelines/kandinsky/pipeline_kandinsky_inpaint.py) | *Image-Guided Image Generation* | | [pipeline_kandinsky2_2_img2img.py](https://github.com/huggingface/diffusers/blob/main/src/diffusers/pipelines/kandinsky2_2/pipeline_kandinsky2_2_img2img.py) | *Image-Guided Image Generation* | +| [pipeline_kandinsky_img2img.py](https://github.com/huggingface/diffusers/blob/main/src/diffusers/pipelines/kandinsky/pipeline_kandinsky_img2img.py) | *Image-Guided Image Generation* | | [pipeline_kandinsky2_2_controlnet.py](https://github.com/huggingface/diffusers/blob/main/src/diffusers/pipelines/kandinsky2_2/pipeline_kandinsky2_2_controlnet.py) | *Image-Guided Image Generation* | | [pipeline_kandinsky2_2_controlnet_img2img.py](https://github.com/huggingface/diffusers/blob/main/src/diffusers/pipelines/kandinsky2_2/pipeline_kandinsky2_2_controlnet_img2img.py) | *Image-Guided Image Generation* | -### KandinskyPriorPipeline - -[[autodoc]] KandinskyPriorPipeline - - all - - __call__ - - interpolate - -### KandinskyPipeline - -[[autodoc]] KandinskyPipeline - - all - - __call__ - -### KandinskyImg2ImgPipeline - -[[autodoc]] KandinskyImg2ImgPipeline - - all - - __call__ - -### KandinskyInpaintPipeline - -[[autodoc]] KandinskyInpaintPipeline - - all - - __call__ - - ### KandinskyV22Pipeline [[autodoc]] KandinskyV22Pipeline @@ -594,3 +586,28 @@ feel free to have a look at [Our PyTorch 2.0 benchmark](https://huggingface.co/d - all - __call__ - interpolate + +### KandinskyPriorPipeline + +[[autodoc]] KandinskyPriorPipeline + - 
all + - __call__ + - interpolate + +### KandinskyPipeline + +[[autodoc]] KandinskyPipeline + - all + - __call__ + +### KandinskyImg2ImgPipeline + +[[autodoc]] KandinskyImg2ImgPipeline + - all + - __call__ + +### KandinskyInpaintPipeline + +[[autodoc]] KandinskyInpaintPipeline + - all + - __call__ From 6d5e70dcd0d95d97808b1d4e81d801a080828003 Mon Sep 17 00:00:00 2001 From: Pedro Cuenca Date: Thu, 6 Jul 2023 14:00:53 +0200 Subject: [PATCH 48/51] Import prior output instead of redefining. --- src/diffusers/pipelines/kandinsky/__init__.py | 2 +- .../pipeline_kandinsky2_2_prior.py | 19 +----------------- .../pipeline_kandinsky2_2_prior_emb2emb.py | 20 +------------------ 3 files changed, 3 insertions(+), 38 deletions(-) diff --git a/src/diffusers/pipelines/kandinsky/__init__.py b/src/diffusers/pipelines/kandinsky/__init__.py index c8eecba0c7f2..242ff799e529 100644 --- a/src/diffusers/pipelines/kandinsky/__init__.py +++ b/src/diffusers/pipelines/kandinsky/__init__.py @@ -15,5 +15,5 @@ from .pipeline_kandinsky import KandinskyPipeline from .pipeline_kandinsky_img2img import KandinskyImg2ImgPipeline from .pipeline_kandinsky_inpaint import KandinskyInpaintPipeline - from .pipeline_kandinsky_prior import KandinskyPriorPipeline + from .pipeline_kandinsky_prior import KandinskyPriorPipeline, KandinskyPriorPipelineOutput from .text_encoder import MultilingualCLIP diff --git a/src/diffusers/pipelines/kandinsky2_2/pipeline_kandinsky2_2_prior.py b/src/diffusers/pipelines/kandinsky2_2/pipeline_kandinsky2_2_prior.py index b2e8758bc170..3b9974a5dd70 100644 --- a/src/diffusers/pipelines/kandinsky2_2/pipeline_kandinsky2_2_prior.py +++ b/src/diffusers/pipelines/kandinsky2_2/pipeline_kandinsky2_2_prior.py @@ -1,7 +1,5 @@ -from dataclasses import dataclass from typing import List, Optional, Union -import numpy as np import PIL import torch from transformers import CLIPImageProcessor, CLIPTextModelWithProjection, CLIPTokenizer, CLIPVisionModelWithProjection @@ -10,12 +8,12 @@ from ...pipelines import DiffusionPipeline from ...schedulers import UnCLIPScheduler from ...utils import ( - BaseOutput, is_accelerate_available, logging, randn_tensor, replace_example_docstring, ) +from ..kandinsky import KandinskyPriorPipelineOutput logger = logging.get_logger(__name__) # pylint: disable=invalid-name @@ -84,21 +82,6 @@ """ -@dataclass -class KandinskyPriorPipelineOutput(BaseOutput): - """ - Args: - Output class for KandinskyPriorPipeline. 
- image_embeds (`torch.FloatTensor`) - clip image embeddings for text prompt - negative_image_embeds (`List[PIL.Image.Image]` or `np.ndarray`) - clip image embeddings for unconditional tokens - """ - - image_embeds: Union[torch.FloatTensor, np.ndarray] - negative_image_embeds: Union[torch.FloatTensor, np.ndarray] - - class KandinskyV22PriorPipeline(DiffusionPipeline): """ Pipeline for generating image prior for Kandinsky diff --git a/src/diffusers/pipelines/kandinsky2_2/pipeline_kandinsky2_2_prior_emb2emb.py b/src/diffusers/pipelines/kandinsky2_2/pipeline_kandinsky2_2_prior_emb2emb.py index e442289b6fc5..ae46af9c4551 100644 --- a/src/diffusers/pipelines/kandinsky2_2/pipeline_kandinsky2_2_prior_emb2emb.py +++ b/src/diffusers/pipelines/kandinsky2_2/pipeline_kandinsky2_2_prior_emb2emb.py @@ -1,7 +1,5 @@ -from dataclasses import dataclass from typing import List, Optional, Union -import numpy as np import PIL import torch from transformers import CLIPImageProcessor, CLIPTextModelWithProjection, CLIPTokenizer, CLIPVisionModelWithProjection @@ -10,12 +8,12 @@ from ...pipelines import DiffusionPipeline from ...schedulers import UnCLIPScheduler from ...utils import ( - BaseOutput, is_accelerate_available, logging, randn_tensor, replace_example_docstring, ) +from ..kandinsky import KandinskyPriorPipelineOutput logger = logging.get_logger(__name__) # pylint: disable=invalid-name @@ -102,22 +100,6 @@ """ -@dataclass -class KandinskyPriorPipelineOutput(BaseOutput): - """ - Output class for KandinskyPriorPipeline. - - Args: - image_embeds (`torch.FloatTensor`) - clip image embeddings for text prompt - negative_image_embeds (`List[PIL.Image.Image]` or `np.ndarray`) - clip image embeddings for unconditional tokens - """ - - image_embeds: Union[torch.FloatTensor, np.ndarray] - negative_image_embeds: Union[torch.FloatTensor, np.ndarray] - - class KandinskyV22PriorEmb2EmbPipeline(DiffusionPipeline): """ Pipeline for generating image prior for Kandinsky From 6a06ed477ecfdad90979595a87158efce460e52e Mon Sep 17 00:00:00 2001 From: Pedro Cuenca Date: Thu, 6 Jul 2023 14:02:39 +0200 Subject: [PATCH 49/51] Style --- docs/source/en/api/pipelines/kandinsky.mdx | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/docs/source/en/api/pipelines/kandinsky.mdx b/docs/source/en/api/pipelines/kandinsky.mdx index 85aa212f2327..6b6c64a08951 100644 --- a/docs/source/en/api/pipelines/kandinsky.mdx +++ b/docs/source/en/api/pipelines/kandinsky.mdx @@ -380,10 +380,10 @@ from diffusers.utils import load_image from transformers import pipeline img = load_image( - "https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main" - "/kandinskyv22/cat.png" + "https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main" "/kandinskyv22/cat.png" ).resize((768, 768)) + def make_hint(image, depth_estimator): image = depth_estimator(image)["depth"] image = np.array(image) @@ -393,6 +393,7 @@ def make_hint(image, depth_estimator): hint = detected_map.permute(2, 0, 1) return hint + depth_estimator = pipeline("depth-estimation") hint = make_hint(img, depth_estimator).unsqueeze(0).half().to("cuda") From 00c4981a7f98292189cbac38bd001799813b6c0c Mon Sep 17 00:00:00 2001 From: Pedro Cuenca Date: Thu, 6 Jul 2023 14:02:48 +0200 Subject: [PATCH 50/51] Make VQModel compatible with model offload. 
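Accelerate's offload hook only fires on `forward`, and the Kandinsky pipelines call
`VQModel.encode` / `VQModel.decode` directly, so with model offload enabled the
`movq` weights would still be on CPU when decoding. Decorating both entry points
with `apply_forward_hook` triggers the hook before the call runs, the same way
`AutoencoderKL` already handles it.

Rough usage sketch only (the decoder repo id and the availability of
`enable_model_cpu_offload` on the Kandinsky 2.2 pipelines are assumptions for
illustration, not asserted by this change):

    import torch
    from diffusers import KandinskyV22PriorPipeline, KandinskyV22Pipeline

    prior = KandinskyV22PriorPipeline.from_pretrained(
        "kandinsky-community/kandinsky-2-2-prior", torch_dtype=torch.float16
    )
    decoder = KandinskyV22Pipeline.from_pretrained(
        "kandinsky-community/kandinsky-2-2-decoder", torch_dtype=torch.float16
    )
    # Components stay on CPU and are moved to the GPU only while they run;
    # the VQModel decode step now participates via apply_forward_hook.
    prior.enable_model_cpu_offload()
    decoder.enable_model_cpu_offload()

    image_embeds, negative_image_embeds = prior("a red cat, 4k photo").to_tuple()
    image = decoder(
        image_embeds=image_embeds,
        negative_image_embeds=negative_image_embeds,
        height=768,
        width=768,
    ).images[0]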
--- src/diffusers/models/vq_model.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/src/diffusers/models/vq_model.py b/src/diffusers/models/vq_model.py index 73158294ee6e..bd98f5844127 100644 --- a/src/diffusers/models/vq_model.py +++ b/src/diffusers/models/vq_model.py @@ -18,7 +18,7 @@ import torch.nn as nn from ..configuration_utils import ConfigMixin, register_to_config -from ..utils import BaseOutput +from ..utils import BaseOutput, apply_forward_hook from .modeling_utils import ModelMixin from .vae import Decoder, DecoderOutput, Encoder, VectorQuantizer @@ -116,6 +116,7 @@ def __init__( norm_type=norm_type, ) + @apply_forward_hook def encode(self, x: torch.FloatTensor, return_dict: bool = True) -> VQEncoderOutput: h = self.encoder(x) h = self.quant_conv(h) @@ -125,6 +126,7 @@ def encode(self, x: torch.FloatTensor, return_dict: bool = True) -> VQEncoderOut return VQEncoderOutput(latents=h) + @apply_forward_hook def decode( self, h: torch.FloatTensor, force_not_quantize: bool = False, return_dict: bool = True ) -> Union[DecoderOutput, torch.FloatTensor]: From 63e779579ff176fa5f8ba47821e9e97f679e63a0 Mon Sep 17 00:00:00 2001 From: Pedro Cuenca Date: Thu, 6 Jul 2023 14:51:07 +0200 Subject: [PATCH 51/51] Fix copies --- src/diffusers/models/unet_2d_condition.py | 2 +- .../pipelines/versatile_diffusion/modeling_text_unet.py | 2 -- 2 files changed, 1 insertion(+), 3 deletions(-) diff --git a/src/diffusers/models/unet_2d_condition.py b/src/diffusers/models/unet_2d_condition.py index ed73e231a2e2..1f1d69c6042e 100644 --- a/src/diffusers/models/unet_2d_condition.py +++ b/src/diffusers/models/unet_2d_condition.py @@ -821,7 +821,7 @@ def forward( if self.config.addition_embed_type == "text": aug_emb = self.add_embedding(encoder_hidden_states) elif self.config.addition_embed_type == "text_image": - # Kadinsky 2.1 - style + # Kandinsky 2.1 - style if "image_embeds" not in added_cond_kwargs: raise ValueError( f"{self.__class__} has the config param `addition_embed_type` set to 'text_image' which requires the keyword argument `image_embeds` to be passed in `added_cond_kwargs`" diff --git a/src/diffusers/pipelines/versatile_diffusion/modeling_text_unet.py b/src/diffusers/pipelines/versatile_diffusion/modeling_text_unet.py index 70b4c8f75874..79e28a42f4c6 100644 --- a/src/diffusers/pipelines/versatile_diffusion/modeling_text_unet.py +++ b/src/diffusers/pipelines/versatile_diffusion/modeling_text_unet.py @@ -933,7 +933,6 @@ def forward( image_embs = added_cond_kwargs.get("image_embeds") text_embs = added_cond_kwargs.get("text_embeds", encoder_hidden_states) - aug_emb = self.add_embedding(text_embs, image_embs) elif self.config.addition_embed_type == "text_time": if "text_embeds" not in added_cond_kwargs: @@ -961,7 +960,6 @@ def forward( f"{self.__class__} has the config param `addition_embed_type` set to 'image' which requires the" " keyword argument `image_embeds` to be passed in `added_cond_kwargs`" ) - image_embs = added_cond_kwargs.get("image_embeds") aug_emb = self.add_embedding(image_embs) elif self.config.addition_embed_type == "image_hint":