From a3c8c1c78d10557c641d49077f6778711a8a1125 Mon Sep 17 00:00:00 2001 From: yiyixuxu Date: Wed, 2 Aug 2023 22:27:28 +0000 Subject: [PATCH 01/23] refactor image processor for mask --- src/diffusers/image_processor.py | 86 +++++++++++++++++++++++--------- 1 file changed, 62 insertions(+), 24 deletions(-) diff --git a/src/diffusers/image_processor.py b/src/diffusers/image_processor.py index 6ccf9b465ebd..44152a1dfb32 100644 --- a/src/diffusers/image_processor.py +++ b/src/diffusers/image_processor.py @@ -51,7 +51,7 @@ def __init__( vae_scale_factor: int = 8, resample: str = "lanczos", do_normalize: bool = True, - do_convert_rgb: bool = False, + color_mode: Optional[str] = None, # "RGB", "L" ): super().__init__() @@ -117,13 +117,36 @@ def denormalize(images): return (images / 2 + 0.5).clamp(0, 1) @staticmethod - def convert_to_rgb(image: PIL.Image.Image) -> PIL.Image.Image: + def convert_to_mode(image: PIL.Image.Image, mode: str) -> PIL.Image.Image: """ - Converts an image to RGB format. + Converts an image to RGB or L mode. """ - image = image.convert("RGB") + image = image.convert(mode) + + if mode == "L": + image = image.unsqueeze(0) return image + def get_default_height_width(self, image, height, width): + + if height is None: + if isinstance(image, PIL.Image.Image): + height = image.height + else: + height = image.shape[2] + + if width is None: + if isinstance(image, PIL.Image.Image): + width = image.width + else: + width = image.shape[3] + + width, height = ( + x - x % self.config.vae_scale_factor for x in (width, height) + ) # resize to integer multiple of vae_scale_factor + + return height, width + def resize( self, image: PIL.Image.Image, @@ -133,17 +156,9 @@ def resize( """ Resize a PIL image. Both height and width are downscaled to the next integer multiple of `vae_scale_factor`. """ - if height is None: - height = image.height - if width is None: - width = image.width - - width, height = ( - x - x % self.config.vae_scale_factor for x in (width, height) - ) # resize to integer multiple of vae_scale_factor image = image.resize((width, height), resample=PIL_INTERPOLATION[self.config.resample]) return image - + def preprocess( self, image: Union[torch.FloatTensor, PIL.Image.Image, np.ndarray], @@ -154,6 +169,22 @@ def preprocess( Preprocess the image input. Accepted formats are PIL images, NumPy arrays or PyTorch tensors. """ supported_formats = (PIL.Image.Image, np.ndarray, torch.Tensor) + + + height, width = self.get_default_height_width(image, height, width) + + # If the image input is a 3-dimensional pytorch tensor or numpy array that represent images in grayscale format, + # it could have 2 possible shapes: + # 1. batch x height x width: we should insert the channel dimension at position 1 + # 2. 
channnel x height x width: we should insert batch dimension at position 0, + # however, since both channel and batch dimension has same size 1, it is same to insert at position 1 + # for simplicity, we insert a dimension of size 1 at position 1 for both cases + if self.config.color_mode == "L" and isinstance(image, (torch.Tensor, np.ndarray)) and image.ndim == 3: + if isinstance(image, torch.Tensor): + image = image.unsqueeze(1) + else: + image = np.expand_dims(image, axis=1) + if isinstance(image, supported_formats): image = [image] elif not (isinstance(image, list) and all(isinstance(i, supported_formats) for i in image)): @@ -162,8 +193,8 @@ def preprocess( ) if isinstance(image[0], PIL.Image.Image): - if self.config.do_convert_rgb: - image = [self.convert_to_rgb(i) for i in image] + if self.config.color_mode is not None: + image = [self.convert_to_mode(i, self.config.color_mode) for i in image] if self.config.do_resize: image = [self.resize(i, height, width) for i in image] image = self.pil_to_numpy(image) # to np @@ -171,30 +202,37 @@ def preprocess( elif isinstance(image[0], np.ndarray): image = np.concatenate(image, axis=0) if image[0].ndim == 4 else np.stack(image, axis=0) + + if self.config.color_mode == "L" and if image.ndim == 3: + image = np.expand_dims(image, axis=1) + image = self.numpy_to_pt(image) - _, _, height, width = image.shape + if self.config.do_resize and ( - height % self.config.vae_scale_factor != 0 or width % self.config.vae_scale_factor != 0 + image.shape[2] != height or image.shape[3] != width ): raise ValueError( - f"Currently we only support resizing for PIL image - please resize your numpy array to be divisible by {self.config.vae_scale_factor}" - f"currently the sizes are {height} and {width}. You can also pass a PIL image instead to use resize option in VAEImageProcessor" + f"Currently we only support resizing for PIL image - please resize your numpy array to be {height} and {width}" + f"currently the sizes are {image.shape[2]} and {image.shape[3]}. You can also pass a PIL image instead to use resize option in VAEImageProcessor" ) elif isinstance(image[0], torch.Tensor): image = torch.cat(image, axis=0) if image[0].ndim == 4 else torch.stack(image, axis=0) - _, channel, height, width = image.shape - + + if self.config.color_mode == "L" and if image.ndim == 3: + image = image.unsqueeze(1) + + channel = image.shape[1] # don't need any preprocess if the image is latents if channel == 4: return image if self.config.do_resize and ( - height % self.config.vae_scale_factor != 0 or width % self.config.vae_scale_factor != 0 + image.shape[2] != height or image.shape[3] != width ): raise ValueError( - f"Currently we only support resizing for PIL image - please resize your pytorch tensor to be divisible by {self.config.vae_scale_factor}" - f"currently the sizes are {height} and {width}. You can also pass a PIL image instead to use resize option in VAEImageProcessor" + f"Currently we only support resizing for PIL image - please resize your torch tensor to be {height} and {width}" + f"currently the sizes are {image.shape[2]} and {image.shape[3]}. 
You can also pass a PIL image instead to use resize option in VAEImageProcessor" ) # expected range [0,1], normalize to [-1,1] From a6bffcaa7485ae8a9f56f3493ad02029c6fc9e72 Mon Sep 17 00:00:00 2001 From: yiyixuxu Date: Wed, 2 Aug 2023 23:32:13 +0000 Subject: [PATCH 02/23] deprecate the prepare_mask_and_masked_image function --- src/diffusers/image_processor.py | 42 +++++++++++++------ .../pipeline_stable_diffusion_inpaint.py | 7 +++- 2 files changed, 36 insertions(+), 13 deletions(-) diff --git a/src/diffusers/image_processor.py b/src/diffusers/image_processor.py index 44152a1dfb32..210d9d4d750b 100644 --- a/src/diffusers/image_processor.py +++ b/src/diffusers/image_processor.py @@ -51,9 +51,17 @@ def __init__( vae_scale_factor: int = 8, resample: str = "lanczos", do_normalize: bool = True, - color_mode: Optional[str] = None, # "RGB", "L" - ): + do_convert_rgb: bool = False, + do_convert_grayscale: bool = False + ): super().__init__() + if do_convert_rgb and do_convert_grayscale: + warnings.warn( + "`do_convert_rgb = True` will be ignored since `do_convert_grayscale` is also set to be `True`," + " if you intended to convert the image into RGB format, please set `do_convert_grayscale =False`." + FutureWarning, + ) + self.config.do_convert_rgb = False @staticmethod def numpy_to_pil(images: np.ndarray) -> PIL.Image.Image: @@ -117,14 +125,22 @@ def denormalize(images): return (images / 2 + 0.5).clamp(0, 1) @staticmethod - def convert_to_mode(image: PIL.Image.Image, mode: str) -> PIL.Image.Image: + def convert_to_rgb(image: PIL.Image.Image) -> PIL.Image.Image: + """ + Converts an image to RGB format. + """ + image = image.convert("RGB") + + return image + + @staticmethod + def convert_to_grayscale(image: PIL.Image.Image, mode: str) -> PIL.Image.Image: """ - Converts an image to RGB or L mode. + Converts an image to L mode. """ - image = image.convert(mode) + image = image.convert("L") + image = image.unsqueeze(0) - if mode == "L": - image = image.unsqueeze(0) return image def get_default_height_width(self, image, height, width): @@ -179,7 +195,7 @@ def preprocess( # 2. 
channnel x height x width: we should insert batch dimension at position 0, # however, since both channel and batch dimension has same size 1, it is same to insert at position 1 # for simplicity, we insert a dimension of size 1 at position 1 for both cases - if self.config.color_mode == "L" and isinstance(image, (torch.Tensor, np.ndarray)) and image.ndim == 3: + if self.config.do_convert_grayscale and isinstance(image, (torch.Tensor, np.ndarray)) and image.ndim == 3: if isinstance(image, torch.Tensor): image = image.unsqueeze(1) else: @@ -193,8 +209,10 @@ def preprocess( ) if isinstance(image[0], PIL.Image.Image): - if self.config.color_mode is not None: - image = [self.convert_to_mode(i, self.config.color_mode) for i in image] + if self.config.do_convert_rgb: + image = [self.convert_to_rgb(i) for i in image] + elif self.config.do_convert_grayscale: + image = [self.convert_to_grayscale(i) for i in image] if self.config.do_resize: image = [self.resize(i, height, width) for i in image] image = self.pil_to_numpy(image) # to np @@ -203,7 +221,7 @@ def preprocess( elif isinstance(image[0], np.ndarray): image = np.concatenate(image, axis=0) if image[0].ndim == 4 else np.stack(image, axis=0) - if self.config.color_mode == "L" and if image.ndim == 3: + if self.config.do_convert_grayscale and if image.ndim == 3: image = np.expand_dims(image, axis=1) image = self.numpy_to_pt(image) @@ -219,7 +237,7 @@ def preprocess( elif isinstance(image[0], torch.Tensor): image = torch.cat(image, axis=0) if image[0].ndim == 4 else torch.stack(image, axis=0) - if self.config.color_mode == "L" and if image.ndim == 3: + if self.config.do_convert_grayscale and if image.ndim == 3: image = image.unsqueeze(1) channel = image.shape[1] diff --git a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_inpaint.py b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_inpaint.py index c347ed56af5b..f70f217ed6fb 100644 --- a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_inpaint.py +++ b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_inpaint.py @@ -13,6 +13,7 @@ # limitations under the License. import inspect +import warnings from typing import Any, Callable, Dict, List, Optional, Union import numpy as np @@ -63,7 +64,11 @@ def prepare_mask_and_masked_image(image, mask, height, width, return_image: bool tuple[torch.Tensor]: The pair (mask, masked_image) as ``torch.Tensor`` with 4 dimensions: ``batch x channels x height x width``. """ - + warnings.warn( + "The prepare_mask_and_masked_image method is deprecated and will be removed in a future version. 
Please" + " use VaeImageProcessor.preprocess instead", + FutureWarning, + ) if image is None: raise ValueError("`image` input cannot be undefined.") From 11328e538c3280a3ae9a0d379e515dd2641349b4 Mon Sep 17 00:00:00 2001 From: yiyixuxu Date: Wed, 2 Aug 2023 23:43:53 +0000 Subject: [PATCH 03/23] refactor inpaint --- .../pipeline_stable_diffusion_inpaint.py | 15 ++++++++++++--- 1 file changed, 12 insertions(+), 3 deletions(-) diff --git a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_inpaint.py b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_inpaint.py index f70f217ed6fb..d1a61c369670 100644 --- a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_inpaint.py +++ b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_inpaint.py @@ -285,6 +285,7 @@ def __init__( ) self.vae_scale_factor = 2 ** (len(self.vae.config.block_out_channels) - 1) self.image_processor = VaeImageProcessor(vae_scale_factor=self.vae_scale_factor) + self.mask_processor = VaeImageProcessor(vae_scale_factor=self.vae_scale_factor, do_normalize=False, do_convert_grayscale=True) self.register_to_config(requires_safety_checker=requires_safety_checker) # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.enable_model_cpu_offload @@ -863,9 +864,17 @@ def __call__( is_strength_max = strength == 1.0 # 5. Preprocess mask and image - mask, masked_image, init_image = prepare_mask_and_masked_image( - image, mask_image, height, width, return_image=True - ) + + init_image = self.image_processor.preprocess(image, height=height, width=width) + init_image = init_image.to(dtype=torch.float32) + + mask = self.mask_processor.preprocess(mask, height=height, width=width) + # binarize mask + mask[mask < 0.5] = 0 + mask[mask > 0.5] = 1 + + masked_image = image * (mask < 0.5) + mask_condition = mask.clone() # 6. Prepare latent variables From 520dd473f86529e58857d3d60ef69bb7c6f1f253 Mon Sep 17 00:00:00 2001 From: yiyixuxu Date: Thu, 3 Aug 2023 00:56:28 +0000 Subject: [PATCH 04/23] fix --- src/diffusers/image_processor.py | 10 +++------- .../pipeline_stable_diffusion_inpaint.py | 4 ++-- 2 files changed, 5 insertions(+), 9 deletions(-) diff --git a/src/diffusers/image_processor.py b/src/diffusers/image_processor.py index 210d9d4d750b..16b081443d12 100644 --- a/src/diffusers/image_processor.py +++ b/src/diffusers/image_processor.py @@ -58,7 +58,7 @@ def __init__( if do_convert_rgb and do_convert_grayscale: warnings.warn( "`do_convert_rgb = True` will be ignored since `do_convert_grayscale` is also set to be `True`," - " if you intended to convert the image into RGB format, please set `do_convert_grayscale =False`." + " if you intended to convert the image into RGB format, please set `do_convert_grayscale =False`.", FutureWarning, ) self.config.do_convert_rgb = False @@ -134,12 +134,11 @@ def convert_to_rgb(image: PIL.Image.Image) -> PIL.Image.Image: return image @staticmethod - def convert_to_grayscale(image: PIL.Image.Image, mode: str) -> PIL.Image.Image: + def convert_to_grayscale(image: PIL.Image.Image) -> PIL.Image.Image: """ Converts an image to L mode. 
""" image = image.convert("L") - image = image.unsqueeze(0) return image @@ -220,9 +219,6 @@ def preprocess( elif isinstance(image[0], np.ndarray): image = np.concatenate(image, axis=0) if image[0].ndim == 4 else np.stack(image, axis=0) - - if self.config.do_convert_grayscale and if image.ndim == 3: - image = np.expand_dims(image, axis=1) image = self.numpy_to_pt(image) @@ -237,7 +233,7 @@ def preprocess( elif isinstance(image[0], torch.Tensor): image = torch.cat(image, axis=0) if image[0].ndim == 4 else torch.stack(image, axis=0) - if self.config.do_convert_grayscale and if image.ndim == 3: + if self.config.do_convert_grayscale and image.ndim == 3: image = image.unsqueeze(1) channel = image.shape[1] diff --git a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_inpaint.py b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_inpaint.py index d1a61c369670..bae0c7a3b652 100644 --- a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_inpaint.py +++ b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_inpaint.py @@ -868,12 +868,12 @@ def __call__( init_image = self.image_processor.preprocess(image, height=height, width=width) init_image = init_image.to(dtype=torch.float32) - mask = self.mask_processor.preprocess(mask, height=height, width=width) + mask = self.mask_processor.preprocess(mask_image, height=height, width=width) # binarize mask mask[mask < 0.5] = 0 mask[mask > 0.5] = 1 - masked_image = image * (mask < 0.5) + masked_image = init_image * (mask < 0.5) mask_condition = mask.clone() From 6e1d59c319f540a5bd7443f770ea288f9b29d0eb Mon Sep 17 00:00:00 2001 From: yiyixuxu Date: Thu, 3 Aug 2023 01:25:45 +0000 Subject: [PATCH 05/23] make style --- src/diffusers/image_processor.py | 34 ++++++++----------- .../pipeline_stable_diffusion_inpaint.py | 6 ++-- 2 files changed, 18 insertions(+), 22 deletions(-) diff --git a/src/diffusers/image_processor.py b/src/diffusers/image_processor.py index 16b081443d12..cd73d806fae6 100644 --- a/src/diffusers/image_processor.py +++ b/src/diffusers/image_processor.py @@ -52,8 +52,8 @@ def __init__( resample: str = "lanczos", do_normalize: bool = True, do_convert_rgb: bool = False, - do_convert_grayscale: bool = False - ): + do_convert_grayscale: bool = False, + ): super().__init__() if do_convert_rgb and do_convert_grayscale: warnings.warn( @@ -132,7 +132,7 @@ def convert_to_rgb(image: PIL.Image.Image) -> PIL.Image.Image: image = image.convert("RGB") return image - + @staticmethod def convert_to_grayscale(image: PIL.Image.Image) -> PIL.Image.Image: """ @@ -143,7 +143,6 @@ def convert_to_grayscale(image: PIL.Image.Image) -> PIL.Image.Image: return image def get_default_height_width(self, image, height, width): - if height is None: if isinstance(image, PIL.Image.Image): height = image.height @@ -158,7 +157,7 @@ def get_default_height_width(self, image, height, width): width, height = ( x - x % self.config.vae_scale_factor for x in (width, height) - ) # resize to integer multiple of vae_scale_factor + ) # resize to integer multiple of vae_scale_factor return height, width @@ -173,7 +172,7 @@ def resize( """ image = image.resize((width, height), resample=PIL_INTERPOLATION[self.config.resample]) return image - + def preprocess( self, image: Union[torch.FloatTensor, PIL.Image.Image, np.ndarray], @@ -185,21 +184,20 @@ def preprocess( """ supported_formats = (PIL.Image.Image, np.ndarray, torch.Tensor) - height, width = self.get_default_height_width(image, height, width) - - # If the image input is a 
3-dimensional pytorch tensor or numpy array that represent images in grayscale format, + + # If the image input is a 3-dimensional pytorch tensor or numpy array that represent images in grayscale format, # it could have 2 possible shapes: - # 1. batch x height x width: we should insert the channel dimension at position 1 + # 1. batch x height x width: we should insert the channel dimension at position 1 # 2. channnel x height x width: we should insert batch dimension at position 0, # however, since both channel and batch dimension has same size 1, it is same to insert at position 1 # for simplicity, we insert a dimension of size 1 at position 1 for both cases if self.config.do_convert_grayscale and isinstance(image, (torch.Tensor, np.ndarray)) and image.ndim == 3: if isinstance(image, torch.Tensor): image = image.unsqueeze(1) - else: + else: image = np.expand_dims(image, axis=1) - + if isinstance(image, supported_formats): image = [image] elif not (isinstance(image, list) and all(isinstance(i, supported_formats) for i in image)): @@ -222,9 +220,7 @@ def preprocess( image = self.numpy_to_pt(image) - if self.config.do_resize and ( - image.shape[2] != height or image.shape[3] != width - ): + if self.config.do_resize and (image.shape[2] != height or image.shape[3] != width): raise ValueError( f"Currently we only support resizing for PIL image - please resize your numpy array to be {height} and {width}" f"currently the sizes are {image.shape[2]} and {image.shape[3]}. You can also pass a PIL image instead to use resize option in VAEImageProcessor" @@ -232,18 +228,16 @@ def preprocess( elif isinstance(image[0], torch.Tensor): image = torch.cat(image, axis=0) if image[0].ndim == 4 else torch.stack(image, axis=0) - + if self.config.do_convert_grayscale and image.ndim == 3: image = image.unsqueeze(1) - + channel = image.shape[1] # don't need any preprocess if the image is latents if channel == 4: return image - if self.config.do_resize and ( - image.shape[2] != height or image.shape[3] != width - ): + if self.config.do_resize and (image.shape[2] != height or image.shape[3] != width): raise ValueError( f"Currently we only support resizing for PIL image - please resize your torch tensor to be {height} and {width}" f"currently the sizes are {image.shape[2]} and {image.shape[3]}. 
You can also pass a PIL image instead to use resize option in VAEImageProcessor" diff --git a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_inpaint.py b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_inpaint.py index bae0c7a3b652..33de1528dfd4 100644 --- a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_inpaint.py +++ b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_inpaint.py @@ -285,7 +285,9 @@ def __init__( ) self.vae_scale_factor = 2 ** (len(self.vae.config.block_out_channels) - 1) self.image_processor = VaeImageProcessor(vae_scale_factor=self.vae_scale_factor) - self.mask_processor = VaeImageProcessor(vae_scale_factor=self.vae_scale_factor, do_normalize=False, do_convert_grayscale=True) + self.mask_processor = VaeImageProcessor( + vae_scale_factor=self.vae_scale_factor, do_normalize=False, do_convert_grayscale=True + ) self.register_to_config(requires_safety_checker=requires_safety_checker) # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.enable_model_cpu_offload @@ -869,7 +871,7 @@ def __call__( init_image = init_image.to(dtype=torch.float32) mask = self.mask_processor.preprocess(mask_image, height=height, width=width) - # binarize mask + # binarize mask mask[mask < 0.5] = 0 mask[mask > 0.5] = 1 From 84f7037574098797ca71b46867e77401b3756457 Mon Sep 17 00:00:00 2001 From: yiyixuxu Date: Thu, 3 Aug 2023 02:24:07 +0000 Subject: [PATCH 06/23] fix --- src/diffusers/image_processor.py | 13 +++++++++---- 1 file changed, 9 insertions(+), 4 deletions(-) diff --git a/src/diffusers/image_processor.py b/src/diffusers/image_processor.py index cd73d806fae6..fe05c851003a 100644 --- a/src/diffusers/image_processor.py +++ b/src/diffusers/image_processor.py @@ -146,14 +146,18 @@ def get_default_height_width(self, image, height, width): if height is None: if isinstance(image, PIL.Image.Image): height = image.height - else: + elif isinstance(image, torch.Tensor): height = image.shape[2] + else: + height = image.shape[1] if width is None: if isinstance(image, PIL.Image.Image): width = image.width - else: + elif isinstance(image, torch.Tensor): width = image.shape[3] + else: + height = image.shape[2] width, height = ( x - x % self.config.vae_scale_factor for x in (width, height) @@ -184,8 +188,6 @@ def preprocess( """ supported_formats = (PIL.Image.Image, np.ndarray, torch.Tensor) - height, width = self.get_default_height_width(image, height, width) - # If the image input is a 3-dimensional pytorch tensor or numpy array that represent images in grayscale format, # it could have 2 possible shapes: # 1. 
batch x height x width: we should insert the channel dimension at position 1 @@ -211,6 +213,7 @@ def preprocess( elif self.config.do_convert_grayscale: image = [self.convert_to_grayscale(i) for i in image] if self.config.do_resize: + height, width = self.get_default_height_width(image[0], height, width) image = [self.resize(i, height, width) for i in image] image = self.pil_to_numpy(image) # to np image = self.numpy_to_pt(image) # to pt @@ -220,6 +223,7 @@ def preprocess( image = self.numpy_to_pt(image) + height, width = self.get_default_height_width(image, height, width) if self.config.do_resize and (image.shape[2] != height or image.shape[3] != width): raise ValueError( f"Currently we only support resizing for PIL image - please resize your numpy array to be {height} and {width}" @@ -237,6 +241,7 @@ def preprocess( if channel == 4: return image + height, width = self.get_default_height_width(image, height, width) if self.config.do_resize and (image.shape[2] != height or image.shape[3] != width): raise ValueError( f"Currently we only support resizing for PIL image - please resize your torch tensor to be {height} and {width}" From 4e46ea18161707711d02b15d5796ff06d9f35c86 Mon Sep 17 00:00:00 2001 From: yiyixuxu Date: Thu, 3 Aug 2023 03:11:21 +0000 Subject: [PATCH 07/23] improve docstring --- src/diffusers/image_processor.py | 48 ++++++++++++++++++++++++-------- 1 file changed, 37 insertions(+), 11 deletions(-) diff --git a/src/diffusers/image_processor.py b/src/diffusers/image_processor.py index fe05c851003a..5b0f9e689f6b 100644 --- a/src/diffusers/image_processor.py +++ b/src/diffusers/image_processor.py @@ -127,7 +127,7 @@ def denormalize(images): @staticmethod def convert_to_rgb(image: PIL.Image.Image) -> PIL.Image.Image: """ - Converts an image to RGB format. + Converts a PIL image to RGB format. """ image = image.convert("RGB") @@ -136,13 +136,33 @@ def convert_to_rgb(image: PIL.Image.Image) -> PIL.Image.Image: @staticmethod def convert_to_grayscale(image: PIL.Image.Image) -> PIL.Image.Image: """ - Converts an image to L mode. + Converts a PIL image to grayscale format. """ image = image.convert("L") return image - def get_default_height_width(self, image, height, width): + def get_default_height_width( + self, + image: [PIL.Image.Image, np.ndarray, torch.Tensor], + height: Optional[int] = None, + width: Optional[int] = None, + ): + """ + This function return the height and width that are downscaled to the next integer multiple of + `vae_scale_factor`. + + Args: + image(`PIL.Image.Image`, `np.ndarray` or `torch.Tensor`): + the image input, can be a PIL image, numpy array or pytorch tensor. if it is a numpy array, should have + shape [batch, height, width] or [batch, height, width, channel] if it is a pytorch tensor, should have + shape [batch, channel, height, width] + height (`int`, *optional*, defaults to `None`): + The height in preprocessed image. If `None`, will use the height of `image` input + width (`int`, *optional*`, defaults to `None`): + The width in preprocessed. If `None`, will use the width of the `image` input + """ + if height is None: if isinstance(image, PIL.Image.Image): height = image.height @@ -172,7 +192,7 @@ def resize( width: Optional[int] = None, ) -> PIL.Image.Image: """ - Resize a PIL image. Both height and width are downscaled to the next integer multiple of `vae_scale_factor`. + Resize a PIL image. 
""" image = image.resize((width, height), resample=PIL_INTERPOLATION[self.config.resample]) return image @@ -188,17 +208,23 @@ def preprocess( """ supported_formats = (PIL.Image.Image, np.ndarray, torch.Tensor) - # If the image input is a 3-dimensional pytorch tensor or numpy array that represent images in grayscale format, - # it could have 2 possible shapes: - # 1. batch x height x width: we should insert the channel dimension at position 1 - # 2. channnel x height x width: we should insert batch dimension at position 0, - # however, since both channel and batch dimension has same size 1, it is same to insert at position 1 - # for simplicity, we insert a dimension of size 1 at position 1 for both cases + # Expand the missing dimension for 3-dimensional pytorch tensor or numpy array that represents grayscale image if self.config.do_convert_grayscale and isinstance(image, (torch.Tensor, np.ndarray)) and image.ndim == 3: if isinstance(image, torch.Tensor): + # if image is a pytorch tensor could have 2 possible shapes: + # 1. batch x height x width: we should insert the channel dimension at position 1 + # 2. channnel x height x width: we should insert batch dimension at position 0, + # however, since both channel and batch dimension has same size 1, it is same to insert at position 1 + # for simplicity, we insert a dimension of size 1 at position 1 for both cases image = image.unsqueeze(1) else: - image = np.expand_dims(image, axis=1) + # if it is a numpy array, it could have 2 possible shapes: + # 1. batch x height x width: insert channel dimension on last position + # 2. height x width x channel: insert batch dimension on first position + if image.shape[-1] == 1: + image = np.expand_dims(image, axis=0) + else: + image = np.expand_dims(image, axis=-1) if isinstance(image, supported_formats): image = [image] From 3f5e0467cddc626cdb247871e293fe32a490cc58 Mon Sep 17 00:00:00 2001 From: yiyixuxu Date: Fri, 4 Aug 2023 06:52:37 +0000 Subject: [PATCH 08/23] add do_binarize and warning -> error --- src/diffusers/image_processor.py | 20 +++++++++++++++---- .../pipeline_stable_diffusion_inpaint.py | 5 +---- 2 files changed, 17 insertions(+), 8 deletions(-) diff --git a/src/diffusers/image_processor.py b/src/diffusers/image_processor.py index 5b0f9e689f6b..d18c538c4b00 100644 --- a/src/diffusers/image_processor.py +++ b/src/diffusers/image_processor.py @@ -51,15 +51,16 @@ def __init__( vae_scale_factor: int = 8, resample: str = "lanczos", do_normalize: bool = True, + do_binarize: bool = False, do_convert_rgb: bool = False, do_convert_grayscale: bool = False, ): super().__init__() if do_convert_rgb and do_convert_grayscale: - warnings.warn( - "`do_convert_rgb = True` will be ignored since `do_convert_grayscale` is also set to be `True`," - " if you intended to convert the image into RGB format, please set `do_convert_grayscale =False`.", - FutureWarning, + raise ValueError( + "`do_convert_rgb` and `do_convert_grayscale` can not both be set to `True`," + " if you intended to convert the image into RGB format, please set `do_convert_grayscale = False`.", + " if you intended to convert the image into grayscale format, please set `do_convert_rgb = False`", ) self.config.do_convert_rgb = False @@ -197,6 +198,14 @@ def resize( image = image.resize((width, height), resample=PIL_INTERPOLATION[self.config.resample]) return image + def binarize(self, image: PIL.Image.Image) -> PIL.Image.Image: + """ + create a mask + """ + image[image < 0.5] = 0 + image[image >= 0.5] = 1 + return image + def preprocess( self, 
image: Union[torch.FloatTensor, PIL.Image.Image, np.ndarray], @@ -287,6 +296,9 @@ def preprocess( if do_normalize: image = self.normalize(image) + if self.config.do_binarize: + image = self.binarize(image) + return image def postprocess( diff --git a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_inpaint.py b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_inpaint.py index 33de1528dfd4..cde7ea949dca 100644 --- a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_inpaint.py +++ b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_inpaint.py @@ -286,7 +286,7 @@ def __init__( self.vae_scale_factor = 2 ** (len(self.vae.config.block_out_channels) - 1) self.image_processor = VaeImageProcessor(vae_scale_factor=self.vae_scale_factor) self.mask_processor = VaeImageProcessor( - vae_scale_factor=self.vae_scale_factor, do_normalize=False, do_convert_grayscale=True + vae_scale_factor=self.vae_scale_factor, do_normalize=False, do_binarize=True, do_convert_grayscale=True ) self.register_to_config(requires_safety_checker=requires_safety_checker) @@ -871,9 +871,6 @@ def __call__( init_image = init_image.to(dtype=torch.float32) mask = self.mask_processor.preprocess(mask_image, height=height, width=width) - # binarize mask - mask[mask < 0.5] = 0 - mask[mask > 0.5] = 1 masked_image = init_image * (mask < 0.5) From ccbfcabf34b17477771980da8a4ee47792250a23 Mon Sep 17 00:00:00 2001 From: yiyixuxu Date: Fri, 4 Aug 2023 08:21:38 +0000 Subject: [PATCH 09/23] add tests --- tests/others/test_image_processor.py | 144 ++++++++++++++++++++++++++- 1 file changed, 141 insertions(+), 3 deletions(-) diff --git a/tests/others/test_image_processor.py b/tests/others/test_image_processor.py index c2cd6f4a04f4..f8d22ce9a01b 100644 --- a/tests/others/test_image_processor.py +++ b/tests/others/test_image_processor.py @@ -34,6 +34,17 @@ def dummy_sample(self): return sample + @property + def dummy_mask(self): + batch_size = 1 + num_channels = 1 + height = 8 + width = 8 + + sample = torch.rand((batch_size, num_channels, height, width)) + + return sample + def to_np(self, image): if isinstance(image[0], PIL.Image.Image): return np.stack([np.array(i) for i in image], axis=0) @@ -133,17 +144,144 @@ def test_preprocess_input_list(self): ) input_np_4d = self.to_np(self.dummy_sample) - list(input_np_4d) + input_np_list = list(input_np_4d) out_np_4d = image_processor.postprocess( - image_processor.preprocess(input_pt_4d), + image_processor.preprocess(input_np_4d), output_type="np", ) out_np_list = image_processor.postprocess( - image_processor.preprocess(input_pt_list), + image_processor.preprocess(input_np_list), output_type="np", ) assert np.abs(out_pt_4d - out_pt_list).max() < 1e-6 assert np.abs(out_np_4d - out_np_list).max() < 1e-6 + + def test_preprocess_input_mask_3d(self): + image_processor = VaeImageProcessor( + do_resize=False, do_normalize=False, do_binarize=True, do_convert_grayscale=True + ) + + input_pt_4d = self.dummy_mask + input_pt_3d = input_pt_4d.squeeze(0) + input_pt_2d = input_pt_3d.squeeze(0) + + out_pt_4d = image_processor.postprocess( + image_processor.preprocess(input_pt_4d), + output_type="np", + ) + out_pt_3d = image_processor.postprocess( + image_processor.preprocess(input_pt_3d), + output_type="np", + ) + + out_pt_2d = image_processor.postprocess( + image_processor.preprocess(input_pt_2d), + output_type="np", + ) + + input_np_4d = self.to_np(self.dummy_mask) + input_np_3d = input_np_4d.squeeze(0) + input_np_3d_1 = input_np_4d.squeeze(-1) + 
input_np_2d = input_np_3d.squeeze(-1) + + out_np_4d = image_processor.postprocess( + image_processor.preprocess(input_np_4d), + output_type="np", + ) + out_np_3d = image_processor.postprocess( + image_processor.preprocess(input_np_3d), + output_type="np", + ) + + out_np_3d_1 = image_processor.postprocess( + image_processor.preprocess(input_np_3d_1), + output_type="np", + ) + + out_np_2d = image_processor.postprocess( + image_processor.preprocess(input_np_2d), + output_type="np", + ) + + assert np.abs(out_pt_4d - out_pt_3d).max() == 0 + assert np.abs(out_pt_4d - out_pt_2d).max() == 0 + assert np.abs(out_np_4d - out_np_3d).max() == 0 + assert np.abs(out_np_4d - out_np_3d_1).max() == 0 + assert np.abs(out_np_4d - out_np_2d).max() == 0 + + def test_preprocess_input_mask_list(self): + image_processor = VaeImageProcessor(do_resize=False, do_normalize=False, do_convert_grayscale=True) + + input_pt_4d = self.dummy_mask + input_pt_3d = input_pt_4d.squeeze(0) + input_pt_2d = input_pt_3d.squeeze(0) + + inputs_pt = [input_pt_4d, input_pt_3d, input_pt_2d] + inputs_pt_list = [[input_pt] for input_pt in inputs_pt] + + for input_pt, input_pt_list in zip(inputs_pt, inputs_pt_list): + out_pt = image_processor.postprocess( + image_processor.preprocess(input_pt), + output_type="np", + ) + out_pt_list = image_processor.postprocess( + image_processor.preprocess(input_pt_list), + output_type="np", + ) + assert np.abs(out_pt - out_pt_list).max() < 1e-6 + + input_np_4d = self.to_np(self.dummy_mask) + input_np_3d = input_np_4d.squeeze(0) + input_np_2d = input_np_3d.squeeze(-1) + + inputs_np = [input_np_4d, input_np_3d, input_np_2d] + inputs_np_list = [[input_np] for input_np in inputs_np] + + for input_np, input_np_list in zip(inputs_np, inputs_np_list): + out_np = image_processor.postprocess( + image_processor.preprocess(input_np), + output_type="np", + ) + out_np_list = image_processor.postprocess( + image_processor.preprocess(input_np_list), + output_type="np", + ) + assert np.abs(out_np - out_np_list).max() < 1e-6 + + def test_preprocess_input_mask_3d_batch(self): + image_processor = VaeImageProcessor(do_resize=False, do_normalize=False, do_convert_grayscale=True) + + # create a dummy mask input with batch_size 2 + dummy_mask_batch = torch.cat([self.dummy_mask] * 2, axis=0) + + # squeeze out the channel dimension + input_pt_3d = dummy_mask_batch.squeeze(1) + input_np_3d = self.to_np(dummy_mask_batch).squeeze(-1) + + input_pt_3d_list = list(input_pt_3d) + input_np_3d_list = list(input_np_3d) + + out_pt_3d = image_processor.postprocess( + image_processor.preprocess(input_pt_3d), + output_type="np", + ) + out_pt_3d_list = image_processor.postprocess( + image_processor.preprocess(input_pt_3d_list), + output_type="np", + ) + + assert np.abs(out_pt_3d - out_pt_3d_list).max() < 1e-6 + + out_np_3d = image_processor.postprocess( + image_processor.preprocess(input_np_3d), + output_type="np", + ) + out_np_3d_list = image_processor.postprocess( + image_processor.preprocess(input_np_3d_list), + output_type="np", + ) + + assert np.abs(out_np_3d - out_np_3d_list).max() < 1e-6 From 6aa4114a9b25ee755eddb5646c954e707a86c1ae Mon Sep 17 00:00:00 2001 From: yiyixuxu Date: Fri, 4 Aug 2023 21:25:36 +0000 Subject: [PATCH 10/23] apply feedback --- src/diffusers/image_processor.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/src/diffusers/image_processor.py b/src/diffusers/image_processor.py index d18c538c4b00..30969dac599c 100644 --- a/src/diffusers/image_processor.py +++ 
b/src/diffusers/image_processor.py @@ -155,13 +155,13 @@ def get_default_height_width( Args: image(`PIL.Image.Image`, `np.ndarray` or `torch.Tensor`): - the image input, can be a PIL image, numpy array or pytorch tensor. if it is a numpy array, should have - shape [batch, height, width] or [batch, height, width, channel] if it is a pytorch tensor, should have - shape [batch, channel, height, width] + The image input, can be a PIL image, numpy array or pytorch tensor. if it is a numpy array, should have + shape `[batch, height, width]` or `[batch, height, width, channel]` if it is a pytorch tensor, should + have shape `[batch, channel, height, width]`. height (`int`, *optional*, defaults to `None`): - The height in preprocessed image. If `None`, will use the height of `image` input + The height in preprocessed image. If `None`, will use the height of `image` input. width (`int`, *optional*`, defaults to `None`): - The width in preprocessed. If `None`, will use the width of the `image` input + The width in preprocessed. If `None`, will use the width of the `image` input. """ if height is None: From 8d7c0916a95baab8f98e8de6ac1af977795767c1 Mon Sep 17 00:00:00 2001 From: yiyixuxu Date: Fri, 4 Aug 2023 21:26:36 +0000 Subject: [PATCH 11/23] fix copies --- .../pipelines/controlnet/pipeline_controlnet_inpaint.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/src/diffusers/pipelines/controlnet/pipeline_controlnet_inpaint.py b/src/diffusers/pipelines/controlnet/pipeline_controlnet_inpaint.py index 0c5e0b6d726e..854b4bf2cbcf 100644 --- a/src/diffusers/pipelines/controlnet/pipeline_controlnet_inpaint.py +++ b/src/diffusers/pipelines/controlnet/pipeline_controlnet_inpaint.py @@ -133,7 +133,11 @@ def prepare_mask_and_masked_image(image, mask, height, width, return_image=False tuple[torch.Tensor]: The pair (mask, masked_image) as ``torch.Tensor`` with 4 dimensions: ``batch x channels x height x width``. """ - + warnings.warn( + "The prepare_mask_and_masked_image method is deprecated and will be removed in a future version. Please" + " use VaeImageProcessor.preprocess instead", + FutureWarning, + ) if image is None: raise ValueError("`image` input cannot be undefined.") From 0f09e72bc81152adfe4a48f9128c92bdb1e9ce00 Mon Sep 17 00:00:00 2001 From: yiyixuxu Date: Fri, 4 Aug 2023 21:30:05 +0000 Subject: [PATCH 12/23] docstring --- src/diffusers/image_processor.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/src/diffusers/image_processor.py b/src/diffusers/image_processor.py index 30969dac599c..dfb06f50bdc4 100644 --- a/src/diffusers/image_processor.py +++ b/src/diffusers/image_processor.py @@ -38,8 +38,12 @@ class VaeImageProcessor(ConfigMixin): Resampling filter to use when resizing the image. do_normalize (`bool`, *optional*, defaults to `True`): Whether to normalize the image to [-1,1]. + do_binarize (`bool`, *optional*, defaults to `True`): + Whether to binarize the image to 0/1. do_convert_rgb (`bool`, *optional*, defaults to be `False`): Whether to convert the images to RGB format. + do_convert_grayscale (`bool`, *optional*, defaults to be `False`): + Whether to convert the images to grayscale format. 
""" config_name = CONFIG_NAME From 5a86b88a3a8d5f3a4c767e1752ed857ede580b66 Mon Sep 17 00:00:00 2001 From: yiyixuxu Date: Sat, 5 Aug 2023 03:31:37 +0000 Subject: [PATCH 13/23] refactor controlnet inpaint --- .../controlnet/pipeline_controlnet_inpaint.py | 87 ++++++++++--------- .../pipeline_stable_diffusion_img2img.py | 7 +- .../pipeline_stable_diffusion_inpaint.py | 35 ++++++-- 3 files changed, 79 insertions(+), 50 deletions(-) diff --git a/src/diffusers/pipelines/controlnet/pipeline_controlnet_inpaint.py b/src/diffusers/pipelines/controlnet/pipeline_controlnet_inpaint.py index 854b4bf2cbcf..a38327bf637c 100644 --- a/src/diffusers/pipelines/controlnet/pipeline_controlnet_inpaint.py +++ b/src/diffusers/pipelines/controlnet/pipeline_controlnet_inpaint.py @@ -320,6 +320,9 @@ def __init__( ) self.vae_scale_factor = 2 ** (len(self.vae.config.block_out_channels) - 1) self.image_processor = VaeImageProcessor(vae_scale_factor=self.vae_scale_factor) + self.mask_processor = VaeImageProcessor( + vae_scale_factor=self.vae_scale_factor, do_normalize=False, do_binarize=True, do_convert_grayscale=True + ) self.control_image_processor = VaeImageProcessor( vae_scale_factor=self.vae_scale_factor, do_convert_rgb=True, do_normalize=False ) @@ -612,7 +615,7 @@ def check_inputs( control_guidance_start=0.0, control_guidance_end=1.0, ): - if height % 8 != 0 or width % 8 != 0: + if height is not None and height % 8 != 0 or width is not None and width % 8 != 0: raise ValueError(f"`height` and `width` have to be divisible by 8 but are {height} and {width}.") if (callback_steps is None) or ( @@ -860,31 +863,6 @@ def prepare_latents( return outputs - def _default_height_width(self, height, width, image): - # NOTE: It is possible that a list of images have different - # dimensions for each image, so just checking the first image - # is not _exactly_ correct, but it is simple. - while isinstance(image, list): - image = image[0] - - if height is None: - if isinstance(image, PIL.Image.Image): - height = image.height - elif isinstance(image, torch.Tensor): - height = image.shape[2] - - height = (height // 8) * 8 # round down to nearest multiple of 8 - - if width is None: - if isinstance(image, PIL.Image.Image): - width = image.width - elif isinstance(image, torch.Tensor): - width = image.shape[3] - - width = (width // 8) * 8 # round down to nearest multiple of 8 - - return height, width - # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion_inpaint.StableDiffusionInpaintPipeline.prepare_mask_latents def prepare_mask_latents( self, mask, masked_image, batch_size, height, width, dtype, device, generator, do_classifier_free_guidance @@ -947,8 +925,22 @@ def _encode_vae_image(self, image: torch.Tensor, generator: torch.Generator): def __call__( self, prompt: Union[str, List[str]] = None, - image: Union[torch.Tensor, PIL.Image.Image] = None, - mask_image: Union[torch.Tensor, PIL.Image.Image] = None, + image: Union[ + torch.FloatTensor, + PIL.Image.Image, + np.ndarray, + List[torch.FloatTensor], + List[PIL.Image.Image], + List[np.ndarray], + ] = None, + mask_image: Union[ + torch.FloatTensor, + PIL.Image.Image, + np.ndarray, + List[torch.FloatTensor], + List[PIL.Image.Image], + List[np.ndarray], + ] = None, control_image: Union[ torch.FloatTensor, PIL.Image.Image, @@ -986,13 +978,27 @@ def __call__( prompt (`str` or `List[str]`, *optional*): The prompt or prompts to guide the image generation. If not defined, one has to pass `prompt_embeds`. instead. 
- image (`torch.FloatTensor`, `PIL.Image.Image`, `List[torch.FloatTensor]`, `List[PIL.Image.Image]`, + image (`torch.FloatTensor`, `PIL.Image.Image`, `np.ndarray`, `List[torch.FloatTensor]`, + `List[PIL.Image.Image]`, or `List[np.ndarray]`): + `Image`, numpy array or tensor representing an image batch to be inpainted (which parts of the image to be masked + out with `mask_image` and repainted according to `prompt`). + For both numpy array and pytorch tensor, the expected value range is between `[0, 1]` + If it's a tensor or a list or tensors, the expected shape should be `(B, C, H, W)` or `(C, H, W)`. + If it is a numpy array or a list of arrays, the expected shape should be `(B, H, W, C)` or `(H, W, C)` + It can also accept image latents as `image`, but if passing latents directly it is not encoded again. + mask_image (`torch.FloatTensor`, `PIL.Image.Image`, `np.ndarray`, `List[torch.FloatTensor]`, + `List[PIL.Image.Image]`, or `List[np.ndarray]`): + `Image`, numpy array or tensor representing an image batch to mask `image`. White pixels in the mask are repainted + while black pixels are preserved. If `mask_image` is a PIL image, it is converted to a single channel + (luminance) before use. If it's a numpy array or pytorch tensor, it should contain one color channel (L) instead of 3, + so the expected shape for pytorch tensor would be `(B, 1, H, W)`, `(B, H, W)`, `(1, H, W)`, `(H, W)`. And for numpy array + would be for `(B, H, W, 1)`, `(B, H, W)`, `(H, W, 1)`, or `(H, W)`. + control_image (`torch.FloatTensor`, `PIL.Image.Image`, `List[torch.FloatTensor]`, `List[PIL.Image.Image]`, `List[List[torch.FloatTensor]]`, or `List[List[PIL.Image.Image]]`): - The ControlNet input condition. ControlNet uses this input condition to generate guidance to Unet. If - the type is specified as `Torch.FloatTensor`, it is passed to ControlNet as is. `PIL.Image.Image` can - also be accepted as an image. The dimensions of the output image defaults to `image`'s dimensions. If - height and/or width are passed, `image` is resized according to them. If multiple ControlNets are - specified in init, images must be passed as a list such that each element of the list can be correctly + The ControlNet input condition. ControlNet uses this input condition to generate guidance to Unet. + The dimensions of the output image defaults to `image`'s dimensions. If height and/or width are passed, + `image` is resized according to them. If multiple ControlNets are specified in init, + images must be passed as a list such that each element of the list can be correctly batched for input to a single controlnet. height (`int`, *optional*, defaults to self.unet.config.sample_size * self.vae_scale_factor): The height in pixels of the generated image. @@ -1077,9 +1083,6 @@ def __call__( """ controlnet = self.controlnet._orig_mod if is_compiled_module(self.controlnet) else self.controlnet - # 0. Default height and width to unet - height, width = self._default_height_width(height, width, image) - # align format for control guidance if not isinstance(control_guidance_start, list) and isinstance(control_guidance_end, list): control_guidance_start = len(control_guidance_end) * [control_guidance_start] @@ -1181,9 +1184,13 @@ def __call__( assert False # 4. 
Preprocess mask and image - resizes image and mask w.r.t height and width - mask, masked_image, init_image = prepare_mask_and_masked_image( - image, mask_image, height, width, return_image=True - ) + init_image = self.image_processor.preprocess(image, height=height, width=width) + init_image = init_image.to(dtype=torch.float32) + + mask = self.mask_processor.preprocess(mask_image, height=height, width=width) + + masked_image = init_image * (mask < 0.5) + _, _, height, width = init_image.shape # 5. Prepare timesteps self.scheduler.set_timesteps(num_inference_steps, device=device) diff --git a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_img2img.py b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_img2img.py index e6c3a6ae4d69..27499b3e11fc 100644 --- a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_img2img.py +++ b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_img2img.py @@ -596,8 +596,11 @@ def __call__( prompt (`str` or `List[str]`, *optional*): The prompt or prompts to guide image generation. If not defined, you need to pass `prompt_embeds`. image (`torch.FloatTensor`, `PIL.Image.Image`, `np.ndarray`, `List[torch.FloatTensor]`, `List[PIL.Image.Image]`, or `List[np.ndarray]`): - `Image` or tensor representing an image batch to be used as the starting point. Can also accept image - latents as `image`, but if passing latents directly it is not encoded again. + `Image`, numpy array or tensor representing an image batch to be used as the starting point. + For both numpy array and pytorch tensor, the expected value range is between `[0, 1]` + If it's a tensor or a list or tensors, the expected shape should be `(B, C, H, W)` or `(C, H, W)`. + If it is a numpy array or a list of arrays, the expected shape should be `(B, H, W, C)` or `(H, W, C)` + It can also accept image latents as `image`, but if passing latents directly it is not encoded again. strength (`float`, *optional*, defaults to 0.8): Indicates extent to transform the reference `image`. Must be between 0 and 1. `image` is used as a starting point and more noise is added the higher the `strength`. The number of denoising steps depends diff --git a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_inpaint.py b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_inpaint.py index cde7ea949dca..60b81b032583 100644 --- a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_inpaint.py +++ b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_inpaint.py @@ -680,8 +680,22 @@ def get_timesteps(self, num_inference_steps, strength, device): def __call__( self, prompt: Union[str, List[str]] = None, - image: Union[torch.FloatTensor, PIL.Image.Image] = None, - mask_image: Union[torch.FloatTensor, PIL.Image.Image] = None, + image: Union[ + torch.FloatTensor, + PIL.Image.Image, + np.ndarray, + List[torch.FloatTensor], + List[PIL.Image.Image], + List[np.ndarray], + ] = None, + mask_image: Union[ + torch.FloatTensor, + PIL.Image.Image, + np.ndarray, + List[torch.FloatTensor], + List[PIL.Image.Image], + List[np.ndarray], + ] = None, height: Optional[int] = None, width: Optional[int] = None, strength: float = 1.0, @@ -706,14 +720,19 @@ def __call__( Args: prompt (`str` or `List[str]`, *optional*): The prompt or prompts to guide image generation. If not defined, you need to pass `prompt_embeds`. 
- image (`PIL.Image.Image`): - `Image` or tensor representing an image batch to be inpainted (which parts of the image to be masked + image (`torch.FloatTensor`, `PIL.Image.Image`, `np.ndarray`, `List[torch.FloatTensor]`, `List[PIL.Image.Image]`, or `List[np.ndarray]`): + `Image`, numpy array or tensor representing an image batch to be inpainted (which parts of the image to be masked out with `mask_image` and repainted according to `prompt`). - mask_image (`PIL.Image.Image`): - `Image` or tensor representing an image batch to mask `image`. White pixels in the mask are repainted + For both numpy array and pytorch tensor, the expected value range is between `[0, 1]` + If it's a tensor or a list or tensors, the expected shape should be `(B, C, H, W)` or `(C, H, W)`. + If it is a numpy array or a list of arrays, the expected shape should be `(B, H, W, C)` or `(H, W, C)` + It can also accept image latents as `image`, but if passing latents directly it is not encoded again. + mask_image (`torch.FloatTensor`, `PIL.Image.Image`, `np.ndarray`, `List[torch.FloatTensor]`, `List[PIL.Image.Image]`, or `List[np.ndarray]`): + `Image`, numpy array or tensor representing an image batch to mask `image`. White pixels in the mask are repainted while black pixels are preserved. If `mask_image` is a PIL image, it is converted to a single channel - (luminance) before use. If it's a tensor, it should contain one color channel (L) instead of 3, so the - expected shape would be `(B, H, W, 1)`. + (luminance) before use. If it's a numpy array or pytorch tensor, it should contain one color channel (L) instead of 3, + so the expected shape for pytorch tensor would be `(B, 1, H, W)`, `(B, H, W)`, `(1, H, W)`, `(H, W)`. And for numpy array + would be for `(B, H, W, 1)`, `(B, H, W)`, `(H, W, 1)`, or `(H, W)`. height (`int`, *optional*, defaults to `self.unet.config.sample_size * self.vae_scale_factor`): The height in pixels of the generated image. width (`int`, *optional*, defaults to `self.unet.config.sample_size * self.vae_scale_factor`): From 12cf87e6a243cf4fe60fe9e2324b5b958a0ebe84 Mon Sep 17 00:00:00 2001 From: yiyixuxu Date: Sat, 5 Aug 2023 03:38:14 +0000 Subject: [PATCH 14/23] style --- src/diffusers/image_processor.py | 2 +- .../controlnet/pipeline_controlnet_inpaint.py | 37 ++++++++++--------- .../pipeline_stable_diffusion_img2img.py | 10 ++--- .../pipeline_stable_diffusion_inpaint.py | 23 ++++++------ 4 files changed, 37 insertions(+), 35 deletions(-) diff --git a/src/diffusers/image_processor.py b/src/diffusers/image_processor.py index dfb06f50bdc4..dda5182e1ed7 100644 --- a/src/diffusers/image_processor.py +++ b/src/diffusers/image_processor.py @@ -289,7 +289,7 @@ def preprocess( # expected range [0,1], normalize to [-1,1] do_normalize = self.config.do_normalize - if image.min() < 0: + if image.min() < 0 and do_normalize: warnings.warn( "Passing `image` as torch tensor with value range in [-1,1] is deprecated. The expected value range for image tensor is [0,1] " f"when passing as pytorch tensor or numpy Array. 
You passed `image` with value range [{image.min()},{image.max()}]", diff --git a/src/diffusers/pipelines/controlnet/pipeline_controlnet_inpaint.py b/src/diffusers/pipelines/controlnet/pipeline_controlnet_inpaint.py index a38327bf637c..0590411a1f2d 100644 --- a/src/diffusers/pipelines/controlnet/pipeline_controlnet_inpaint.py +++ b/src/diffusers/pipelines/controlnet/pipeline_controlnet_inpaint.py @@ -978,28 +978,29 @@ def __call__( prompt (`str` or `List[str]`, *optional*): The prompt or prompts to guide the image generation. If not defined, one has to pass `prompt_embeds`. instead. - image (`torch.FloatTensor`, `PIL.Image.Image`, `np.ndarray`, `List[torch.FloatTensor]`, + image (`torch.FloatTensor`, `PIL.Image.Image`, `np.ndarray`, `List[torch.FloatTensor]`, `List[PIL.Image.Image]`, or `List[np.ndarray]`): - `Image`, numpy array or tensor representing an image batch to be inpainted (which parts of the image to be masked - out with `mask_image` and repainted according to `prompt`). - For both numpy array and pytorch tensor, the expected value range is between `[0, 1]` - If it's a tensor or a list or tensors, the expected shape should be `(B, C, H, W)` or `(C, H, W)`. - If it is a numpy array or a list of arrays, the expected shape should be `(B, H, W, C)` or `(H, W, C)` - It can also accept image latents as `image`, but if passing latents directly it is not encoded again. - mask_image (`torch.FloatTensor`, `PIL.Image.Image`, `np.ndarray`, `List[torch.FloatTensor]`, + `Image`, numpy array or tensor representing an image batch to be inpainted (which parts of the image to + be masked out with `mask_image` and repainted according to `prompt`). For both numpy array and pytorch + tensor, the expected value range is between `[0, 1]` If it's a tensor or a list or tensors, the + expected shape should be `(B, C, H, W)` or `(C, H, W)`. If it is a numpy array or a list of arrays, the + expected shape should be `(B, H, W, C)` or `(H, W, C)` It can also accept image latents as `image`, but + if passing latents directly it is not encoded again. + mask_image (`torch.FloatTensor`, `PIL.Image.Image`, `np.ndarray`, `List[torch.FloatTensor]`, `List[PIL.Image.Image]`, or `List[np.ndarray]`): - `Image`, numpy array or tensor representing an image batch to mask `image`. White pixels in the mask are repainted - while black pixels are preserved. If `mask_image` is a PIL image, it is converted to a single channel - (luminance) before use. If it's a numpy array or pytorch tensor, it should contain one color channel (L) instead of 3, - so the expected shape for pytorch tensor would be `(B, 1, H, W)`, `(B, H, W)`, `(1, H, W)`, `(H, W)`. And for numpy array - would be for `(B, H, W, 1)`, `(B, H, W)`, `(H, W, 1)`, or `(H, W)`. + `Image`, numpy array or tensor representing an image batch to mask `image`. White pixels in the mask + are repainted while black pixels are preserved. If `mask_image` is a PIL image, it is converted to a + single channel (luminance) before use. If it's a numpy array or pytorch tensor, it should contain one + color channel (L) instead of 3, so the expected shape for pytorch tensor would be `(B, 1, H, W)`, `(B, + H, W)`, `(1, H, W)`, `(H, W)`. And for numpy array would be for `(B, H, W, 1)`, `(B, H, W)`, `(H, W, + 1)`, or `(H, W)`. control_image (`torch.FloatTensor`, `PIL.Image.Image`, `List[torch.FloatTensor]`, `List[PIL.Image.Image]`, `List[List[torch.FloatTensor]]`, or `List[List[PIL.Image.Image]]`): - The ControlNet input condition. 
ControlNet uses this input condition to generate guidance to Unet. - The dimensions of the output image defaults to `image`'s dimensions. If height and/or width are passed, - `image` is resized according to them. If multiple ControlNets are specified in init, - images must be passed as a list such that each element of the list can be correctly - batched for input to a single controlnet. + The ControlNet input condition. ControlNet uses this input condition to generate guidance to Unet. The + dimensions of the output image defaults to `image`'s dimensions. If height and/or width are passed, + `image` is resized according to them. If multiple ControlNets are specified in init, images must be + passed as a list such that each element of the list can be correctly batched for input to a single + controlnet. height (`int`, *optional*, defaults to self.unet.config.sample_size * self.vae_scale_factor): The height in pixels of the generated image. width (`int`, *optional*, defaults to self.unet.config.sample_size * self.vae_scale_factor): diff --git a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_img2img.py b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_img2img.py index 27499b3e11fc..355910747637 100644 --- a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_img2img.py +++ b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_img2img.py @@ -596,11 +596,11 @@ def __call__( prompt (`str` or `List[str]`, *optional*): The prompt or prompts to guide image generation. If not defined, you need to pass `prompt_embeds`. image (`torch.FloatTensor`, `PIL.Image.Image`, `np.ndarray`, `List[torch.FloatTensor]`, `List[PIL.Image.Image]`, or `List[np.ndarray]`): - `Image`, numpy array or tensor representing an image batch to be used as the starting point. - For both numpy array and pytorch tensor, the expected value range is between `[0, 1]` - If it's a tensor or a list or tensors, the expected shape should be `(B, C, H, W)` or `(C, H, W)`. - If it is a numpy array or a list of arrays, the expected shape should be `(B, H, W, C)` or `(H, W, C)` - It can also accept image latents as `image`, but if passing latents directly it is not encoded again. + `Image`, numpy array or tensor representing an image batch to be used as the starting point. For both + numpy array and pytorch tensor, the expected value range is between `[0, 1]` If it's a tensor or a list + or tensors, the expected shape should be `(B, C, H, W)` or `(C, H, W)`. If it is a numpy array or a + list of arrays, the expected shape should be `(B, H, W, C)` or `(H, W, C)` It can also accept image + latents as `image`, but if passing latents directly it is not encoded again. strength (`float`, *optional*, defaults to 0.8): Indicates extent to transform the reference `image`. Must be between 0 and 1. `image` is used as a starting point and more noise is added the higher the `strength`. The number of denoising steps depends diff --git a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_inpaint.py b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_inpaint.py index 60b81b032583..bc3ad52e8d4b 100644 --- a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_inpaint.py +++ b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_inpaint.py @@ -721,18 +721,19 @@ def __call__( prompt (`str` or `List[str]`, *optional*): The prompt or prompts to guide image generation. If not defined, you need to pass `prompt_embeds`. 
image (`torch.FloatTensor`, `PIL.Image.Image`, `np.ndarray`, `List[torch.FloatTensor]`, `List[PIL.Image.Image]`, or `List[np.ndarray]`): - `Image`, numpy array or tensor representing an image batch to be inpainted (which parts of the image to be masked - out with `mask_image` and repainted according to `prompt`). - For both numpy array and pytorch tensor, the expected value range is between `[0, 1]` - If it's a tensor or a list or tensors, the expected shape should be `(B, C, H, W)` or `(C, H, W)`. - If it is a numpy array or a list of arrays, the expected shape should be `(B, H, W, C)` or `(H, W, C)` - It can also accept image latents as `image`, but if passing latents directly it is not encoded again. + `Image`, numpy array or tensor representing an image batch to be inpainted (which parts of the image to + be masked out with `mask_image` and repainted according to `prompt`). For both numpy array and pytorch + tensor, the expected value range is between `[0, 1]` If it's a tensor or a list or tensors, the + expected shape should be `(B, C, H, W)` or `(C, H, W)`. If it is a numpy array or a list of arrays, the + expected shape should be `(B, H, W, C)` or `(H, W, C)` It can also accept image latents as `image`, but + if passing latents directly it is not encoded again. mask_image (`torch.FloatTensor`, `PIL.Image.Image`, `np.ndarray`, `List[torch.FloatTensor]`, `List[PIL.Image.Image]`, or `List[np.ndarray]`): - `Image`, numpy array or tensor representing an image batch to mask `image`. White pixels in the mask are repainted - while black pixels are preserved. If `mask_image` is a PIL image, it is converted to a single channel - (luminance) before use. If it's a numpy array or pytorch tensor, it should contain one color channel (L) instead of 3, - so the expected shape for pytorch tensor would be `(B, 1, H, W)`, `(B, H, W)`, `(1, H, W)`, `(H, W)`. And for numpy array - would be for `(B, H, W, 1)`, `(B, H, W)`, `(H, W, 1)`, or `(H, W)`. + `Image`, numpy array or tensor representing an image batch to mask `image`. White pixels in the mask + are repainted while black pixels are preserved. If `mask_image` is a PIL image, it is converted to a + single channel (luminance) before use. If it's a numpy array or pytorch tensor, it should contain one + color channel (L) instead of 3, so the expected shape for pytorch tensor would be `(B, 1, H, W)`, `(B, + H, W)`, `(1, H, W)`, `(H, W)`. And for numpy array would be for `(B, H, W, 1)`, `(B, H, W)`, `(H, W, + 1)`, or `(H, W)`. height (`int`, *optional*, defaults to `self.unet.config.sample_size * self.vae_scale_factor`): The height in pixels of the generated image. width (`int`, *optional*, defaults to `self.unet.config.sample_size * self.vae_scale_factor`): From 05ab579676eb3039fc6b18267f523d041f87dc98 Mon Sep 17 00:00:00 2001 From: yiyixuxu Date: Sat, 5 Aug 2023 03:44:36 +0000 Subject: [PATCH 15/23] fix copies --- .../alt_diffusion/pipeline_alt_diffusion_img2img.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/src/diffusers/pipelines/alt_diffusion/pipeline_alt_diffusion_img2img.py b/src/diffusers/pipelines/alt_diffusion/pipeline_alt_diffusion_img2img.py index 2c2fa927be8c..4cff3c9c2bfc 100644 --- a/src/diffusers/pipelines/alt_diffusion/pipeline_alt_diffusion_img2img.py +++ b/src/diffusers/pipelines/alt_diffusion/pipeline_alt_diffusion_img2img.py @@ -590,7 +590,10 @@ def __call__( prompt (`str` or `List[str]`, *optional*): The prompt or prompts to guide image generation. If not defined, you need to pass `prompt_embeds`. 
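For reference, the mask conventions documented above (single channel, values in [0, 1], white pixels repainted, black pixels preserved) can be satisfied directly with a numpy array or a torch tensor. A minimal sketch, independent of any particular pipeline or checkpoint; the final call is only illustrative:

    import numpy as np
    import torch

    # numpy mask: shape (H, W) or (H, W, 1), float values in [0, 1]
    mask_np = np.zeros((512, 512), dtype=np.float32)
    mask_np[128:384, 128:384] = 1.0  # repaint the central square, preserve the rest

    # torch mask: shape (1, H, W) or (B, 1, H, W), same value range
    mask_pt = torch.from_numpy(mask_np)[None, ...]

    # either object is a valid `mask_image`, e.g.
    # pipe(prompt=..., image=init_image, mask_image=mask_np)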
image (`torch.FloatTensor`, `PIL.Image.Image`, `np.ndarray`, `List[torch.FloatTensor]`, `List[PIL.Image.Image]`, or `List[np.ndarray]`): - `Image` or tensor representing an image batch to be used as the starting point. Can also accept image + `Image`, numpy array or tensor representing an image batch to be used as the starting point. For both + numpy array and pytorch tensor, the expected value range is between `[0, 1]` If it's a tensor or a list + or tensors, the expected shape should be `(B, C, H, W)` or `(C, H, W)`. If it is a numpy array or a + list of arrays, the expected shape should be `(B, H, W, C)` or `(H, W, C)` It can also accept image latents as `image`, but if passing latents directly it is not encoded again. strength (`float`, *optional*, defaults to 0.8): Indicates extent to transform the reference `image`. Must be between 0 and 1. `image` is used as a From b813ee6c93ba9f44e3602eb40b159094cbd95cf2 Mon Sep 17 00:00:00 2001 From: yiyixuxu Date: Thu, 24 Aug 2023 01:03:17 +0000 Subject: [PATCH 16/23] add ImageInput type --- src/diffusers/image_processor.py | 10 +++++++ .../pipeline_alt_diffusion_img2img.py | 11 ++----- .../controlnet/pipeline_controlnet.py | 11 ++----- .../controlnet/pipeline_controlnet_img2img.py | 20 ++----------- .../controlnet/pipeline_controlnet_inpaint.py | 29 +++---------------- .../controlnet/pipeline_controlnet_sd_xl.py | 11 ++----- .../pipeline_cycle_diffusion.py | 11 ++----- .../pipeline_stable_diffusion_depth2img.py | 11 ++----- .../pipeline_stable_diffusion_img2img.py | 11 ++----- .../pipeline_stable_diffusion_inpaint.py | 20 ++----------- ...eline_stable_diffusion_instruct_pix2pix.py | 11 ++----- ...ipeline_stable_diffusion_latent_upscale.py | 11 ++----- .../pipeline_stable_diffusion_pix2pix_zero.py | 11 ++----- .../pipeline_stable_diffusion_upscale.py | 11 ++----- .../pipeline_stable_diffusion_xl_img2img.py | 12 ++------ 15 files changed, 42 insertions(+), 159 deletions(-) diff --git a/src/diffusers/image_processor.py b/src/diffusers/image_processor.py index dda5182e1ed7..a298e7b50ad8 100644 --- a/src/diffusers/image_processor.py +++ b/src/diffusers/image_processor.py @@ -24,6 +24,16 @@ from .utils import CONFIG_NAME, PIL_INTERPOLATION, deprecate +ImageInput = Union[ + "PIL.Image.Image", + np.ndarray, + "torch.FloatTensor", + List["PIL.Image.Image"], + List[np.ndarray], + List["torch.FloatTensor"], +] + + class VaeImageProcessor(ConfigMixin): """ Image processor for VAE. 
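The `ImageInput` alias introduced here only names the union of input types the pipeline signatures already spelled out inline; it is renamed `PipelineImageInput` in the next patch, so the sketch below uses the final name. A minimal illustration of the kinds of values the alias covers, assuming a diffusers version that includes this refactor; the `describe` helper is just for illustration:

    import numpy as np
    import PIL.Image
    import torch
    from diffusers.image_processor import PipelineImageInput

    def describe(image: PipelineImageInput) -> str:
        # accepted: a PIL image, a numpy array, a torch tensor, or a list of any of these
        if isinstance(image, list):
            return f"list of {len(image)} images"
        return type(image).__name__

    print(describe(PIL.Image.new("RGB", (64, 64))))            # PIL image
    print(describe(np.zeros((64, 64, 3), dtype=np.float32)))   # numpy, (H, W, C) in [0, 1]
    print(describe(torch.zeros(1, 3, 64, 64)))                 # torch, (B, C, H, W) in [0, 1]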
diff --git a/src/diffusers/pipelines/alt_diffusion/pipeline_alt_diffusion_img2img.py b/src/diffusers/pipelines/alt_diffusion/pipeline_alt_diffusion_img2img.py index 4cff3c9c2bfc..6d164baeef96 100644 --- a/src/diffusers/pipelines/alt_diffusion/pipeline_alt_diffusion_img2img.py +++ b/src/diffusers/pipelines/alt_diffusion/pipeline_alt_diffusion_img2img.py @@ -25,7 +25,7 @@ from diffusers.utils import is_accelerate_available, is_accelerate_version from ...configuration_utils import FrozenDict -from ...image_processor import VaeImageProcessor +from ...image_processor import ImageInput, VaeImageProcessor from ...loaders import FromSingleFileMixin, LoraLoaderMixin, TextualInversionLoaderMixin from ...models import AutoencoderKL, UNet2DConditionModel from ...schedulers import KarrasDiffusionSchedulers @@ -560,14 +560,7 @@ def prepare_latents(self, image, timestep, batch_size, num_images_per_prompt, dt def __call__( self, prompt: Union[str, List[str]] = None, - image: Union[ - torch.FloatTensor, - PIL.Image.Image, - np.ndarray, - List[torch.FloatTensor], - List[PIL.Image.Image], - List[np.ndarray], - ] = None, + image: ImageInput = None, strength: float = 0.8, num_inference_steps: Optional[int] = 50, guidance_scale: Optional[float] = 7.5, diff --git a/src/diffusers/pipelines/controlnet/pipeline_controlnet.py b/src/diffusers/pipelines/controlnet/pipeline_controlnet.py index 4320240dac63..7d11e3da6a07 100644 --- a/src/diffusers/pipelines/controlnet/pipeline_controlnet.py +++ b/src/diffusers/pipelines/controlnet/pipeline_controlnet.py @@ -23,7 +23,7 @@ import torch.nn.functional as F from transformers import CLIPImageProcessor, CLIPTextModel, CLIPTokenizer -from ...image_processor import VaeImageProcessor +from ...image_processor import ImageInput, VaeImageProcessor from ...loaders import FromSingleFileMixin, LoraLoaderMixin, TextualInversionLoaderMixin from ...models import AutoencoderKL, ControlNetModel, UNet2DConditionModel from ...schedulers import KarrasDiffusionSchedulers @@ -665,14 +665,7 @@ def prepare_latents(self, batch_size, num_channels_latents, height, width, dtype def __call__( self, prompt: Union[str, List[str]] = None, - image: Union[ - torch.FloatTensor, - PIL.Image.Image, - np.ndarray, - List[torch.FloatTensor], - List[PIL.Image.Image], - List[np.ndarray], - ] = None, + image: ImageInput = None, height: Optional[int] = None, width: Optional[int] = None, num_inference_steps: int = 50, diff --git a/src/diffusers/pipelines/controlnet/pipeline_controlnet_img2img.py b/src/diffusers/pipelines/controlnet/pipeline_controlnet_img2img.py index db57d556ad26..24d0acacac20 100644 --- a/src/diffusers/pipelines/controlnet/pipeline_controlnet_img2img.py +++ b/src/diffusers/pipelines/controlnet/pipeline_controlnet_img2img.py @@ -23,7 +23,7 @@ import torch.nn.functional as F from transformers import CLIPImageProcessor, CLIPTextModel, CLIPTokenizer -from ...image_processor import VaeImageProcessor +from ...image_processor import ImageInput, VaeImageProcessor from ...loaders import FromSingleFileMixin, LoraLoaderMixin, TextualInversionLoaderMixin from ...models import AutoencoderKL, ControlNetModel, UNet2DConditionModel from ...schedulers import KarrasDiffusionSchedulers @@ -743,22 +743,8 @@ def prepare_latents(self, image, timestep, batch_size, num_images_per_prompt, dt def __call__( self, prompt: Union[str, List[str]] = None, - image: Union[ - torch.FloatTensor, - PIL.Image.Image, - np.ndarray, - List[torch.FloatTensor], - List[PIL.Image.Image], - List[np.ndarray], - ] = None, - control_image: Union[ 
- torch.FloatTensor, - PIL.Image.Image, - np.ndarray, - List[torch.FloatTensor], - List[PIL.Image.Image], - List[np.ndarray], - ] = None, + image: ImageInput = None, + control_image: ImageInput = None, height: Optional[int] = None, width: Optional[int] = None, strength: float = 0.8, diff --git a/src/diffusers/pipelines/controlnet/pipeline_controlnet_inpaint.py b/src/diffusers/pipelines/controlnet/pipeline_controlnet_inpaint.py index 0590411a1f2d..1497d65fb3a9 100644 --- a/src/diffusers/pipelines/controlnet/pipeline_controlnet_inpaint.py +++ b/src/diffusers/pipelines/controlnet/pipeline_controlnet_inpaint.py @@ -24,7 +24,7 @@ import torch.nn.functional as F from transformers import CLIPImageProcessor, CLIPTextModel, CLIPTokenizer -from ...image_processor import VaeImageProcessor +from ...image_processor import ImageInput, VaeImageProcessor from ...loaders import FromSingleFileMixin, LoraLoaderMixin, TextualInversionLoaderMixin from ...models import AutoencoderKL, ControlNetModel, UNet2DConditionModel from ...schedulers import KarrasDiffusionSchedulers @@ -925,30 +925,9 @@ def _encode_vae_image(self, image: torch.Tensor, generator: torch.Generator): def __call__( self, prompt: Union[str, List[str]] = None, - image: Union[ - torch.FloatTensor, - PIL.Image.Image, - np.ndarray, - List[torch.FloatTensor], - List[PIL.Image.Image], - List[np.ndarray], - ] = None, - mask_image: Union[ - torch.FloatTensor, - PIL.Image.Image, - np.ndarray, - List[torch.FloatTensor], - List[PIL.Image.Image], - List[np.ndarray], - ] = None, - control_image: Union[ - torch.FloatTensor, - PIL.Image.Image, - np.ndarray, - List[torch.FloatTensor], - List[PIL.Image.Image], - List[np.ndarray], - ] = None, + image: ImageInput = None, + mask_image: ImageInput = None, + control_image: ImageInput = None, height: Optional[int] = None, width: Optional[int] = None, strength: float = 1.0, diff --git a/src/diffusers/pipelines/controlnet/pipeline_controlnet_sd_xl.py b/src/diffusers/pipelines/controlnet/pipeline_controlnet_sd_xl.py index 29d153ba0485..e07866095f20 100644 --- a/src/diffusers/pipelines/controlnet/pipeline_controlnet_sd_xl.py +++ b/src/diffusers/pipelines/controlnet/pipeline_controlnet_sd_xl.py @@ -24,7 +24,7 @@ from diffusers.utils.import_utils import is_invisible_watermark_available -from ...image_processor import VaeImageProcessor +from ...image_processor import ImageInput, VaeImageProcessor from ...loaders import LoraLoaderMixin, TextualInversionLoaderMixin from ...models import AutoencoderKL, ControlNetModel, UNet2DConditionModel from ...models.attention_processor import ( @@ -648,14 +648,7 @@ def __call__( self, prompt: Union[str, List[str]] = None, prompt_2: Optional[Union[str, List[str]]] = None, - image: Union[ - torch.FloatTensor, - PIL.Image.Image, - np.ndarray, - List[torch.FloatTensor], - List[PIL.Image.Image], - List[np.ndarray], - ] = None, + image: ImageInput = None, height: Optional[int] = None, width: Optional[int] = None, num_inference_steps: int = 50, diff --git a/src/diffusers/pipelines/stable_diffusion/pipeline_cycle_diffusion.py b/src/diffusers/pipelines/stable_diffusion/pipeline_cycle_diffusion.py index 29fd5a2df3d0..c22ab19f21fa 100644 --- a/src/diffusers/pipelines/stable_diffusion/pipeline_cycle_diffusion.py +++ b/src/diffusers/pipelines/stable_diffusion/pipeline_cycle_diffusion.py @@ -25,7 +25,7 @@ from diffusers.utils import is_accelerate_available, is_accelerate_version from ...configuration_utils import FrozenDict -from ...image_processor import VaeImageProcessor +from ...image_processor 
import ImageInput, VaeImageProcessor from ...loaders import LoraLoaderMixin, TextualInversionLoaderMixin from ...models import AutoencoderKL, UNet2DConditionModel from ...schedulers import DDIMScheduler @@ -571,14 +571,7 @@ def __call__( self, prompt: Union[str, List[str]], source_prompt: Union[str, List[str]], - image: Union[ - torch.FloatTensor, - PIL.Image.Image, - np.ndarray, - List[torch.FloatTensor], - List[PIL.Image.Image], - List[np.ndarray], - ] = None, + image: ImageInput = None, strength: float = 0.8, num_inference_steps: Optional[int] = 50, guidance_scale: Optional[float] = 7.5, diff --git a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_depth2img.py b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_depth2img.py index b976482a9165..96119971dbcc 100644 --- a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_depth2img.py +++ b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_depth2img.py @@ -24,7 +24,7 @@ from transformers import CLIPTextModel, CLIPTokenizer, DPTFeatureExtractor, DPTForDepthEstimation from ...configuration_utils import FrozenDict -from ...image_processor import VaeImageProcessor +from ...image_processor import ImageInput, VaeImageProcessor from ...loaders import LoraLoaderMixin, TextualInversionLoaderMixin from ...models import AutoencoderKL, UNet2DConditionModel from ...schedulers import KarrasDiffusionSchedulers @@ -492,14 +492,7 @@ def prepare_depth_map(self, image, depth_map, batch_size, do_classifier_free_gui def __call__( self, prompt: Union[str, List[str]] = None, - image: Union[ - torch.FloatTensor, - PIL.Image.Image, - np.ndarray, - List[torch.FloatTensor], - List[PIL.Image.Image], - List[np.ndarray], - ] = None, + image: ImageInput = None, depth_map: Optional[torch.FloatTensor] = None, strength: float = 0.8, num_inference_steps: Optional[int] = 50, diff --git a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_img2img.py b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_img2img.py index 355910747637..66eb9e728e3f 100644 --- a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_img2img.py +++ b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_img2img.py @@ -23,7 +23,7 @@ from transformers import CLIPImageProcessor, CLIPTextModel, CLIPTokenizer from ...configuration_utils import FrozenDict -from ...image_processor import VaeImageProcessor +from ...image_processor import ImageInput, VaeImageProcessor from ...loaders import FromSingleFileMixin, LoraLoaderMixin, TextualInversionLoaderMixin from ...models import AutoencoderKL, UNet2DConditionModel from ...schedulers import KarrasDiffusionSchedulers @@ -566,14 +566,7 @@ def prepare_latents(self, image, timestep, batch_size, num_images_per_prompt, dt def __call__( self, prompt: Union[str, List[str]] = None, - image: Union[ - torch.FloatTensor, - PIL.Image.Image, - np.ndarray, - List[torch.FloatTensor], - List[PIL.Image.Image], - List[np.ndarray], - ] = None, + image: ImageInput = None, strength: float = 0.8, num_inference_steps: Optional[int] = 50, guidance_scale: Optional[float] = 7.5, diff --git a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_inpaint.py b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_inpaint.py index bc3ad52e8d4b..4990396879d4 100644 --- a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_inpaint.py +++ b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_inpaint.py @@ -23,7 
+23,7 @@ from transformers import CLIPImageProcessor, CLIPTextModel, CLIPTokenizer from ...configuration_utils import FrozenDict -from ...image_processor import VaeImageProcessor +from ...image_processor import ImageInput, VaeImageProcessor from ...loaders import FromSingleFileMixin, LoraLoaderMixin, TextualInversionLoaderMixin from ...models import AsymmetricAutoencoderKL, AutoencoderKL, UNet2DConditionModel from ...schedulers import KarrasDiffusionSchedulers @@ -680,22 +680,8 @@ def get_timesteps(self, num_inference_steps, strength, device): def __call__( self, prompt: Union[str, List[str]] = None, - image: Union[ - torch.FloatTensor, - PIL.Image.Image, - np.ndarray, - List[torch.FloatTensor], - List[PIL.Image.Image], - List[np.ndarray], - ] = None, - mask_image: Union[ - torch.FloatTensor, - PIL.Image.Image, - np.ndarray, - List[torch.FloatTensor], - List[PIL.Image.Image], - List[np.ndarray], - ] = None, + image: ImageInput = None, + mask_image: ImageInput = None, height: Optional[int] = None, width: Optional[int] = None, strength: float = 1.0, diff --git a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_instruct_pix2pix.py b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_instruct_pix2pix.py index d27f8a21f369..02ebfcc54297 100644 --- a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_instruct_pix2pix.py +++ b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_instruct_pix2pix.py @@ -21,7 +21,7 @@ import torch from transformers import CLIPImageProcessor, CLIPTextModel, CLIPTokenizer -from ...image_processor import VaeImageProcessor +from ...image_processor import ImageInput, VaeImageProcessor from ...loaders import LoraLoaderMixin, TextualInversionLoaderMixin from ...models import AutoencoderKL, UNet2DConditionModel from ...schedulers import KarrasDiffusionSchedulers @@ -147,14 +147,7 @@ def __init__( def __call__( self, prompt: Union[str, List[str]] = None, - image: Union[ - torch.FloatTensor, - PIL.Image.Image, - np.ndarray, - List[torch.FloatTensor], - List[PIL.Image.Image], - List[np.ndarray], - ] = None, + image: ImageInput = None, num_inference_steps: int = 100, guidance_scale: float = 7.5, image_guidance_scale: float = 1.5, diff --git a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_latent_upscale.py b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_latent_upscale.py index cad82cb71940..fbc4a537c2e0 100644 --- a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_latent_upscale.py +++ b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_latent_upscale.py @@ -21,7 +21,7 @@ import torch.nn.functional as F from transformers import CLIPTextModel, CLIPTokenizer -from ...image_processor import VaeImageProcessor +from ...image_processor import ImageInput, VaeImageProcessor from ...models import AutoencoderKL, UNet2DConditionModel from ...schedulers import EulerDiscreteScheduler from ...utils import logging, randn_tensor @@ -257,14 +257,7 @@ def prepare_latents(self, batch_size, num_channels_latents, height, width, dtype def __call__( self, prompt: Union[str, List[str]], - image: Union[ - torch.FloatTensor, - PIL.Image.Image, - np.ndarray, - List[torch.FloatTensor], - List[PIL.Image.Image], - List[np.ndarray], - ] = None, + image: ImageInput = None, num_inference_steps: int = 75, guidance_scale: float = 9.0, negative_prompt: Optional[Union[str, List[str]]] = None, diff --git 
a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_pix2pix_zero.py b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_pix2pix_zero.py index 960c4369e45a..b3df3785d147 100644 --- a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_pix2pix_zero.py +++ b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_pix2pix_zero.py @@ -29,7 +29,7 @@ CLIPTokenizer, ) -from ...image_processor import VaeImageProcessor +from ...image_processor import ImageInput, VaeImageProcessor from ...loaders import LoraLoaderMixin, TextualInversionLoaderMixin from ...models import AutoencoderKL, UNet2DConditionModel from ...models.attention_processor import Attention @@ -1056,14 +1056,7 @@ def __call__( def invert( self, prompt: Optional[str] = None, - image: Union[ - torch.FloatTensor, - PIL.Image.Image, - np.ndarray, - List[torch.FloatTensor], - List[PIL.Image.Image], - List[np.ndarray], - ] = None, + image: ImageInput = None, num_inference_steps: int = 50, guidance_scale: float = 1, generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None, diff --git a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_upscale.py b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_upscale.py index 582bf6223d44..0542b254df52 100644 --- a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_upscale.py +++ b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_upscale.py @@ -21,7 +21,7 @@ import torch from transformers import CLIPImageProcessor, CLIPTextModel, CLIPTokenizer -from ...image_processor import VaeImageProcessor +from ...image_processor import ImageInput, VaeImageProcessor from ...loaders import LoraLoaderMixin, TextualInversionLoaderMixin from ...models import AutoencoderKL, UNet2DConditionModel from ...models.attention_processor import ( @@ -489,14 +489,7 @@ def upcast_vae(self): def __call__( self, prompt: Union[str, List[str]] = None, - image: Union[ - torch.FloatTensor, - PIL.Image.Image, - np.ndarray, - List[torch.FloatTensor], - List[PIL.Image.Image], - List[np.ndarray], - ] = None, + image: ImageInput = None, num_inference_steps: int = 75, guidance_scale: float = 9.0, noise_level: int = 20, diff --git a/src/diffusers/pipelines/stable_diffusion_xl/pipeline_stable_diffusion_xl_img2img.py b/src/diffusers/pipelines/stable_diffusion_xl/pipeline_stable_diffusion_xl_img2img.py index e69e4bc74d43..e14b26976fab 100644 --- a/src/diffusers/pipelines/stable_diffusion_xl/pipeline_stable_diffusion_xl_img2img.py +++ b/src/diffusers/pipelines/stable_diffusion_xl/pipeline_stable_diffusion_xl_img2img.py @@ -15,12 +15,11 @@ import inspect from typing import Any, Callable, Dict, List, Optional, Tuple, Union -import numpy as np import PIL.Image import torch from transformers import CLIPTextModel, CLIPTextModelWithProjection, CLIPTokenizer -from ...image_processor import VaeImageProcessor +from ...image_processor import ImageInput, VaeImageProcessor from ...loaders import FromSingleFileMixin, LoraLoaderMixin, TextualInversionLoaderMixin from ...models import AutoencoderKL, UNet2DConditionModel from ...models.attention_processor import ( @@ -656,14 +655,7 @@ def __call__( self, prompt: Union[str, List[str]] = None, prompt_2: Optional[Union[str, List[str]]] = None, - image: Union[ - torch.FloatTensor, - PIL.Image.Image, - np.ndarray, - List[torch.FloatTensor], - List[PIL.Image.Image], - List[np.ndarray], - ] = None, + image: ImageInput = None, strength: float = 0.3, num_inference_steps: int = 50, 
denoising_start: Optional[float] = None, From 8a225bce875f6f9c9574c21ef366efa455d93dfa Mon Sep 17 00:00:00 2001 From: yiyixuxu Date: Thu, 24 Aug 2023 01:12:53 +0000 Subject: [PATCH 17/23] fix --- src/diffusers/image_processor.py | 10 +++++----- .../alt_diffusion/pipeline_alt_diffusion_img2img.py | 4 ++-- .../pipelines/controlnet/pipeline_controlnet.py | 4 ++-- .../controlnet/pipeline_controlnet_img2img.py | 6 +++--- .../controlnet/pipeline_controlnet_inpaint.py | 8 ++++---- .../pipelines/controlnet/pipeline_controlnet_sd_xl.py | 4 ++-- .../stable_diffusion/pipeline_cycle_diffusion.py | 4 ++-- .../pipeline_stable_diffusion_depth2img.py | 4 ++-- .../pipeline_stable_diffusion_img2img.py | 4 ++-- .../pipeline_stable_diffusion_inpaint.py | 6 +++--- .../pipeline_stable_diffusion_instruct_pix2pix.py | 4 ++-- .../pipeline_stable_diffusion_latent_upscale.py | 4 ++-- .../pipeline_stable_diffusion_pix2pix_zero.py | 4 ++-- .../pipeline_stable_diffusion_upscale.py | 4 ++-- .../pipeline_stable_diffusion_xl_img2img.py | 4 ++-- 15 files changed, 37 insertions(+), 37 deletions(-) diff --git a/src/diffusers/image_processor.py b/src/diffusers/image_processor.py index a298e7b50ad8..097257c1a5a8 100644 --- a/src/diffusers/image_processor.py +++ b/src/diffusers/image_processor.py @@ -24,13 +24,13 @@ from .utils import CONFIG_NAME, PIL_INTERPOLATION, deprecate -ImageInput = Union[ - "PIL.Image.Image", +PipelineImageInput = Union[ + PIL.Image.Image, np.ndarray, - "torch.FloatTensor", - List["PIL.Image.Image"], + torch.FloatTensor, + List[PIL.Image.Image], List[np.ndarray], - List["torch.FloatTensor"], + List[torch.FloatTensor], ] diff --git a/src/diffusers/pipelines/alt_diffusion/pipeline_alt_diffusion_img2img.py b/src/diffusers/pipelines/alt_diffusion/pipeline_alt_diffusion_img2img.py index 6d164baeef96..0d8a34425fef 100644 --- a/src/diffusers/pipelines/alt_diffusion/pipeline_alt_diffusion_img2img.py +++ b/src/diffusers/pipelines/alt_diffusion/pipeline_alt_diffusion_img2img.py @@ -25,7 +25,7 @@ from diffusers.utils import is_accelerate_available, is_accelerate_version from ...configuration_utils import FrozenDict -from ...image_processor import ImageInput, VaeImageProcessor +from ...image_processor import PipelineImageInput, VaeImageProcessor from ...loaders import FromSingleFileMixin, LoraLoaderMixin, TextualInversionLoaderMixin from ...models import AutoencoderKL, UNet2DConditionModel from ...schedulers import KarrasDiffusionSchedulers @@ -560,7 +560,7 @@ def prepare_latents(self, image, timestep, batch_size, num_images_per_prompt, dt def __call__( self, prompt: Union[str, List[str]] = None, - image: ImageInput = None, + image: PipelineImageInput = None, strength: float = 0.8, num_inference_steps: Optional[int] = 50, guidance_scale: Optional[float] = 7.5, diff --git a/src/diffusers/pipelines/controlnet/pipeline_controlnet.py b/src/diffusers/pipelines/controlnet/pipeline_controlnet.py index 7d11e3da6a07..58ae74149134 100644 --- a/src/diffusers/pipelines/controlnet/pipeline_controlnet.py +++ b/src/diffusers/pipelines/controlnet/pipeline_controlnet.py @@ -23,7 +23,7 @@ import torch.nn.functional as F from transformers import CLIPImageProcessor, CLIPTextModel, CLIPTokenizer -from ...image_processor import ImageInput, VaeImageProcessor +from ...image_processor import PipelineImageInput, VaeImageProcessor from ...loaders import FromSingleFileMixin, LoraLoaderMixin, TextualInversionLoaderMixin from ...models import AutoencoderKL, ControlNetModel, UNet2DConditionModel from ...schedulers import 
KarrasDiffusionSchedulers @@ -665,7 +665,7 @@ def prepare_latents(self, batch_size, num_channels_latents, height, width, dtype def __call__( self, prompt: Union[str, List[str]] = None, - image: ImageInput = None, + image: PipelineImageInput = None, height: Optional[int] = None, width: Optional[int] = None, num_inference_steps: int = 50, diff --git a/src/diffusers/pipelines/controlnet/pipeline_controlnet_img2img.py b/src/diffusers/pipelines/controlnet/pipeline_controlnet_img2img.py index 24d0acacac20..4ed5d25b5bd5 100644 --- a/src/diffusers/pipelines/controlnet/pipeline_controlnet_img2img.py +++ b/src/diffusers/pipelines/controlnet/pipeline_controlnet_img2img.py @@ -23,7 +23,7 @@ import torch.nn.functional as F from transformers import CLIPImageProcessor, CLIPTextModel, CLIPTokenizer -from ...image_processor import ImageInput, VaeImageProcessor +from ...image_processor import PipelineImageInput, VaeImageProcessor from ...loaders import FromSingleFileMixin, LoraLoaderMixin, TextualInversionLoaderMixin from ...models import AutoencoderKL, ControlNetModel, UNet2DConditionModel from ...schedulers import KarrasDiffusionSchedulers @@ -743,8 +743,8 @@ def prepare_latents(self, image, timestep, batch_size, num_images_per_prompt, dt def __call__( self, prompt: Union[str, List[str]] = None, - image: ImageInput = None, - control_image: ImageInput = None, + image: PipelineImageInput = None, + control_image: PipelineImageInput = None, height: Optional[int] = None, width: Optional[int] = None, strength: float = 0.8, diff --git a/src/diffusers/pipelines/controlnet/pipeline_controlnet_inpaint.py b/src/diffusers/pipelines/controlnet/pipeline_controlnet_inpaint.py index 1497d65fb3a9..977c1bb0f80c 100644 --- a/src/diffusers/pipelines/controlnet/pipeline_controlnet_inpaint.py +++ b/src/diffusers/pipelines/controlnet/pipeline_controlnet_inpaint.py @@ -24,7 +24,7 @@ import torch.nn.functional as F from transformers import CLIPImageProcessor, CLIPTextModel, CLIPTokenizer -from ...image_processor import ImageInput, VaeImageProcessor +from ...image_processor import PipelineImageInput, VaeImageProcessor from ...loaders import FromSingleFileMixin, LoraLoaderMixin, TextualInversionLoaderMixin from ...models import AutoencoderKL, ControlNetModel, UNet2DConditionModel from ...schedulers import KarrasDiffusionSchedulers @@ -925,9 +925,9 @@ def _encode_vae_image(self, image: torch.Tensor, generator: torch.Generator): def __call__( self, prompt: Union[str, List[str]] = None, - image: ImageInput = None, - mask_image: ImageInput = None, - control_image: ImageInput = None, + image: PipelineImageInput = None, + mask_image: PipelineImageInput = None, + control_image: PipelineImageInput = None, height: Optional[int] = None, width: Optional[int] = None, strength: float = 1.0, diff --git a/src/diffusers/pipelines/controlnet/pipeline_controlnet_sd_xl.py b/src/diffusers/pipelines/controlnet/pipeline_controlnet_sd_xl.py index e07866095f20..fcc92a066e21 100644 --- a/src/diffusers/pipelines/controlnet/pipeline_controlnet_sd_xl.py +++ b/src/diffusers/pipelines/controlnet/pipeline_controlnet_sd_xl.py @@ -24,7 +24,7 @@ from diffusers.utils.import_utils import is_invisible_watermark_available -from ...image_processor import ImageInput, VaeImageProcessor +from ...image_processor import PipelineImageInput, VaeImageProcessor from ...loaders import LoraLoaderMixin, TextualInversionLoaderMixin from ...models import AutoencoderKL, ControlNetModel, UNet2DConditionModel from ...models.attention_processor import ( @@ -648,7 +648,7 @@ def __call__( 
self, prompt: Union[str, List[str]] = None, prompt_2: Optional[Union[str, List[str]]] = None, - image: ImageInput = None, + image: PipelineImageInput = None, height: Optional[int] = None, width: Optional[int] = None, num_inference_steps: int = 50, diff --git a/src/diffusers/pipelines/stable_diffusion/pipeline_cycle_diffusion.py b/src/diffusers/pipelines/stable_diffusion/pipeline_cycle_diffusion.py index c22ab19f21fa..cc64a4a019a2 100644 --- a/src/diffusers/pipelines/stable_diffusion/pipeline_cycle_diffusion.py +++ b/src/diffusers/pipelines/stable_diffusion/pipeline_cycle_diffusion.py @@ -25,7 +25,7 @@ from diffusers.utils import is_accelerate_available, is_accelerate_version from ...configuration_utils import FrozenDict -from ...image_processor import ImageInput, VaeImageProcessor +from ...image_processor import PipelineImageInput, VaeImageProcessor from ...loaders import LoraLoaderMixin, TextualInversionLoaderMixin from ...models import AutoencoderKL, UNet2DConditionModel from ...schedulers import DDIMScheduler @@ -571,7 +571,7 @@ def __call__( self, prompt: Union[str, List[str]], source_prompt: Union[str, List[str]], - image: ImageInput = None, + image: PipelineImageInput = None, strength: float = 0.8, num_inference_steps: Optional[int] = 50, guidance_scale: Optional[float] = 7.5, diff --git a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_depth2img.py b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_depth2img.py index 96119971dbcc..53dc28397933 100644 --- a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_depth2img.py +++ b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_depth2img.py @@ -24,7 +24,7 @@ from transformers import CLIPTextModel, CLIPTokenizer, DPTFeatureExtractor, DPTForDepthEstimation from ...configuration_utils import FrozenDict -from ...image_processor import ImageInput, VaeImageProcessor +from ...image_processor import PipelineImageInput, VaeImageProcessor from ...loaders import LoraLoaderMixin, TextualInversionLoaderMixin from ...models import AutoencoderKL, UNet2DConditionModel from ...schedulers import KarrasDiffusionSchedulers @@ -492,7 +492,7 @@ def prepare_depth_map(self, image, depth_map, batch_size, do_classifier_free_gui def __call__( self, prompt: Union[str, List[str]] = None, - image: ImageInput = None, + image: PipelineImageInput = None, depth_map: Optional[torch.FloatTensor] = None, strength: float = 0.8, num_inference_steps: Optional[int] = 50, diff --git a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_img2img.py b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_img2img.py index 66eb9e728e3f..f01cc0758e36 100644 --- a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_img2img.py +++ b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_img2img.py @@ -23,7 +23,7 @@ from transformers import CLIPImageProcessor, CLIPTextModel, CLIPTokenizer from ...configuration_utils import FrozenDict -from ...image_processor import ImageInput, VaeImageProcessor +from ...image_processor import PipelineImageInput, VaeImageProcessor from ...loaders import FromSingleFileMixin, LoraLoaderMixin, TextualInversionLoaderMixin from ...models import AutoencoderKL, UNet2DConditionModel from ...schedulers import KarrasDiffusionSchedulers @@ -566,7 +566,7 @@ def prepare_latents(self, image, timestep, batch_size, num_images_per_prompt, dt def __call__( self, prompt: Union[str, List[str]] = None, - image: ImageInput = None, + image: 
PipelineImageInput = None, strength: float = 0.8, num_inference_steps: Optional[int] = 50, guidance_scale: Optional[float] = 7.5, diff --git a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_inpaint.py b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_inpaint.py index 4990396879d4..c31d78ab50fb 100644 --- a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_inpaint.py +++ b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_inpaint.py @@ -23,7 +23,7 @@ from transformers import CLIPImageProcessor, CLIPTextModel, CLIPTokenizer from ...configuration_utils import FrozenDict -from ...image_processor import ImageInput, VaeImageProcessor +from ...image_processor import PipelineImageInput, VaeImageProcessor from ...loaders import FromSingleFileMixin, LoraLoaderMixin, TextualInversionLoaderMixin from ...models import AsymmetricAutoencoderKL, AutoencoderKL, UNet2DConditionModel from ...schedulers import KarrasDiffusionSchedulers @@ -680,8 +680,8 @@ def get_timesteps(self, num_inference_steps, strength, device): def __call__( self, prompt: Union[str, List[str]] = None, - image: ImageInput = None, - mask_image: ImageInput = None, + image: PipelineImageInput = None, + mask_image: PipelineImageInput = None, height: Optional[int] = None, width: Optional[int] = None, strength: float = 1.0, diff --git a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_instruct_pix2pix.py b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_instruct_pix2pix.py index 02ebfcc54297..3253c135d6e6 100644 --- a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_instruct_pix2pix.py +++ b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_instruct_pix2pix.py @@ -21,7 +21,7 @@ import torch from transformers import CLIPImageProcessor, CLIPTextModel, CLIPTokenizer -from ...image_processor import ImageInput, VaeImageProcessor +from ...image_processor import PipelineImageInput, VaeImageProcessor from ...loaders import LoraLoaderMixin, TextualInversionLoaderMixin from ...models import AutoencoderKL, UNet2DConditionModel from ...schedulers import KarrasDiffusionSchedulers @@ -147,7 +147,7 @@ def __init__( def __call__( self, prompt: Union[str, List[str]] = None, - image: ImageInput = None, + image: PipelineImageInput = None, num_inference_steps: int = 100, guidance_scale: float = 7.5, image_guidance_scale: float = 1.5, diff --git a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_latent_upscale.py b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_latent_upscale.py index fbc4a537c2e0..79501a78cdd1 100644 --- a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_latent_upscale.py +++ b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_latent_upscale.py @@ -21,7 +21,7 @@ import torch.nn.functional as F from transformers import CLIPTextModel, CLIPTokenizer -from ...image_processor import ImageInput, VaeImageProcessor +from ...image_processor import PipelineImageInput, VaeImageProcessor from ...models import AutoencoderKL, UNet2DConditionModel from ...schedulers import EulerDiscreteScheduler from ...utils import logging, randn_tensor @@ -257,7 +257,7 @@ def prepare_latents(self, batch_size, num_channels_latents, height, width, dtype def __call__( self, prompt: Union[str, List[str]], - image: ImageInput = None, + image: PipelineImageInput = None, num_inference_steps: int = 75, guidance_scale: float = 9.0, negative_prompt: Optional[Union[str, 
List[str]]] = None, diff --git a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_pix2pix_zero.py b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_pix2pix_zero.py index b3df3785d147..de42093478d8 100644 --- a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_pix2pix_zero.py +++ b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_pix2pix_zero.py @@ -29,7 +29,7 @@ CLIPTokenizer, ) -from ...image_processor import ImageInput, VaeImageProcessor +from ...image_processor import PipelineImageInput, VaeImageProcessor from ...loaders import LoraLoaderMixin, TextualInversionLoaderMixin from ...models import AutoencoderKL, UNet2DConditionModel from ...models.attention_processor import Attention @@ -1056,7 +1056,7 @@ def __call__( def invert( self, prompt: Optional[str] = None, - image: ImageInput = None, + image: PipelineImageInput = None, num_inference_steps: int = 50, guidance_scale: float = 1, generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None, diff --git a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_upscale.py b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_upscale.py index 0542b254df52..d2e61a1b7666 100644 --- a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_upscale.py +++ b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_upscale.py @@ -21,7 +21,7 @@ import torch from transformers import CLIPImageProcessor, CLIPTextModel, CLIPTokenizer -from ...image_processor import ImageInput, VaeImageProcessor +from ...image_processor import PipelineImageInput, VaeImageProcessor from ...loaders import LoraLoaderMixin, TextualInversionLoaderMixin from ...models import AutoencoderKL, UNet2DConditionModel from ...models.attention_processor import ( @@ -489,7 +489,7 @@ def upcast_vae(self): def __call__( self, prompt: Union[str, List[str]] = None, - image: ImageInput = None, + image: PipelineImageInput = None, num_inference_steps: int = 75, guidance_scale: float = 9.0, noise_level: int = 20, diff --git a/src/diffusers/pipelines/stable_diffusion_xl/pipeline_stable_diffusion_xl_img2img.py b/src/diffusers/pipelines/stable_diffusion_xl/pipeline_stable_diffusion_xl_img2img.py index e14b26976fab..e4364ee1679d 100644 --- a/src/diffusers/pipelines/stable_diffusion_xl/pipeline_stable_diffusion_xl_img2img.py +++ b/src/diffusers/pipelines/stable_diffusion_xl/pipeline_stable_diffusion_xl_img2img.py @@ -19,7 +19,7 @@ import torch from transformers import CLIPTextModel, CLIPTextModelWithProjection, CLIPTokenizer -from ...image_processor import ImageInput, VaeImageProcessor +from ...image_processor import PipelineImageInput, VaeImageProcessor from ...loaders import FromSingleFileMixin, LoraLoaderMixin, TextualInversionLoaderMixin from ...models import AutoencoderKL, UNet2DConditionModel from ...models.attention_processor import ( @@ -655,7 +655,7 @@ def __call__( self, prompt: Union[str, List[str]] = None, prompt_2: Optional[Union[str, List[str]]] = None, - image: ImageInput = None, + image: PipelineImageInput = None, strength: float = 0.3, num_inference_steps: int = 50, denoising_start: Optional[float] = None, From 7c920aca6cc3503312eed030d3548b7d839c5fc7 Mon Sep 17 00:00:00 2001 From: yiyixuxu Date: Thu, 24 Aug 2023 01:38:48 +0000 Subject: [PATCH 18/23] warning -> deprecate --- .../controlnet/pipeline_controlnet_inpaint.py | 12 ++++++++---- .../pipeline_stable_diffusion_inpaint.py | 12 +++++++----- 2 files changed, 15 insertions(+), 9 deletions(-) diff 
--git a/src/diffusers/pipelines/controlnet/pipeline_controlnet_inpaint.py b/src/diffusers/pipelines/controlnet/pipeline_controlnet_inpaint.py index 977c1bb0f80c..fdcf4e2e37db 100644 --- a/src/diffusers/pipelines/controlnet/pipeline_controlnet_inpaint.py +++ b/src/diffusers/pipelines/controlnet/pipeline_controlnet_inpaint.py @@ -29,6 +29,7 @@ from ...models import AutoencoderKL, ControlNetModel, UNet2DConditionModel from ...schedulers import KarrasDiffusionSchedulers from ...utils import ( + deprecate, is_accelerate_available, is_accelerate_version, is_compiled_module, @@ -133,10 +134,13 @@ def prepare_mask_and_masked_image(image, mask, height, width, return_image=False tuple[torch.Tensor]: The pair (mask, masked_image) as ``torch.Tensor`` with 4 dimensions: ``batch x channels x height x width``. """ - warnings.warn( - "The prepare_mask_and_masked_image method is deprecated and will be removed in a future version. Please" - " use VaeImageProcessor.preprocess instead", - FutureWarning, + deprecate( + "prepare_mask_and_masked_image", + "0.21.0", + message=( + "The prepare_mask_and_masked_image method is deprecated and will be removed in a future version. Please" + " use VaeImageProcessor.preprocess instead", + ), ) if image is None: raise ValueError("`image` input cannot be undefined.") diff --git a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_inpaint.py b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_inpaint.py index c31d78ab50fb..034f3fd3d972 100644 --- a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_inpaint.py +++ b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_inpaint.py @@ -13,7 +13,6 @@ # limitations under the License. import inspect -import warnings from typing import Any, Callable, Dict, List, Optional, Union import numpy as np @@ -64,10 +63,13 @@ def prepare_mask_and_masked_image(image, mask, height, width, return_image: bool tuple[torch.Tensor]: The pair (mask, masked_image) as ``torch.Tensor`` with 4 dimensions: ``batch x channels x height x width``. """ - warnings.warn( - "The prepare_mask_and_masked_image method is deprecated and will be removed in a future version. Please" - " use VaeImageProcessor.preprocess instead", - FutureWarning, + deprecate( + "prepare_mask_and_masked_image", + "0.21.0", + message=( + "The prepare_mask_and_masked_image method is deprecated and will be removed in a future version. Please" + " use VaeImageProcessor.preprocess instead", + ), ) if image is None: raise ValueError("`image` input cannot be undefined.") From 317b13012f30eab66a1eb9e6112df424802f5d2c Mon Sep 17 00:00:00 2001 From: yiyixuxu Date: Thu, 24 Aug 2023 02:03:45 +0000 Subject: [PATCH 19/23] fix --- .../pipelines/controlnet/pipeline_controlnet_inpaint.py | 6 ++---- .../stable_diffusion/pipeline_stable_diffusion_inpaint.py | 6 ++---- 2 files changed, 4 insertions(+), 8 deletions(-) diff --git a/src/diffusers/pipelines/controlnet/pipeline_controlnet_inpaint.py b/src/diffusers/pipelines/controlnet/pipeline_controlnet_inpaint.py index fdcf4e2e37db..6d01c3d22e26 100644 --- a/src/diffusers/pipelines/controlnet/pipeline_controlnet_inpaint.py +++ b/src/diffusers/pipelines/controlnet/pipeline_controlnet_inpaint.py @@ -134,13 +134,11 @@ def prepare_mask_and_masked_image(image, mask, height, width, return_image=False tuple[torch.Tensor]: The pair (mask, masked_image) as ``torch.Tensor`` with 4 dimensions: ``batch x channels x height x width``. 
""" + deprecation_message = "The prepare_mask_and_masked_image method is deprecated and will be removed in a future version. Please use VaeImageProcessor.preprocess instead" deprecate( "prepare_mask_and_masked_image", "0.21.0", - message=( - "The prepare_mask_and_masked_image method is deprecated and will be removed in a future version. Please" - " use VaeImageProcessor.preprocess instead", - ), + deprecation_message, ) if image is None: raise ValueError("`image` input cannot be undefined.") diff --git a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_inpaint.py b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_inpaint.py index 034f3fd3d972..0d5f7012e981 100644 --- a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_inpaint.py +++ b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_inpaint.py @@ -63,13 +63,11 @@ def prepare_mask_and_masked_image(image, mask, height, width, return_image: bool tuple[torch.Tensor]: The pair (mask, masked_image) as ``torch.Tensor`` with 4 dimensions: ``batch x channels x height x width``. """ + deprecation_message = "The prepare_mask_and_masked_image method is deprecated and will be removed in a future version. Please use VaeImageProcessor.preprocess instead" deprecate( "prepare_mask_and_masked_image", "0.21.0", - message=( - "The prepare_mask_and_masked_image method is deprecated and will be removed in a future version. Please" - " use VaeImageProcessor.preprocess instead", - ), + deprecation_message, ) if image is None: raise ValueError("`image` input cannot be undefined.") From 280709f2bd1ec4e65b72cc48d378859d9f5d98ab Mon Sep 17 00:00:00 2001 From: yiyixuxu Date: Thu, 24 Aug 2023 02:33:10 +0000 Subject: [PATCH 20/23] refator sdxl-inpaint --- .../pipeline_stable_diffusion_xl_inpaint.py | 24 ++++++++++++++----- 1 file changed, 18 insertions(+), 6 deletions(-) diff --git a/src/diffusers/pipelines/stable_diffusion_xl/pipeline_stable_diffusion_xl_inpaint.py b/src/diffusers/pipelines/stable_diffusion_xl/pipeline_stable_diffusion_xl_inpaint.py index 8b96b558ec7c..112dec518421 100644 --- a/src/diffusers/pipelines/stable_diffusion_xl/pipeline_stable_diffusion_xl_inpaint.py +++ b/src/diffusers/pipelines/stable_diffusion_xl/pipeline_stable_diffusion_xl_inpaint.py @@ -20,7 +20,7 @@ import torch from transformers import CLIPTextModel, CLIPTextModelWithProjection, CLIPTokenizer -from ...image_processor import VaeImageProcessor +from ...image_processor import PipelineImageInput, VaeImageProcessor from ...loaders import FromSingleFileMixin, LoraLoaderMixin, TextualInversionLoaderMixin from ...models import AutoencoderKL, UNet2DConditionModel from ...models.attention_processor import ( @@ -139,6 +139,12 @@ def prepare_mask_and_masked_image(image, mask, height, width, return_image: bool """ # checkpoint. TOD(Yiyi) - need to clean this up later + deprecation_message = "The prepare_mask_and_masked_image method is deprecated and will be removed in a future version. 
Please use VaeImageProcessor.preprocess instead" + deprecate( + "prepare_mask_and_masked_image", + "0.21.0", + deprecation_message, + ) if image is None: raise ValueError("`image` input cannot be undefined.") @@ -292,6 +298,9 @@ def __init__( self.register_to_config(requires_aesthetics_score=requires_aesthetics_score) self.vae_scale_factor = 2 ** (len(self.vae.config.block_out_channels) - 1) self.image_processor = VaeImageProcessor(vae_scale_factor=self.vae_scale_factor) + self.mask_processor = VaeImageProcessor( + vae_scale_factor=self.vae_scale_factor, do_normalize=False, do_binarize=True, do_convert_grayscale=True + ) add_watermarker = add_watermarker if add_watermarker is not None else is_invisible_watermark_available() @@ -859,8 +868,8 @@ def __call__( self, prompt: Union[str, List[str]] = None, prompt_2: Optional[Union[str, List[str]]] = None, - image: Union[torch.FloatTensor, PIL.Image.Image] = None, - mask_image: Union[torch.FloatTensor, PIL.Image.Image] = None, + image: PipelineImageInput = None, + mask_image: PipelineImageInput = None, height: Optional[int] = None, width: Optional[int] = None, strength: float = 1.0, @@ -1100,9 +1109,12 @@ def denoising_value_valid(dnv): is_strength_max = strength == 1.0 # 5. Preprocess mask and image - mask, masked_image, init_image = prepare_mask_and_masked_image( - image, mask_image, height, width, return_image=True - ) + init_image = self.image_processor.preprocess(image, height=height, width=width) + init_image = init_image.to(dtype=torch.float32) + + mask = self.mask_processor.preprocess(mask_image, height=height, width=width) + + masked_image = init_image * (mask < 0.5) # 6. Prepare latent variables num_channels_latents = self.vae.config.latent_channels From 51442dcec460a4145838d518c7245f36c6c272a0 Mon Sep 17 00:00:00 2001 From: yiyixuxu Date: Thu, 24 Aug 2023 03:16:14 +0000 Subject: [PATCH 21/23] fix image latent --- .../pipeline_stable_diffusion_xl_inpaint.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/src/diffusers/pipelines/stable_diffusion_xl/pipeline_stable_diffusion_xl_inpaint.py b/src/diffusers/pipelines/stable_diffusion_xl/pipeline_stable_diffusion_xl_inpaint.py index 112dec518421..78f75dcffac8 100644 --- a/src/diffusers/pipelines/stable_diffusion_xl/pipeline_stable_diffusion_xl_inpaint.py +++ b/src/diffusers/pipelines/stable_diffusion_xl/pipeline_stable_diffusion_xl_inpaint.py @@ -1114,7 +1114,11 @@ def denoising_value_valid(dnv): mask = self.mask_processor.preprocess(mask_image, height=height, width=width) - masked_image = init_image * (mask < 0.5) + if init_image.shape[1] ==4: + # if images are in latent space, we can't mask it + masked_image = None + else: + masked_image = init_image * (mask < 0.5) # 6. 
Prepare latent variables num_channels_latents = self.vae.config.latent_channels From c24c2ad24ae9391139c35667316a0f1d753fa7a7 Mon Sep 17 00:00:00 2001 From: yiyixuxu Date: Thu, 24 Aug 2023 03:18:01 +0000 Subject: [PATCH 22/23] style --- .../pipeline_stable_diffusion_xl_inpaint.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/src/diffusers/pipelines/stable_diffusion_xl/pipeline_stable_diffusion_xl_inpaint.py b/src/diffusers/pipelines/stable_diffusion_xl/pipeline_stable_diffusion_xl_inpaint.py index 78f75dcffac8..42464d79427e 100644 --- a/src/diffusers/pipelines/stable_diffusion_xl/pipeline_stable_diffusion_xl_inpaint.py +++ b/src/diffusers/pipelines/stable_diffusion_xl/pipeline_stable_diffusion_xl_inpaint.py @@ -31,6 +31,7 @@ ) from ...schedulers import KarrasDiffusionSchedulers from ...utils import ( + deprecate, is_accelerate_available, is_accelerate_version, is_invisible_watermark_available, @@ -1114,7 +1115,7 @@ def denoising_value_valid(dnv): mask = self.mask_processor.preprocess(mask_image, height=height, width=width) - if init_image.shape[1] ==4: + if init_image.shape[1] == 4: # if images are in latent space, we can't mask it masked_image = None else: From c1719124e644bd1542ba322b8fb2fa7a2881f459 Mon Sep 17 00:00:00 2001 From: yiyixuxu Date: Thu, 24 Aug 2023 03:41:30 +0000 Subject: [PATCH 23/23] fix --- .../controlnet/pipeline_controlnet_inpaint.py | 2 +- .../pipeline_stable_diffusion_inpaint.py | 2 +- .../pipeline_stable_diffusion_xl_inpaint.py | 2 +- .../pipeline_stable_diffusion_xl_instruct_pix2pix.py | 12 ++---------- 4 files changed, 5 insertions(+), 13 deletions(-) diff --git a/src/diffusers/pipelines/controlnet/pipeline_controlnet_inpaint.py b/src/diffusers/pipelines/controlnet/pipeline_controlnet_inpaint.py index 6d01c3d22e26..145ff9714c78 100644 --- a/src/diffusers/pipelines/controlnet/pipeline_controlnet_inpaint.py +++ b/src/diffusers/pipelines/controlnet/pipeline_controlnet_inpaint.py @@ -137,7 +137,7 @@ def prepare_mask_and_masked_image(image, mask, height, width, return_image=False deprecation_message = "The prepare_mask_and_masked_image method is deprecated and will be removed in a future version. Please use VaeImageProcessor.preprocess instead" deprecate( "prepare_mask_and_masked_image", - "0.21.0", + "0.30.0", deprecation_message, ) if image is None: diff --git a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_inpaint.py b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_inpaint.py index 0d5f7012e981..67c232b143c6 100644 --- a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_inpaint.py +++ b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_inpaint.py @@ -66,7 +66,7 @@ def prepare_mask_and_masked_image(image, mask, height, width, return_image: bool deprecation_message = "The prepare_mask_and_masked_image method is deprecated and will be removed in a future version. 
Please use VaeImageProcessor.preprocess instead" deprecate( "prepare_mask_and_masked_image", - "0.21.0", + "0.30.0", deprecation_message, ) if image is None: diff --git a/src/diffusers/pipelines/stable_diffusion_xl/pipeline_stable_diffusion_xl_inpaint.py b/src/diffusers/pipelines/stable_diffusion_xl/pipeline_stable_diffusion_xl_inpaint.py index 42464d79427e..c08f3f1d5993 100644 --- a/src/diffusers/pipelines/stable_diffusion_xl/pipeline_stable_diffusion_xl_inpaint.py +++ b/src/diffusers/pipelines/stable_diffusion_xl/pipeline_stable_diffusion_xl_inpaint.py @@ -143,7 +143,7 @@ def prepare_mask_and_masked_image(image, mask, height, width, return_image: bool deprecation_message = "The prepare_mask_and_masked_image method is deprecated and will be removed in a future version. Please use VaeImageProcessor.preprocess instead" deprecate( "prepare_mask_and_masked_image", - "0.21.0", + "0.30.0", deprecation_message, ) if image is None: diff --git a/src/diffusers/pipelines/stable_diffusion_xl/pipeline_stable_diffusion_xl_instruct_pix2pix.py b/src/diffusers/pipelines/stable_diffusion_xl/pipeline_stable_diffusion_xl_instruct_pix2pix.py index eec5f840277a..3b9bf1a17671 100644 --- a/src/diffusers/pipelines/stable_diffusion_xl/pipeline_stable_diffusion_xl_instruct_pix2pix.py +++ b/src/diffusers/pipelines/stable_diffusion_xl/pipeline_stable_diffusion_xl_instruct_pix2pix.py @@ -15,12 +15,11 @@ import inspect from typing import Any, Callable, Dict, List, Optional, Tuple, Union -import numpy as np import PIL.Image import torch from transformers import CLIPTextModel, CLIPTextModelWithProjection, CLIPTokenizer -from ...image_processor import VaeImageProcessor +from ...image_processor import PipelineImageInput, VaeImageProcessor from ...loaders import FromSingleFileMixin, LoraLoaderMixin, TextualInversionLoaderMixin from ...models import AutoencoderKL, UNet2DConditionModel from ...models.attention_processor import ( @@ -588,14 +587,7 @@ def upcast_vae(self): def __call__( self, prompt: Union[str, List[str]] = None, - image: Union[ - torch.FloatTensor, - PIL.Image.Image, - np.ndarray, - List[torch.FloatTensor], - List[PIL.Image.Image], - List[np.ndarray], - ] = None, + image: PipelineImageInput = None, num_inference_steps: int = 100, guidance_scale: float = 7.5, image_guidance_scale: float = 1.5,
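To make the SDXL inpaint refactor from earlier in this series concrete: once the new `mask_processor` (configured with do_normalize=False, do_binarize=True, do_convert_grayscale=True) has produced a single-channel binary mask, the pipeline builds `masked_image` by zeroing out the region to repaint, and skips masking entirely when the input is already 4-channel latents. A simplified standalone restatement of that logic with dummy tensors and a throwaway helper, not the pipeline code itself:

    import torch

    def prepare_masked_image(init_image: torch.Tensor, mask: torch.Tensor):
        # init_image: (B, C, H, W) pixels in [-1, 1]; mask: (B, 1, H, W), 1 = repaint, 0 = preserve
        if init_image.shape[1] == 4:
            # the caller passed VAE latents, so there is nothing to mask in pixel space
            return None
        # keep preserved pixels, zero out the region that will be repainted
        return init_image * (mask < 0.5)

    image = torch.rand(1, 3, 64, 64) * 2 - 1           # normalized image
    mask = (torch.rand(1, 1, 64, 64) > 0.5).float()    # binarized mask
    masked_image = prepare_masked_image(image, mask)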