From a3c8c1c78d10557c641d49077f6778711a8a1125 Mon Sep 17 00:00:00 2001 From: yiyixuxu Date: Wed, 2 Aug 2023 22:27:28 +0000 Subject: [PATCH 01/23] refactor image processor for mask --- src/diffusers/image_processor.py | 86 +++++++++++++++++++++++--------- 1 file changed, 62 insertions(+), 24 deletions(-) diff --git a/src/diffusers/image_processor.py b/src/diffusers/image_processor.py index 6ccf9b465ebd..44152a1dfb32 100644 --- a/src/diffusers/image_processor.py +++ b/src/diffusers/image_processor.py @@ -51,7 +51,7 @@ def __init__( vae_scale_factor: int = 8, resample: str = "lanczos", do_normalize: bool = True, - do_convert_rgb: bool = False, + color_mode: Optional[str] = None, # "RGB", "L" ): super().__init__() @@ -117,13 +117,36 @@ def denormalize(images): return (images / 2 + 0.5).clamp(0, 1) @staticmethod - def convert_to_rgb(image: PIL.Image.Image) -> PIL.Image.Image: + def convert_to_mode(image: PIL.Image.Image, mode: str) -> PIL.Image.Image: """ - Converts an image to RGB format. + Converts an image to RGB or L mode. """ - image = image.convert("RGB") + image = image.convert(mode) + + if mode == "L": + image = image.unsqueeze(0) return image + def get_default_height_width(self, image, height, width): + + if height is None: + if isinstance(image, PIL.Image.Image): + height = image.height + else: + height = image.shape[2] + + if width is None: + if isinstance(image, PIL.Image.Image): + width = image.width + else: + width = image.shape[3] + + width, height = ( + x - x % self.config.vae_scale_factor for x in (width, height) + ) # resize to integer multiple of vae_scale_factor + + return height, width + def resize( self, image: PIL.Image.Image, @@ -133,17 +156,9 @@ def resize( """ Resize a PIL image. Both height and width are downscaled to the next integer multiple of `vae_scale_factor`. """ - if height is None: - height = image.height - if width is None: - width = image.width - - width, height = ( - x - x % self.config.vae_scale_factor for x in (width, height) - ) # resize to integer multiple of vae_scale_factor image = image.resize((width, height), resample=PIL_INTERPOLATION[self.config.resample]) return image - + def preprocess( self, image: Union[torch.FloatTensor, PIL.Image.Image, np.ndarray], @@ -154,6 +169,22 @@ def preprocess( Preprocess the image input. Accepted formats are PIL images, NumPy arrays or PyTorch tensors. """ supported_formats = (PIL.Image.Image, np.ndarray, torch.Tensor) + + + height, width = self.get_default_height_width(image, height, width) + + # If the image input is a 3-dimensional pytorch tensor or numpy array that represent images in grayscale format, + # it could have 2 possible shapes: + # 1. batch x height x width: we should insert the channel dimension at position 1 + # 2. 
channnel x height x width: we should insert batch dimension at position 0, + # however, since both channel and batch dimension has same size 1, it is same to insert at position 1 + # for simplicity, we insert a dimension of size 1 at position 1 for both cases + if self.config.color_mode == "L" and isinstance(image, (torch.Tensor, np.ndarray)) and image.ndim == 3: + if isinstance(image, torch.Tensor): + image = image.unsqueeze(1) + else: + image = np.expand_dims(image, axis=1) + if isinstance(image, supported_formats): image = [image] elif not (isinstance(image, list) and all(isinstance(i, supported_formats) for i in image)): @@ -162,8 +193,8 @@ def preprocess( ) if isinstance(image[0], PIL.Image.Image): - if self.config.do_convert_rgb: - image = [self.convert_to_rgb(i) for i in image] + if self.config.color_mode is not None: + image = [self.convert_to_mode(i, self.config.color_mode) for i in image] if self.config.do_resize: image = [self.resize(i, height, width) for i in image] image = self.pil_to_numpy(image) # to np @@ -171,30 +202,37 @@ def preprocess( elif isinstance(image[0], np.ndarray): image = np.concatenate(image, axis=0) if image[0].ndim == 4 else np.stack(image, axis=0) + + if self.config.color_mode == "L" and if image.ndim == 3: + image = np.expand_dims(image, axis=1) + image = self.numpy_to_pt(image) - _, _, height, width = image.shape + if self.config.do_resize and ( - height % self.config.vae_scale_factor != 0 or width % self.config.vae_scale_factor != 0 + image.shape[2] != height or image.shape[3] != width ): raise ValueError( - f"Currently we only support resizing for PIL image - please resize your numpy array to be divisible by {self.config.vae_scale_factor}" - f"currently the sizes are {height} and {width}. You can also pass a PIL image instead to use resize option in VAEImageProcessor" + f"Currently we only support resizing for PIL image - please resize your numpy array to be {height} and {width}" + f"currently the sizes are {image.shape[2]} and {image.shape[3]}. You can also pass a PIL image instead to use resize option in VAEImageProcessor" ) elif isinstance(image[0], torch.Tensor): image = torch.cat(image, axis=0) if image[0].ndim == 4 else torch.stack(image, axis=0) - _, channel, height, width = image.shape - + + if self.config.color_mode == "L" and if image.ndim == 3: + image = image.unsqueeze(1) + + channel = image.shape[1] # don't need any preprocess if the image is latents if channel == 4: return image if self.config.do_resize and ( - height % self.config.vae_scale_factor != 0 or width % self.config.vae_scale_factor != 0 + image.shape[2] != height or image.shape[3] != width ): raise ValueError( - f"Currently we only support resizing for PIL image - please resize your pytorch tensor to be divisible by {self.config.vae_scale_factor}" - f"currently the sizes are {height} and {width}. You can also pass a PIL image instead to use resize option in VAEImageProcessor" + f"Currently we only support resizing for PIL image - please resize your torch tensor to be {height} and {width}" + f"currently the sizes are {image.shape[2]} and {image.shape[3]}. 
You can also pass a PIL image instead to use resize option in VAEImageProcessor" ) # expected range [0,1], normalize to [-1,1] From a6bffcaa7485ae8a9f56f3493ad02029c6fc9e72 Mon Sep 17 00:00:00 2001 From: yiyixuxu Date: Wed, 2 Aug 2023 23:32:13 +0000 Subject: [PATCH 02/23] deprecate the prepare_mask_and_masked_image function --- src/diffusers/image_processor.py | 42 +++++++++++++------ .../pipeline_stable_diffusion_inpaint.py | 7 +++- 2 files changed, 36 insertions(+), 13 deletions(-) diff --git a/src/diffusers/image_processor.py b/src/diffusers/image_processor.py index 44152a1dfb32..210d9d4d750b 100644 --- a/src/diffusers/image_processor.py +++ b/src/diffusers/image_processor.py @@ -51,9 +51,17 @@ def __init__( vae_scale_factor: int = 8, resample: str = "lanczos", do_normalize: bool = True, - color_mode: Optional[str] = None, # "RGB", "L" - ): + do_convert_rgb: bool = False, + do_convert_grayscale: bool = False + ): super().__init__() + if do_convert_rgb and do_convert_grayscale: + warnings.warn( + "`do_convert_rgb = True` will be ignored since `do_convert_grayscale` is also set to be `True`," + " if you intended to convert the image into RGB format, please set `do_convert_grayscale =False`." + FutureWarning, + ) + self.config.do_convert_rgb = False @staticmethod def numpy_to_pil(images: np.ndarray) -> PIL.Image.Image: @@ -117,14 +125,22 @@ def denormalize(images): return (images / 2 + 0.5).clamp(0, 1) @staticmethod - def convert_to_mode(image: PIL.Image.Image, mode: str) -> PIL.Image.Image: + def convert_to_rgb(image: PIL.Image.Image) -> PIL.Image.Image: + """ + Converts an image to RGB format. + """ + image = image.convert("RGB") + + return image + + @staticmethod + def convert_to_grayscale(image: PIL.Image.Image, mode: str) -> PIL.Image.Image: """ - Converts an image to RGB or L mode. + Converts an image to L mode. """ - image = image.convert(mode) + image = image.convert("L") + image = image.unsqueeze(0) - if mode == "L": - image = image.unsqueeze(0) return image def get_default_height_width(self, image, height, width): @@ -179,7 +195,7 @@ def preprocess( # 2. 
channnel x height x width: we should insert batch dimension at position 0, # however, since both channel and batch dimension has same size 1, it is same to insert at position 1 # for simplicity, we insert a dimension of size 1 at position 1 for both cases - if self.config.color_mode == "L" and isinstance(image, (torch.Tensor, np.ndarray)) and image.ndim == 3: + if self.config.do_convert_grayscale and isinstance(image, (torch.Tensor, np.ndarray)) and image.ndim == 3: if isinstance(image, torch.Tensor): image = image.unsqueeze(1) else: @@ -193,8 +209,10 @@ def preprocess( ) if isinstance(image[0], PIL.Image.Image): - if self.config.color_mode is not None: - image = [self.convert_to_mode(i, self.config.color_mode) for i in image] + if self.config.do_convert_rgb: + image = [self.convert_to_rgb(i) for i in image] + elif self.config.do_convert_grayscale: + image = [self.convert_to_grayscale(i) for i in image] if self.config.do_resize: image = [self.resize(i, height, width) for i in image] image = self.pil_to_numpy(image) # to np @@ -203,7 +221,7 @@ def preprocess( elif isinstance(image[0], np.ndarray): image = np.concatenate(image, axis=0) if image[0].ndim == 4 else np.stack(image, axis=0) - if self.config.color_mode == "L" and if image.ndim == 3: + if self.config.do_convert_grayscale and if image.ndim == 3: image = np.expand_dims(image, axis=1) image = self.numpy_to_pt(image) @@ -219,7 +237,7 @@ def preprocess( elif isinstance(image[0], torch.Tensor): image = torch.cat(image, axis=0) if image[0].ndim == 4 else torch.stack(image, axis=0) - if self.config.color_mode == "L" and if image.ndim == 3: + if self.config.do_convert_grayscale and if image.ndim == 3: image = image.unsqueeze(1) channel = image.shape[1] diff --git a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_inpaint.py b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_inpaint.py index c347ed56af5b..f70f217ed6fb 100644 --- a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_inpaint.py +++ b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_inpaint.py @@ -13,6 +13,7 @@ # limitations under the License. import inspect +import warnings from typing import Any, Callable, Dict, List, Optional, Union import numpy as np @@ -63,7 +64,11 @@ def prepare_mask_and_masked_image(image, mask, height, width, return_image: bool tuple[torch.Tensor]: The pair (mask, masked_image) as ``torch.Tensor`` with 4 dimensions: ``batch x channels x height x width``. """ - + warnings.warn( + "The prepare_mask_and_masked_image method is deprecated and will be removed in a future version. 
Please" + " use VaeImageProcessor.preprocess instead", + FutureWarning, + ) if image is None: raise ValueError("`image` input cannot be undefined.") From 11328e538c3280a3ae9a0d379e515dd2641349b4 Mon Sep 17 00:00:00 2001 From: yiyixuxu Date: Wed, 2 Aug 2023 23:43:53 +0000 Subject: [PATCH 03/23] refactor inpaint --- .../pipeline_stable_diffusion_inpaint.py | 15 ++++++++++++--- 1 file changed, 12 insertions(+), 3 deletions(-) diff --git a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_inpaint.py b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_inpaint.py index f70f217ed6fb..d1a61c369670 100644 --- a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_inpaint.py +++ b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_inpaint.py @@ -285,6 +285,7 @@ def __init__( ) self.vae_scale_factor = 2 ** (len(self.vae.config.block_out_channels) - 1) self.image_processor = VaeImageProcessor(vae_scale_factor=self.vae_scale_factor) + self.mask_processor = VaeImageProcessor(vae_scale_factor=self.vae_scale_factor, do_normalize=False, do_convert_grayscale=True) self.register_to_config(requires_safety_checker=requires_safety_checker) # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.enable_model_cpu_offload @@ -863,9 +864,17 @@ def __call__( is_strength_max = strength == 1.0 # 5. Preprocess mask and image - mask, masked_image, init_image = prepare_mask_and_masked_image( - image, mask_image, height, width, return_image=True - ) + + init_image = self.image_processor.preprocess(image, height=height, width=width) + init_image = init_image.to(dtype=torch.float32) + + mask = self.mask_processor.preprocess(mask, height=height, width=width) + # binarize mask + mask[mask < 0.5] = 0 + mask[mask > 0.5] = 1 + + masked_image = image * (mask < 0.5) + mask_condition = mask.clone() # 6. Prepare latent variables From 520dd473f86529e58857d3d60ef69bb7c6f1f253 Mon Sep 17 00:00:00 2001 From: yiyixuxu Date: Thu, 3 Aug 2023 00:56:28 +0000 Subject: [PATCH 04/23] fix --- src/diffusers/image_processor.py | 10 +++------- .../pipeline_stable_diffusion_inpaint.py | 4 ++-- 2 files changed, 5 insertions(+), 9 deletions(-) diff --git a/src/diffusers/image_processor.py b/src/diffusers/image_processor.py index 210d9d4d750b..16b081443d12 100644 --- a/src/diffusers/image_processor.py +++ b/src/diffusers/image_processor.py @@ -58,7 +58,7 @@ def __init__( if do_convert_rgb and do_convert_grayscale: warnings.warn( "`do_convert_rgb = True` will be ignored since `do_convert_grayscale` is also set to be `True`," - " if you intended to convert the image into RGB format, please set `do_convert_grayscale =False`." + " if you intended to convert the image into RGB format, please set `do_convert_grayscale =False`.", FutureWarning, ) self.config.do_convert_rgb = False @@ -134,12 +134,11 @@ def convert_to_rgb(image: PIL.Image.Image) -> PIL.Image.Image: return image @staticmethod - def convert_to_grayscale(image: PIL.Image.Image, mode: str) -> PIL.Image.Image: + def convert_to_grayscale(image: PIL.Image.Image) -> PIL.Image.Image: """ Converts an image to L mode. 
""" image = image.convert("L") - image = image.unsqueeze(0) return image @@ -220,9 +219,6 @@ def preprocess( elif isinstance(image[0], np.ndarray): image = np.concatenate(image, axis=0) if image[0].ndim == 4 else np.stack(image, axis=0) - - if self.config.do_convert_grayscale and if image.ndim == 3: - image = np.expand_dims(image, axis=1) image = self.numpy_to_pt(image) @@ -237,7 +233,7 @@ def preprocess( elif isinstance(image[0], torch.Tensor): image = torch.cat(image, axis=0) if image[0].ndim == 4 else torch.stack(image, axis=0) - if self.config.do_convert_grayscale and if image.ndim == 3: + if self.config.do_convert_grayscale and image.ndim == 3: image = image.unsqueeze(1) channel = image.shape[1] diff --git a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_inpaint.py b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_inpaint.py index d1a61c369670..bae0c7a3b652 100644 --- a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_inpaint.py +++ b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_inpaint.py @@ -868,12 +868,12 @@ def __call__( init_image = self.image_processor.preprocess(image, height=height, width=width) init_image = init_image.to(dtype=torch.float32) - mask = self.mask_processor.preprocess(mask, height=height, width=width) + mask = self.mask_processor.preprocess(mask_image, height=height, width=width) # binarize mask mask[mask < 0.5] = 0 mask[mask > 0.5] = 1 - masked_image = image * (mask < 0.5) + masked_image = init_image * (mask < 0.5) mask_condition = mask.clone() From 6e1d59c319f540a5bd7443f770ea288f9b29d0eb Mon Sep 17 00:00:00 2001 From: yiyixuxu Date: Thu, 3 Aug 2023 01:25:45 +0000 Subject: [PATCH 05/23] make style --- src/diffusers/image_processor.py | 34 ++++++++----------- .../pipeline_stable_diffusion_inpaint.py | 6 ++-- 2 files changed, 18 insertions(+), 22 deletions(-) diff --git a/src/diffusers/image_processor.py b/src/diffusers/image_processor.py index 16b081443d12..cd73d806fae6 100644 --- a/src/diffusers/image_processor.py +++ b/src/diffusers/image_processor.py @@ -52,8 +52,8 @@ def __init__( resample: str = "lanczos", do_normalize: bool = True, do_convert_rgb: bool = False, - do_convert_grayscale: bool = False - ): + do_convert_grayscale: bool = False, + ): super().__init__() if do_convert_rgb and do_convert_grayscale: warnings.warn( @@ -132,7 +132,7 @@ def convert_to_rgb(image: PIL.Image.Image) -> PIL.Image.Image: image = image.convert("RGB") return image - + @staticmethod def convert_to_grayscale(image: PIL.Image.Image) -> PIL.Image.Image: """ @@ -143,7 +143,6 @@ def convert_to_grayscale(image: PIL.Image.Image) -> PIL.Image.Image: return image def get_default_height_width(self, image, height, width): - if height is None: if isinstance(image, PIL.Image.Image): height = image.height @@ -158,7 +157,7 @@ def get_default_height_width(self, image, height, width): width, height = ( x - x % self.config.vae_scale_factor for x in (width, height) - ) # resize to integer multiple of vae_scale_factor + ) # resize to integer multiple of vae_scale_factor return height, width @@ -173,7 +172,7 @@ def resize( """ image = image.resize((width, height), resample=PIL_INTERPOLATION[self.config.resample]) return image - + def preprocess( self, image: Union[torch.FloatTensor, PIL.Image.Image, np.ndarray], @@ -185,21 +184,20 @@ def preprocess( """ supported_formats = (PIL.Image.Image, np.ndarray, torch.Tensor) - height, width = self.get_default_height_width(image, height, width) - - # If the image input is a 
3-dimensional pytorch tensor or numpy array that represent images in grayscale format, + + # If the image input is a 3-dimensional pytorch tensor or numpy array that represent images in grayscale format, # it could have 2 possible shapes: - # 1. batch x height x width: we should insert the channel dimension at position 1 + # 1. batch x height x width: we should insert the channel dimension at position 1 # 2. channnel x height x width: we should insert batch dimension at position 0, # however, since both channel and batch dimension has same size 1, it is same to insert at position 1 # for simplicity, we insert a dimension of size 1 at position 1 for both cases if self.config.do_convert_grayscale and isinstance(image, (torch.Tensor, np.ndarray)) and image.ndim == 3: if isinstance(image, torch.Tensor): image = image.unsqueeze(1) - else: + else: image = np.expand_dims(image, axis=1) - + if isinstance(image, supported_formats): image = [image] elif not (isinstance(image, list) and all(isinstance(i, supported_formats) for i in image)): @@ -222,9 +220,7 @@ def preprocess( image = self.numpy_to_pt(image) - if self.config.do_resize and ( - image.shape[2] != height or image.shape[3] != width - ): + if self.config.do_resize and (image.shape[2] != height or image.shape[3] != width): raise ValueError( f"Currently we only support resizing for PIL image - please resize your numpy array to be {height} and {width}" f"currently the sizes are {image.shape[2]} and {image.shape[3]}. You can also pass a PIL image instead to use resize option in VAEImageProcessor" @@ -232,18 +228,16 @@ def preprocess( elif isinstance(image[0], torch.Tensor): image = torch.cat(image, axis=0) if image[0].ndim == 4 else torch.stack(image, axis=0) - + if self.config.do_convert_grayscale and image.ndim == 3: image = image.unsqueeze(1) - + channel = image.shape[1] # don't need any preprocess if the image is latents if channel == 4: return image - if self.config.do_resize and ( - image.shape[2] != height or image.shape[3] != width - ): + if self.config.do_resize and (image.shape[2] != height or image.shape[3] != width): raise ValueError( f"Currently we only support resizing for PIL image - please resize your torch tensor to be {height} and {width}" f"currently the sizes are {image.shape[2]} and {image.shape[3]}. 
You can also pass a PIL image instead to use resize option in VAEImageProcessor" diff --git a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_inpaint.py b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_inpaint.py index bae0c7a3b652..33de1528dfd4 100644 --- a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_inpaint.py +++ b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_inpaint.py @@ -285,7 +285,9 @@ def __init__( ) self.vae_scale_factor = 2 ** (len(self.vae.config.block_out_channels) - 1) self.image_processor = VaeImageProcessor(vae_scale_factor=self.vae_scale_factor) - self.mask_processor = VaeImageProcessor(vae_scale_factor=self.vae_scale_factor, do_normalize=False, do_convert_grayscale=True) + self.mask_processor = VaeImageProcessor( + vae_scale_factor=self.vae_scale_factor, do_normalize=False, do_convert_grayscale=True + ) self.register_to_config(requires_safety_checker=requires_safety_checker) # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.enable_model_cpu_offload @@ -869,7 +871,7 @@ def __call__( init_image = init_image.to(dtype=torch.float32) mask = self.mask_processor.preprocess(mask_image, height=height, width=width) - # binarize mask + # binarize mask mask[mask < 0.5] = 0 mask[mask > 0.5] = 1 From 84f7037574098797ca71b46867e77401b3756457 Mon Sep 17 00:00:00 2001 From: yiyixuxu Date: Thu, 3 Aug 2023 02:24:07 +0000 Subject: [PATCH 06/23] fix --- src/diffusers/image_processor.py | 13 +++++++++---- 1 file changed, 9 insertions(+), 4 deletions(-) diff --git a/src/diffusers/image_processor.py b/src/diffusers/image_processor.py index cd73d806fae6..fe05c851003a 100644 --- a/src/diffusers/image_processor.py +++ b/src/diffusers/image_processor.py @@ -146,14 +146,18 @@ def get_default_height_width(self, image, height, width): if height is None: if isinstance(image, PIL.Image.Image): height = image.height - else: + elif isinstance(image, torch.Tensor): height = image.shape[2] + else: + height = image.shape[1] if width is None: if isinstance(image, PIL.Image.Image): width = image.width - else: + elif isinstance(image, torch.Tensor): width = image.shape[3] + else: + height = image.shape[2] width, height = ( x - x % self.config.vae_scale_factor for x in (width, height) @@ -184,8 +188,6 @@ def preprocess( """ supported_formats = (PIL.Image.Image, np.ndarray, torch.Tensor) - height, width = self.get_default_height_width(image, height, width) - # If the image input is a 3-dimensional pytorch tensor or numpy array that represent images in grayscale format, # it could have 2 possible shapes: # 1. 
batch x height x width: we should insert the channel dimension at position 1 @@ -211,6 +213,7 @@ def preprocess( elif self.config.do_convert_grayscale: image = [self.convert_to_grayscale(i) for i in image] if self.config.do_resize: + height, width = self.get_default_height_width(image[0], height, width) image = [self.resize(i, height, width) for i in image] image = self.pil_to_numpy(image) # to np image = self.numpy_to_pt(image) # to pt @@ -220,6 +223,7 @@ def preprocess( image = self.numpy_to_pt(image) + height, width = self.get_default_height_width(image, height, width) if self.config.do_resize and (image.shape[2] != height or image.shape[3] != width): raise ValueError( f"Currently we only support resizing for PIL image - please resize your numpy array to be {height} and {width}" @@ -237,6 +241,7 @@ def preprocess( if channel == 4: return image + height, width = self.get_default_height_width(image, height, width) if self.config.do_resize and (image.shape[2] != height or image.shape[3] != width): raise ValueError( f"Currently we only support resizing for PIL image - please resize your torch tensor to be {height} and {width}" From 4e46ea18161707711d02b15d5796ff06d9f35c86 Mon Sep 17 00:00:00 2001 From: yiyixuxu Date: Thu, 3 Aug 2023 03:11:21 +0000 Subject: [PATCH 07/23] improve docstring --- src/diffusers/image_processor.py | 48 ++++++++++++++++++++++++-------- 1 file changed, 37 insertions(+), 11 deletions(-) diff --git a/src/diffusers/image_processor.py b/src/diffusers/image_processor.py index fe05c851003a..5b0f9e689f6b 100644 --- a/src/diffusers/image_processor.py +++ b/src/diffusers/image_processor.py @@ -127,7 +127,7 @@ def denormalize(images): @staticmethod def convert_to_rgb(image: PIL.Image.Image) -> PIL.Image.Image: """ - Converts an image to RGB format. + Converts a PIL image to RGB format. """ image = image.convert("RGB") @@ -136,13 +136,33 @@ def convert_to_rgb(image: PIL.Image.Image) -> PIL.Image.Image: @staticmethod def convert_to_grayscale(image: PIL.Image.Image) -> PIL.Image.Image: """ - Converts an image to L mode. + Converts a PIL image to grayscale format. """ image = image.convert("L") return image - def get_default_height_width(self, image, height, width): + def get_default_height_width( + self, + image: [PIL.Image.Image, np.ndarray, torch.Tensor], + height: Optional[int] = None, + width: Optional[int] = None, + ): + """ + This function return the height and width that are downscaled to the next integer multiple of + `vae_scale_factor`. + + Args: + image(`PIL.Image.Image`, `np.ndarray` or `torch.Tensor`): + the image input, can be a PIL image, numpy array or pytorch tensor. if it is a numpy array, should have + shape [batch, height, width] or [batch, height, width, channel] if it is a pytorch tensor, should have + shape [batch, channel, height, width] + height (`int`, *optional*, defaults to `None`): + The height in preprocessed image. If `None`, will use the height of `image` input + width (`int`, *optional*`, defaults to `None`): + The width in preprocessed. If `None`, will use the width of the `image` input + """ + if height is None: if isinstance(image, PIL.Image.Image): height = image.height @@ -172,7 +192,7 @@ def resize( width: Optional[int] = None, ) -> PIL.Image.Image: """ - Resize a PIL image. Both height and width are downscaled to the next integer multiple of `vae_scale_factor`. + Resize a PIL image. 
""" image = image.resize((width, height), resample=PIL_INTERPOLATION[self.config.resample]) return image @@ -188,17 +208,23 @@ def preprocess( """ supported_formats = (PIL.Image.Image, np.ndarray, torch.Tensor) - # If the image input is a 3-dimensional pytorch tensor or numpy array that represent images in grayscale format, - # it could have 2 possible shapes: - # 1. batch x height x width: we should insert the channel dimension at position 1 - # 2. channnel x height x width: we should insert batch dimension at position 0, - # however, since both channel and batch dimension has same size 1, it is same to insert at position 1 - # for simplicity, we insert a dimension of size 1 at position 1 for both cases + # Expand the missing dimension for 3-dimensional pytorch tensor or numpy array that represents grayscale image if self.config.do_convert_grayscale and isinstance(image, (torch.Tensor, np.ndarray)) and image.ndim == 3: if isinstance(image, torch.Tensor): + # if image is a pytorch tensor could have 2 possible shapes: + # 1. batch x height x width: we should insert the channel dimension at position 1 + # 2. channnel x height x width: we should insert batch dimension at position 0, + # however, since both channel and batch dimension has same size 1, it is same to insert at position 1 + # for simplicity, we insert a dimension of size 1 at position 1 for both cases image = image.unsqueeze(1) else: - image = np.expand_dims(image, axis=1) + # if it is a numpy array, it could have 2 possible shapes: + # 1. batch x height x width: insert channel dimension on last position + # 2. height x width x channel: insert batch dimension on first position + if image.shape[-1] == 1: + image = np.expand_dims(image, axis=0) + else: + image = np.expand_dims(image, axis=-1) if isinstance(image, supported_formats): image = [image] From 3f5e0467cddc626cdb247871e293fe32a490cc58 Mon Sep 17 00:00:00 2001 From: yiyixuxu Date: Fri, 4 Aug 2023 06:52:37 +0000 Subject: [PATCH 08/23] add do_binarize and warning -> error --- src/diffusers/image_processor.py | 20 +++++++++++++++---- .../pipeline_stable_diffusion_inpaint.py | 5 +---- 2 files changed, 17 insertions(+), 8 deletions(-) diff --git a/src/diffusers/image_processor.py b/src/diffusers/image_processor.py index 5b0f9e689f6b..d18c538c4b00 100644 --- a/src/diffusers/image_processor.py +++ b/src/diffusers/image_processor.py @@ -51,15 +51,16 @@ def __init__( vae_scale_factor: int = 8, resample: str = "lanczos", do_normalize: bool = True, + do_binarize: bool = False, do_convert_rgb: bool = False, do_convert_grayscale: bool = False, ): super().__init__() if do_convert_rgb and do_convert_grayscale: - warnings.warn( - "`do_convert_rgb = True` will be ignored since `do_convert_grayscale` is also set to be `True`," - " if you intended to convert the image into RGB format, please set `do_convert_grayscale =False`.", - FutureWarning, + raise ValueError( + "`do_convert_rgb` and `do_convert_grayscale` can not both be set to `True`," + " if you intended to convert the image into RGB format, please set `do_convert_grayscale = False`.", + " if you intended to convert the image into grayscale format, please set `do_convert_rgb = False`", ) self.config.do_convert_rgb = False @@ -197,6 +198,14 @@ def resize( image = image.resize((width, height), resample=PIL_INTERPOLATION[self.config.resample]) return image + def binarize(self, image: PIL.Image.Image) -> PIL.Image.Image: + """ + create a mask + """ + image[image < 0.5] = 0 + image[image >= 0.5] = 1 + return image + def preprocess( self, 
image: Union[torch.FloatTensor, PIL.Image.Image, np.ndarray], @@ -287,6 +296,9 @@ def preprocess( if do_normalize: image = self.normalize(image) + if self.config.do_binarize: + image = self.binarize(image) + return image def postprocess( diff --git a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_inpaint.py b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_inpaint.py index 33de1528dfd4..cde7ea949dca 100644 --- a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_inpaint.py +++ b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_inpaint.py @@ -286,7 +286,7 @@ def __init__( self.vae_scale_factor = 2 ** (len(self.vae.config.block_out_channels) - 1) self.image_processor = VaeImageProcessor(vae_scale_factor=self.vae_scale_factor) self.mask_processor = VaeImageProcessor( - vae_scale_factor=self.vae_scale_factor, do_normalize=False, do_convert_grayscale=True + vae_scale_factor=self.vae_scale_factor, do_normalize=False, do_binarize=True, do_convert_grayscale=True ) self.register_to_config(requires_safety_checker=requires_safety_checker) @@ -871,9 +871,6 @@ def __call__( init_image = init_image.to(dtype=torch.float32) mask = self.mask_processor.preprocess(mask_image, height=height, width=width) - # binarize mask - mask[mask < 0.5] = 0 - mask[mask > 0.5] = 1 masked_image = init_image * (mask < 0.5) From ccbfcabf34b17477771980da8a4ee47792250a23 Mon Sep 17 00:00:00 2001 From: yiyixuxu Date: Fri, 4 Aug 2023 08:21:38 +0000 Subject: [PATCH 09/23] add tests --- tests/others/test_image_processor.py | 144 ++++++++++++++++++++++++++- 1 file changed, 141 insertions(+), 3 deletions(-) diff --git a/tests/others/test_image_processor.py b/tests/others/test_image_processor.py index c2cd6f4a04f4..f8d22ce9a01b 100644 --- a/tests/others/test_image_processor.py +++ b/tests/others/test_image_processor.py @@ -34,6 +34,17 @@ def dummy_sample(self): return sample + @property + def dummy_mask(self): + batch_size = 1 + num_channels = 1 + height = 8 + width = 8 + + sample = torch.rand((batch_size, num_channels, height, width)) + + return sample + def to_np(self, image): if isinstance(image[0], PIL.Image.Image): return np.stack([np.array(i) for i in image], axis=0) @@ -133,17 +144,144 @@ def test_preprocess_input_list(self): ) input_np_4d = self.to_np(self.dummy_sample) - list(input_np_4d) + input_np_list = list(input_np_4d) out_np_4d = image_processor.postprocess( - image_processor.preprocess(input_pt_4d), + image_processor.preprocess(input_np_4d), output_type="np", ) out_np_list = image_processor.postprocess( - image_processor.preprocess(input_pt_list), + image_processor.preprocess(input_np_list), output_type="np", ) assert np.abs(out_pt_4d - out_pt_list).max() < 1e-6 assert np.abs(out_np_4d - out_np_list).max() < 1e-6 + + def test_preprocess_input_mask_3d(self): + image_processor = VaeImageProcessor( + do_resize=False, do_normalize=False, do_binarize=True, do_convert_grayscale=True + ) + + input_pt_4d = self.dummy_mask + input_pt_3d = input_pt_4d.squeeze(0) + input_pt_2d = input_pt_3d.squeeze(0) + + out_pt_4d = image_processor.postprocess( + image_processor.preprocess(input_pt_4d), + output_type="np", + ) + out_pt_3d = image_processor.postprocess( + image_processor.preprocess(input_pt_3d), + output_type="np", + ) + + out_pt_2d = image_processor.postprocess( + image_processor.preprocess(input_pt_2d), + output_type="np", + ) + + input_np_4d = self.to_np(self.dummy_mask) + input_np_3d = input_np_4d.squeeze(0) + input_np_3d_1 = input_np_4d.squeeze(-1) + 
input_np_2d = input_np_3d.squeeze(-1) + + out_np_4d = image_processor.postprocess( + image_processor.preprocess(input_np_4d), + output_type="np", + ) + out_np_3d = image_processor.postprocess( + image_processor.preprocess(input_np_3d), + output_type="np", + ) + + out_np_3d_1 = image_processor.postprocess( + image_processor.preprocess(input_np_3d_1), + output_type="np", + ) + + out_np_2d = image_processor.postprocess( + image_processor.preprocess(input_np_2d), + output_type="np", + ) + + assert np.abs(out_pt_4d - out_pt_3d).max() == 0 + assert np.abs(out_pt_4d - out_pt_2d).max() == 0 + assert np.abs(out_np_4d - out_np_3d).max() == 0 + assert np.abs(out_np_4d - out_np_3d_1).max() == 0 + assert np.abs(out_np_4d - out_np_2d).max() == 0 + + def test_preprocess_input_mask_list(self): + image_processor = VaeImageProcessor(do_resize=False, do_normalize=False, do_convert_grayscale=True) + + input_pt_4d = self.dummy_mask + input_pt_3d = input_pt_4d.squeeze(0) + input_pt_2d = input_pt_3d.squeeze(0) + + inputs_pt = [input_pt_4d, input_pt_3d, input_pt_2d] + inputs_pt_list = [[input_pt] for input_pt in inputs_pt] + + for input_pt, input_pt_list in zip(inputs_pt, inputs_pt_list): + out_pt = image_processor.postprocess( + image_processor.preprocess(input_pt), + output_type="np", + ) + out_pt_list = image_processor.postprocess( + image_processor.preprocess(input_pt_list), + output_type="np", + ) + assert np.abs(out_pt - out_pt_list).max() < 1e-6 + + input_np_4d = self.to_np(self.dummy_mask) + input_np_3d = input_np_4d.squeeze(0) + input_np_2d = input_np_3d.squeeze(-1) + + inputs_np = [input_np_4d, input_np_3d, input_np_2d] + inputs_np_list = [[input_np] for input_np in inputs_np] + + for input_np, input_np_list in zip(inputs_np, inputs_np_list): + out_np = image_processor.postprocess( + image_processor.preprocess(input_np), + output_type="np", + ) + out_np_list = image_processor.postprocess( + image_processor.preprocess(input_np_list), + output_type="np", + ) + assert np.abs(out_np - out_np_list).max() < 1e-6 + + def test_preprocess_input_mask_3d_batch(self): + image_processor = VaeImageProcessor(do_resize=False, do_normalize=False, do_convert_grayscale=True) + + # create a dummy mask input with batch_size 2 + dummy_mask_batch = torch.cat([self.dummy_mask] * 2, axis=0) + + # squeeze out the channel dimension + input_pt_3d = dummy_mask_batch.squeeze(1) + input_np_3d = self.to_np(dummy_mask_batch).squeeze(-1) + + input_pt_3d_list = list(input_pt_3d) + input_np_3d_list = list(input_np_3d) + + out_pt_3d = image_processor.postprocess( + image_processor.preprocess(input_pt_3d), + output_type="np", + ) + out_pt_3d_list = image_processor.postprocess( + image_processor.preprocess(input_pt_3d_list), + output_type="np", + ) + + assert np.abs(out_pt_3d - out_pt_3d_list).max() < 1e-6 + + out_np_3d = image_processor.postprocess( + image_processor.preprocess(input_np_3d), + output_type="np", + ) + out_np_3d_list = image_processor.postprocess( + image_processor.preprocess(input_np_3d_list), + output_type="np", + ) + + assert np.abs(out_np_3d - out_np_3d_list).max() < 1e-6 From 6aa4114a9b25ee755eddb5646c954e707a86c1ae Mon Sep 17 00:00:00 2001 From: yiyixuxu Date: Fri, 4 Aug 2023 21:25:36 +0000 Subject: [PATCH 10/23] apply feedback --- src/diffusers/image_processor.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/src/diffusers/image_processor.py b/src/diffusers/image_processor.py index d18c538c4b00..30969dac599c 100644 --- a/src/diffusers/image_processor.py +++ 
b/src/diffusers/image_processor.py @@ -155,13 +155,13 @@ def get_default_height_width( Args: image(`PIL.Image.Image`, `np.ndarray` or `torch.Tensor`): - the image input, can be a PIL image, numpy array or pytorch tensor. if it is a numpy array, should have - shape [batch, height, width] or [batch, height, width, channel] if it is a pytorch tensor, should have - shape [batch, channel, height, width] + The image input, can be a PIL image, numpy array or pytorch tensor. if it is a numpy array, should have + shape `[batch, height, width]` or `[batch, height, width, channel]` if it is a pytorch tensor, should + have shape `[batch, channel, height, width]`. height (`int`, *optional*, defaults to `None`): - The height in preprocessed image. If `None`, will use the height of `image` input + The height in preprocessed image. If `None`, will use the height of `image` input. width (`int`, *optional*`, defaults to `None`): - The width in preprocessed. If `None`, will use the width of the `image` input + The width in preprocessed. If `None`, will use the width of the `image` input. """ if height is None: From 8d7c0916a95baab8f98e8de6ac1af977795767c1 Mon Sep 17 00:00:00 2001 From: yiyixuxu Date: Fri, 4 Aug 2023 21:26:36 +0000 Subject: [PATCH 11/23] fix copies --- .../pipelines/controlnet/pipeline_controlnet_inpaint.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/src/diffusers/pipelines/controlnet/pipeline_controlnet_inpaint.py b/src/diffusers/pipelines/controlnet/pipeline_controlnet_inpaint.py index 0c5e0b6d726e..854b4bf2cbcf 100644 --- a/src/diffusers/pipelines/controlnet/pipeline_controlnet_inpaint.py +++ b/src/diffusers/pipelines/controlnet/pipeline_controlnet_inpaint.py @@ -133,7 +133,11 @@ def prepare_mask_and_masked_image(image, mask, height, width, return_image=False tuple[torch.Tensor]: The pair (mask, masked_image) as ``torch.Tensor`` with 4 dimensions: ``batch x channels x height x width``. """ - + warnings.warn( + "The prepare_mask_and_masked_image method is deprecated and will be removed in a future version. Please" + " use VaeImageProcessor.preprocess instead", + FutureWarning, + ) if image is None: raise ValueError("`image` input cannot be undefined.") From 0f09e72bc81152adfe4a48f9128c92bdb1e9ce00 Mon Sep 17 00:00:00 2001 From: yiyixuxu Date: Fri, 4 Aug 2023 21:30:05 +0000 Subject: [PATCH 12/23] docstring --- src/diffusers/image_processor.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/src/diffusers/image_processor.py b/src/diffusers/image_processor.py index 30969dac599c..dfb06f50bdc4 100644 --- a/src/diffusers/image_processor.py +++ b/src/diffusers/image_processor.py @@ -38,8 +38,12 @@ class VaeImageProcessor(ConfigMixin): Resampling filter to use when resizing the image. do_normalize (`bool`, *optional*, defaults to `True`): Whether to normalize the image to [-1,1]. + do_binarize (`bool`, *optional*, defaults to `True`): + Whether to binarize the image to 0/1. do_convert_rgb (`bool`, *optional*, defaults to be `False`): Whether to convert the images to RGB format. + do_convert_grayscale (`bool`, *optional*, defaults to be `False`): + Whether to convert the images to grayscale format. 
""" config_name = CONFIG_NAME From 5a86b88a3a8d5f3a4c767e1752ed857ede580b66 Mon Sep 17 00:00:00 2001 From: yiyixuxu Date: Sat, 5 Aug 2023 03:31:37 +0000 Subject: [PATCH 13/23] refactor controlnet inpaint --- .../controlnet/pipeline_controlnet_inpaint.py | 87 ++++++++++--------- .../pipeline_stable_diffusion_img2img.py | 7 +- .../pipeline_stable_diffusion_inpaint.py | 35 ++++++-- 3 files changed, 79 insertions(+), 50 deletions(-) diff --git a/src/diffusers/pipelines/controlnet/pipeline_controlnet_inpaint.py b/src/diffusers/pipelines/controlnet/pipeline_controlnet_inpaint.py index 854b4bf2cbcf..a38327bf637c 100644 --- a/src/diffusers/pipelines/controlnet/pipeline_controlnet_inpaint.py +++ b/src/diffusers/pipelines/controlnet/pipeline_controlnet_inpaint.py @@ -320,6 +320,9 @@ def __init__( ) self.vae_scale_factor = 2 ** (len(self.vae.config.block_out_channels) - 1) self.image_processor = VaeImageProcessor(vae_scale_factor=self.vae_scale_factor) + self.mask_processor = VaeImageProcessor( + vae_scale_factor=self.vae_scale_factor, do_normalize=False, do_binarize=True, do_convert_grayscale=True + ) self.control_image_processor = VaeImageProcessor( vae_scale_factor=self.vae_scale_factor, do_convert_rgb=True, do_normalize=False ) @@ -612,7 +615,7 @@ def check_inputs( control_guidance_start=0.0, control_guidance_end=1.0, ): - if height % 8 != 0 or width % 8 != 0: + if height is not None and height % 8 != 0 or width is not None and width % 8 != 0: raise ValueError(f"`height` and `width` have to be divisible by 8 but are {height} and {width}.") if (callback_steps is None) or ( @@ -860,31 +863,6 @@ def prepare_latents( return outputs - def _default_height_width(self, height, width, image): - # NOTE: It is possible that a list of images have different - # dimensions for each image, so just checking the first image - # is not _exactly_ correct, but it is simple. - while isinstance(image, list): - image = image[0] - - if height is None: - if isinstance(image, PIL.Image.Image): - height = image.height - elif isinstance(image, torch.Tensor): - height = image.shape[2] - - height = (height // 8) * 8 # round down to nearest multiple of 8 - - if width is None: - if isinstance(image, PIL.Image.Image): - width = image.width - elif isinstance(image, torch.Tensor): - width = image.shape[3] - - width = (width // 8) * 8 # round down to nearest multiple of 8 - - return height, width - # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion_inpaint.StableDiffusionInpaintPipeline.prepare_mask_latents def prepare_mask_latents( self, mask, masked_image, batch_size, height, width, dtype, device, generator, do_classifier_free_guidance @@ -947,8 +925,22 @@ def _encode_vae_image(self, image: torch.Tensor, generator: torch.Generator): def __call__( self, prompt: Union[str, List[str]] = None, - image: Union[torch.Tensor, PIL.Image.Image] = None, - mask_image: Union[torch.Tensor, PIL.Image.Image] = None, + image: Union[ + torch.FloatTensor, + PIL.Image.Image, + np.ndarray, + List[torch.FloatTensor], + List[PIL.Image.Image], + List[np.ndarray], + ] = None, + mask_image: Union[ + torch.FloatTensor, + PIL.Image.Image, + np.ndarray, + List[torch.FloatTensor], + List[PIL.Image.Image], + List[np.ndarray], + ] = None, control_image: Union[ torch.FloatTensor, PIL.Image.Image, @@ -986,13 +978,27 @@ def __call__( prompt (`str` or `List[str]`, *optional*): The prompt or prompts to guide the image generation. If not defined, one has to pass `prompt_embeds`. instead. 
- image (`torch.FloatTensor`, `PIL.Image.Image`, `List[torch.FloatTensor]`, `List[PIL.Image.Image]`, + image (`torch.FloatTensor`, `PIL.Image.Image`, `np.ndarray`, `List[torch.FloatTensor]`, + `List[PIL.Image.Image]`, or `List[np.ndarray]`): + `Image`, numpy array or tensor representing an image batch to be inpainted (which parts of the image to be masked + out with `mask_image` and repainted according to `prompt`). + For both numpy array and pytorch tensor, the expected value range is between `[0, 1]` + If it's a tensor or a list or tensors, the expected shape should be `(B, C, H, W)` or `(C, H, W)`. + If it is a numpy array or a list of arrays, the expected shape should be `(B, H, W, C)` or `(H, W, C)` + It can also accept image latents as `image`, but if passing latents directly it is not encoded again. + mask_image (`torch.FloatTensor`, `PIL.Image.Image`, `np.ndarray`, `List[torch.FloatTensor]`, + `List[PIL.Image.Image]`, or `List[np.ndarray]`): + `Image`, numpy array or tensor representing an image batch to mask `image`. White pixels in the mask are repainted + while black pixels are preserved. If `mask_image` is a PIL image, it is converted to a single channel + (luminance) before use. If it's a numpy array or pytorch tensor, it should contain one color channel (L) instead of 3, + so the expected shape for pytorch tensor would be `(B, 1, H, W)`, `(B, H, W)`, `(1, H, W)`, `(H, W)`. And for numpy array + would be for `(B, H, W, 1)`, `(B, H, W)`, `(H, W, 1)`, or `(H, W)`. + control_image (`torch.FloatTensor`, `PIL.Image.Image`, `List[torch.FloatTensor]`, `List[PIL.Image.Image]`, `List[List[torch.FloatTensor]]`, or `List[List[PIL.Image.Image]]`): - The ControlNet input condition. ControlNet uses this input condition to generate guidance to Unet. If - the type is specified as `Torch.FloatTensor`, it is passed to ControlNet as is. `PIL.Image.Image` can - also be accepted as an image. The dimensions of the output image defaults to `image`'s dimensions. If - height and/or width are passed, `image` is resized according to them. If multiple ControlNets are - specified in init, images must be passed as a list such that each element of the list can be correctly + The ControlNet input condition. ControlNet uses this input condition to generate guidance to Unet. + The dimensions of the output image defaults to `image`'s dimensions. If height and/or width are passed, + `image` is resized according to them. If multiple ControlNets are specified in init, + images must be passed as a list such that each element of the list can be correctly batched for input to a single controlnet. height (`int`, *optional*, defaults to self.unet.config.sample_size * self.vae_scale_factor): The height in pixels of the generated image. @@ -1077,9 +1083,6 @@ def __call__( """ controlnet = self.controlnet._orig_mod if is_compiled_module(self.controlnet) else self.controlnet - # 0. Default height and width to unet - height, width = self._default_height_width(height, width, image) - # align format for control guidance if not isinstance(control_guidance_start, list) and isinstance(control_guidance_end, list): control_guidance_start = len(control_guidance_end) * [control_guidance_start] @@ -1181,9 +1184,13 @@ def __call__( assert False # 4. 
Preprocess mask and image - resizes image and mask w.r.t height and width - mask, masked_image, init_image = prepare_mask_and_masked_image( - image, mask_image, height, width, return_image=True - ) + init_image = self.image_processor.preprocess(image, height=height, width=width) + init_image = init_image.to(dtype=torch.float32) + + mask = self.mask_processor.preprocess(mask_image, height=height, width=width) + + masked_image = init_image * (mask < 0.5) + _, _, height, width = init_image.shape # 5. Prepare timesteps self.scheduler.set_timesteps(num_inference_steps, device=device) diff --git a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_img2img.py b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_img2img.py index e6c3a6ae4d69..27499b3e11fc 100644 --- a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_img2img.py +++ b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_img2img.py @@ -596,8 +596,11 @@ def __call__( prompt (`str` or `List[str]`, *optional*): The prompt or prompts to guide image generation. If not defined, you need to pass `prompt_embeds`. image (`torch.FloatTensor`, `PIL.Image.Image`, `np.ndarray`, `List[torch.FloatTensor]`, `List[PIL.Image.Image]`, or `List[np.ndarray]`): - `Image` or tensor representing an image batch to be used as the starting point. Can also accept image - latents as `image`, but if passing latents directly it is not encoded again. + `Image`, numpy array or tensor representing an image batch to be used as the starting point. + For both numpy array and pytorch tensor, the expected value range is between `[0, 1]` + If it's a tensor or a list or tensors, the expected shape should be `(B, C, H, W)` or `(C, H, W)`. + If it is a numpy array or a list of arrays, the expected shape should be `(B, H, W, C)` or `(H, W, C)` + It can also accept image latents as `image`, but if passing latents directly it is not encoded again. strength (`float`, *optional*, defaults to 0.8): Indicates extent to transform the reference `image`. Must be between 0 and 1. `image` is used as a starting point and more noise is added the higher the `strength`. The number of denoising steps depends diff --git a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_inpaint.py b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_inpaint.py index cde7ea949dca..60b81b032583 100644 --- a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_inpaint.py +++ b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_inpaint.py @@ -680,8 +680,22 @@ def get_timesteps(self, num_inference_steps, strength, device): def __call__( self, prompt: Union[str, List[str]] = None, - image: Union[torch.FloatTensor, PIL.Image.Image] = None, - mask_image: Union[torch.FloatTensor, PIL.Image.Image] = None, + image: Union[ + torch.FloatTensor, + PIL.Image.Image, + np.ndarray, + List[torch.FloatTensor], + List[PIL.Image.Image], + List[np.ndarray], + ] = None, + mask_image: Union[ + torch.FloatTensor, + PIL.Image.Image, + np.ndarray, + List[torch.FloatTensor], + List[PIL.Image.Image], + List[np.ndarray], + ] = None, height: Optional[int] = None, width: Optional[int] = None, strength: float = 1.0, @@ -706,14 +720,19 @@ def __call__( Args: prompt (`str` or `List[str]`, *optional*): The prompt or prompts to guide image generation. If not defined, you need to pass `prompt_embeds`. 
- image (`PIL.Image.Image`): - `Image` or tensor representing an image batch to be inpainted (which parts of the image to be masked + image (`torch.FloatTensor`, `PIL.Image.Image`, `np.ndarray`, `List[torch.FloatTensor]`, `List[PIL.Image.Image]`, or `List[np.ndarray]`): + `Image`, numpy array or tensor representing an image batch to be inpainted (which parts of the image to be masked out with `mask_image` and repainted according to `prompt`). - mask_image (`PIL.Image.Image`): - `Image` or tensor representing an image batch to mask `image`. White pixels in the mask are repainted + For both numpy array and pytorch tensor, the expected value range is between `[0, 1]` + If it's a tensor or a list or tensors, the expected shape should be `(B, C, H, W)` or `(C, H, W)`. + If it is a numpy array or a list of arrays, the expected shape should be `(B, H, W, C)` or `(H, W, C)` + It can also accept image latents as `image`, but if passing latents directly it is not encoded again. + mask_image (`torch.FloatTensor`, `PIL.Image.Image`, `np.ndarray`, `List[torch.FloatTensor]`, `List[PIL.Image.Image]`, or `List[np.ndarray]`): + `Image`, numpy array or tensor representing an image batch to mask `image`. White pixels in the mask are repainted while black pixels are preserved. If `mask_image` is a PIL image, it is converted to a single channel - (luminance) before use. If it's a tensor, it should contain one color channel (L) instead of 3, so the - expected shape would be `(B, H, W, 1)`. + (luminance) before use. If it's a numpy array or pytorch tensor, it should contain one color channel (L) instead of 3, + so the expected shape for pytorch tensor would be `(B, 1, H, W)`, `(B, H, W)`, `(1, H, W)`, `(H, W)`. And for numpy array + would be for `(B, H, W, 1)`, `(B, H, W)`, `(H, W, 1)`, or `(H, W)`. height (`int`, *optional*, defaults to `self.unet.config.sample_size * self.vae_scale_factor`): The height in pixels of the generated image. width (`int`, *optional*, defaults to `self.unet.config.sample_size * self.vae_scale_factor`): From 12cf87e6a243cf4fe60fe9e2324b5b958a0ebe84 Mon Sep 17 00:00:00 2001 From: yiyixuxu Date: Sat, 5 Aug 2023 03:38:14 +0000 Subject: [PATCH 14/23] style --- src/diffusers/image_processor.py | 2 +- .../controlnet/pipeline_controlnet_inpaint.py | 37 ++++++++++--------- .../pipeline_stable_diffusion_img2img.py | 10 ++--- .../pipeline_stable_diffusion_inpaint.py | 23 ++++++------ 4 files changed, 37 insertions(+), 35 deletions(-) diff --git a/src/diffusers/image_processor.py b/src/diffusers/image_processor.py index dfb06f50bdc4..dda5182e1ed7 100644 --- a/src/diffusers/image_processor.py +++ b/src/diffusers/image_processor.py @@ -289,7 +289,7 @@ def preprocess( # expected range [0,1], normalize to [-1,1] do_normalize = self.config.do_normalize - if image.min() < 0: + if image.min() < 0 and do_normalize: warnings.warn( "Passing `image` as torch tensor with value range in [-1,1] is deprecated. The expected value range for image tensor is [0,1] " f"when passing as pytorch tensor or numpy Array. 
You passed `image` with value range [{image.min()},{image.max()}]", diff --git a/src/diffusers/pipelines/controlnet/pipeline_controlnet_inpaint.py b/src/diffusers/pipelines/controlnet/pipeline_controlnet_inpaint.py index a38327bf637c..0590411a1f2d 100644 --- a/src/diffusers/pipelines/controlnet/pipeline_controlnet_inpaint.py +++ b/src/diffusers/pipelines/controlnet/pipeline_controlnet_inpaint.py @@ -978,28 +978,29 @@ def __call__( prompt (`str` or `List[str]`, *optional*): The prompt or prompts to guide the image generation. If not defined, one has to pass `prompt_embeds`. instead. - image (`torch.FloatTensor`, `PIL.Image.Image`, `np.ndarray`, `List[torch.FloatTensor]`, + image (`torch.FloatTensor`, `PIL.Image.Image`, `np.ndarray`, `List[torch.FloatTensor]`, `List[PIL.Image.Image]`, or `List[np.ndarray]`): - `Image`, numpy array or tensor representing an image batch to be inpainted (which parts of the image to be masked - out with `mask_image` and repainted according to `prompt`). - For both numpy array and pytorch tensor, the expected value range is between `[0, 1]` - If it's a tensor or a list or tensors, the expected shape should be `(B, C, H, W)` or `(C, H, W)`. - If it is a numpy array or a list of arrays, the expected shape should be `(B, H, W, C)` or `(H, W, C)` - It can also accept image latents as `image`, but if passing latents directly it is not encoded again. - mask_image (`torch.FloatTensor`, `PIL.Image.Image`, `np.ndarray`, `List[torch.FloatTensor]`, + `Image`, numpy array or tensor representing an image batch to be inpainted (which parts of the image to + be masked out with `mask_image` and repainted according to `prompt`). For both numpy array and pytorch + tensor, the expected value range is between `[0, 1]` If it's a tensor or a list or tensors, the + expected shape should be `(B, C, H, W)` or `(C, H, W)`. If it is a numpy array or a list of arrays, the + expected shape should be `(B, H, W, C)` or `(H, W, C)` It can also accept image latents as `image`, but + if passing latents directly it is not encoded again. + mask_image (`torch.FloatTensor`, `PIL.Image.Image`, `np.ndarray`, `List[torch.FloatTensor]`, `List[PIL.Image.Image]`, or `List[np.ndarray]`): - `Image`, numpy array or tensor representing an image batch to mask `image`. White pixels in the mask are repainted - while black pixels are preserved. If `mask_image` is a PIL image, it is converted to a single channel - (luminance) before use. If it's a numpy array or pytorch tensor, it should contain one color channel (L) instead of 3, - so the expected shape for pytorch tensor would be `(B, 1, H, W)`, `(B, H, W)`, `(1, H, W)`, `(H, W)`. And for numpy array - would be for `(B, H, W, 1)`, `(B, H, W)`, `(H, W, 1)`, or `(H, W)`. + `Image`, numpy array or tensor representing an image batch to mask `image`. White pixels in the mask + are repainted while black pixels are preserved. If `mask_image` is a PIL image, it is converted to a + single channel (luminance) before use. If it's a numpy array or pytorch tensor, it should contain one + color channel (L) instead of 3, so the expected shape for pytorch tensor would be `(B, 1, H, W)`, `(B, + H, W)`, `(1, H, W)`, `(H, W)`. And for numpy array would be for `(B, H, W, 1)`, `(B, H, W)`, `(H, W, + 1)`, or `(H, W)`. control_image (`torch.FloatTensor`, `PIL.Image.Image`, `List[torch.FloatTensor]`, `List[PIL.Image.Image]`, `List[List[torch.FloatTensor]]`, or `List[List[PIL.Image.Image]]`): - The ControlNet input condition. 
ControlNet uses this input condition to generate guidance to Unet. - The dimensions of the output image defaults to `image`'s dimensions. If height and/or width are passed, - `image` is resized according to them. If multiple ControlNets are specified in init, - images must be passed as a list such that each element of the list can be correctly - batched for input to a single controlnet. + The ControlNet input condition. ControlNet uses this input condition to generate guidance to Unet. The + dimensions of the output image defaults to `image`'s dimensions. If height and/or width are passed, + `image` is resized according to them. If multiple ControlNets are specified in init, images must be + passed as a list such that each element of the list can be correctly batched for input to a single + controlnet. height (`int`, *optional*, defaults to self.unet.config.sample_size * self.vae_scale_factor): The height in pixels of the generated image. width (`int`, *optional*, defaults to self.unet.config.sample_size * self.vae_scale_factor): diff --git a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_img2img.py b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_img2img.py index 27499b3e11fc..355910747637 100644 --- a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_img2img.py +++ b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_img2img.py @@ -596,11 +596,11 @@ def __call__( prompt (`str` or `List[str]`, *optional*): The prompt or prompts to guide image generation. If not defined, you need to pass `prompt_embeds`. image (`torch.FloatTensor`, `PIL.Image.Image`, `np.ndarray`, `List[torch.FloatTensor]`, `List[PIL.Image.Image]`, or `List[np.ndarray]`): - `Image`, numpy array or tensor representing an image batch to be used as the starting point. - For both numpy array and pytorch tensor, the expected value range is between `[0, 1]` - If it's a tensor or a list or tensors, the expected shape should be `(B, C, H, W)` or `(C, H, W)`. - If it is a numpy array or a list of arrays, the expected shape should be `(B, H, W, C)` or `(H, W, C)` - It can also accept image latents as `image`, but if passing latents directly it is not encoded again. + `Image`, numpy array or tensor representing an image batch to be used as the starting point. For both + numpy array and pytorch tensor, the expected value range is between `[0, 1]` If it's a tensor or a list + or tensors, the expected shape should be `(B, C, H, W)` or `(C, H, W)`. If it is a numpy array or a + list of arrays, the expected shape should be `(B, H, W, C)` or `(H, W, C)` It can also accept image + latents as `image`, but if passing latents directly it is not encoded again. strength (`float`, *optional*, defaults to 0.8): Indicates extent to transform the reference `image`. Must be between 0 and 1. `image` is used as a starting point and more noise is added the higher the `strength`. The number of denoising steps depends diff --git a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_inpaint.py b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_inpaint.py index 60b81b032583..bc3ad52e8d4b 100644 --- a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_inpaint.py +++ b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_inpaint.py @@ -721,18 +721,19 @@ def __call__( prompt (`str` or `List[str]`, *optional*): The prompt or prompts to guide image generation. If not defined, you need to pass `prompt_embeds`. 
image (`torch.FloatTensor`, `PIL.Image.Image`, `np.ndarray`, `List[torch.FloatTensor]`, `List[PIL.Image.Image]`, or `List[np.ndarray]`): - `Image`, numpy array or tensor representing an image batch to be inpainted (which parts of the image to be masked - out with `mask_image` and repainted according to `prompt`). - For both numpy array and pytorch tensor, the expected value range is between `[0, 1]` - If it's a tensor or a list or tensors, the expected shape should be `(B, C, H, W)` or `(C, H, W)`. - If it is a numpy array or a list of arrays, the expected shape should be `(B, H, W, C)` or `(H, W, C)` - It can also accept image latents as `image`, but if passing latents directly it is not encoded again. + `Image`, numpy array or tensor representing an image batch to be inpainted (which parts of the image to + be masked out with `mask_image` and repainted according to `prompt`). For both numpy array and pytorch + tensor, the expected value range is between `[0, 1]` If it's a tensor or a list or tensors, the + expected shape should be `(B, C, H, W)` or `(C, H, W)`. If it is a numpy array or a list of arrays, the + expected shape should be `(B, H, W, C)` or `(H, W, C)` It can also accept image latents as `image`, but + if passing latents directly it is not encoded again. mask_image (`torch.FloatTensor`, `PIL.Image.Image`, `np.ndarray`, `List[torch.FloatTensor]`, `List[PIL.Image.Image]`, or `List[np.ndarray]`): - `Image`, numpy array or tensor representing an image batch to mask `image`. White pixels in the mask are repainted - while black pixels are preserved. If `mask_image` is a PIL image, it is converted to a single channel - (luminance) before use. If it's a numpy array or pytorch tensor, it should contain one color channel (L) instead of 3, - so the expected shape for pytorch tensor would be `(B, 1, H, W)`, `(B, H, W)`, `(1, H, W)`, `(H, W)`. And for numpy array - would be for `(B, H, W, 1)`, `(B, H, W)`, `(H, W, 1)`, or `(H, W)`. + `Image`, numpy array or tensor representing an image batch to mask `image`. White pixels in the mask + are repainted while black pixels are preserved. If `mask_image` is a PIL image, it is converted to a + single channel (luminance) before use. If it's a numpy array or pytorch tensor, it should contain one + color channel (L) instead of 3, so the expected shape for pytorch tensor would be `(B, 1, H, W)`, `(B, + H, W)`, `(1, H, W)`, `(H, W)`. And for numpy array would be for `(B, H, W, 1)`, `(B, H, W)`, `(H, W, + 1)`, or `(H, W)`. height (`int`, *optional*, defaults to `self.unet.config.sample_size * self.vae_scale_factor`): The height in pixels of the generated image. width (`int`, *optional*, defaults to `self.unet.config.sample_size * self.vae_scale_factor`): From 05ab579676eb3039fc6b18267f523d041f87dc98 Mon Sep 17 00:00:00 2001 From: yiyixuxu Date: Sat, 5 Aug 2023 03:44:36 +0000 Subject: [PATCH 15/23] fix copies --- .../alt_diffusion/pipeline_alt_diffusion_img2img.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/src/diffusers/pipelines/alt_diffusion/pipeline_alt_diffusion_img2img.py b/src/diffusers/pipelines/alt_diffusion/pipeline_alt_diffusion_img2img.py index 2c2fa927be8c..4cff3c9c2bfc 100644 --- a/src/diffusers/pipelines/alt_diffusion/pipeline_alt_diffusion_img2img.py +++ b/src/diffusers/pipelines/alt_diffusion/pipeline_alt_diffusion_img2img.py @@ -590,7 +590,10 @@ def __call__( prompt (`str` or `List[str]`, *optional*): The prompt or prompts to guide image generation. If not defined, you need to pass `prompt_embeds`. 
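For reference, the mask conventions documented above (single channel, values in [0, 1], white pixels repainted, black pixels preserved) can be satisfied directly with a numpy array or a torch tensor. A minimal sketch, independent of any particular pipeline or checkpoint; the final call is only illustrative:

    import numpy as np
    import torch

    # numpy mask: shape (H, W) or (H, W, 1), float values in [0, 1]
    mask_np = np.zeros((512, 512), dtype=np.float32)
    mask_np[128:384, 128:384] = 1.0  # repaint the central square, preserve the rest

    # torch mask: shape (1, H, W) or (B, 1, H, W), same value range
    mask_pt = torch.from_numpy(mask_np)[None, ...]

    # either object is a valid `mask_image`, e.g.
    # pipe(prompt=..., image=init_image, mask_image=mask_np)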
image (`torch.FloatTensor`, `PIL.Image.Image`, `np.ndarray`, `List[torch.FloatTensor]`, `List[PIL.Image.Image]`, or `List[np.ndarray]`): - `Image` or tensor representing an image batch to be used as the starting point. Can also accept image + `Image`, numpy array or tensor representing an image batch to be used as the starting point. For both + numpy array and pytorch tensor, the expected value range is between `[0, 1]` If it's a tensor or a list + or tensors, the expected shape should be `(B, C, H, W)` or `(C, H, W)`. If it is a numpy array or a + list of arrays, the expected shape should be `(B, H, W, C)` or `(H, W, C)` It can also accept image latents as `image`, but if passing latents directly it is not encoded again. strength (`float`, *optional*, defaults to 0.8): Indicates extent to transform the reference `image`. Must be between 0 and 1. `image` is used as a From b813ee6c93ba9f44e3602eb40b159094cbd95cf2 Mon Sep 17 00:00:00 2001 From: yiyixuxu Date: Thu, 24 Aug 2023 01:03:17 +0000 Subject: [PATCH 16/23] add ImageInput type --- src/diffusers/image_processor.py | 10 +++++++ .../pipeline_alt_diffusion_img2img.py | 11 ++----- .../controlnet/pipeline_controlnet.py | 11 ++----- .../controlnet/pipeline_controlnet_img2img.py | 20 ++----------- .../controlnet/pipeline_controlnet_inpaint.py | 29 +++---------------- .../controlnet/pipeline_controlnet_sd_xl.py | 11 ++----- .../pipeline_cycle_diffusion.py | 11 ++----- .../pipeline_stable_diffusion_depth2img.py | 11 ++----- .../pipeline_stable_diffusion_img2img.py | 11 ++----- .../pipeline_stable_diffusion_inpaint.py | 20 ++----------- ...eline_stable_diffusion_instruct_pix2pix.py | 11 ++----- ...ipeline_stable_diffusion_latent_upscale.py | 11 ++----- .../pipeline_stable_diffusion_pix2pix_zero.py | 11 ++----- .../pipeline_stable_diffusion_upscale.py | 11 ++----- .../pipeline_stable_diffusion_xl_img2img.py | 12 ++------ 15 files changed, 42 insertions(+), 159 deletions(-) diff --git a/src/diffusers/image_processor.py b/src/diffusers/image_processor.py index dda5182e1ed7..a298e7b50ad8 100644 --- a/src/diffusers/image_processor.py +++ b/src/diffusers/image_processor.py @@ -24,6 +24,16 @@ from .utils import CONFIG_NAME, PIL_INTERPOLATION, deprecate +ImageInput = Union[ + "PIL.Image.Image", + np.ndarray, + "torch.FloatTensor", + List["PIL.Image.Image"], + List[np.ndarray], + List["torch.FloatTensor"], +] + + class VaeImageProcessor(ConfigMixin): """ Image processor for VAE. 
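The `ImageInput` alias introduced here only names the union of input types the pipeline signatures already spelled out inline; it is renamed `PipelineImageInput` in the next patch, so the sketch below uses the final name. A minimal illustration of the kinds of values the alias covers, assuming a diffusers version that includes this refactor; the `describe` helper is just for illustration:

    import numpy as np
    import PIL.Image
    import torch
    from diffusers.image_processor import PipelineImageInput

    def describe(image: PipelineImageInput) -> str:
        # accepted: a PIL image, a numpy array, a torch tensor, or a list of any of these
        if isinstance(image, list):
            return f"list of {len(image)} images"
        return type(image).__name__

    print(describe(PIL.Image.new("RGB", (64, 64))))            # PIL image
    print(describe(np.zeros((64, 64, 3), dtype=np.float32)))   # numpy, (H, W, C) in [0, 1]
    print(describe(torch.zeros(1, 3, 64, 64)))                 # torch, (B, C, H, W) in [0, 1]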
diff --git a/src/diffusers/pipelines/alt_diffusion/pipeline_alt_diffusion_img2img.py b/src/diffusers/pipelines/alt_diffusion/pipeline_alt_diffusion_img2img.py index 4cff3c9c2bfc..6d164baeef96 100644 --- a/src/diffusers/pipelines/alt_diffusion/pipeline_alt_diffusion_img2img.py +++ b/src/diffusers/pipelines/alt_diffusion/pipeline_alt_diffusion_img2img.py @@ -25,7 +25,7 @@ from diffusers.utils import is_accelerate_available, is_accelerate_version from ...configuration_utils import FrozenDict -from ...image_processor import VaeImageProcessor +from ...image_processor import ImageInput, VaeImageProcessor from ...loaders import FromSingleFileMixin, LoraLoaderMixin, TextualInversionLoaderMixin from ...models import AutoencoderKL, UNet2DConditionModel from ...schedulers import KarrasDiffusionSchedulers @@ -560,14 +560,7 @@ def prepare_latents(self, image, timestep, batch_size, num_images_per_prompt, dt def __call__( self, prompt: Union[str, List[str]] = None, - image: Union[ - torch.FloatTensor, - PIL.Image.Image, - np.ndarray, - List[torch.FloatTensor], - List[PIL.Image.Image], - List[np.ndarray], - ] = None, + image: ImageInput = None, strength: float = 0.8, num_inference_steps: Optional[int] = 50, guidance_scale: Optional[float] = 7.5, diff --git a/src/diffusers/pipelines/controlnet/pipeline_controlnet.py b/src/diffusers/pipelines/controlnet/pipeline_controlnet.py index 4320240dac63..7d11e3da6a07 100644 --- a/src/diffusers/pipelines/controlnet/pipeline_controlnet.py +++ b/src/diffusers/pipelines/controlnet/pipeline_controlnet.py @@ -23,7 +23,7 @@ import torch.nn.functional as F from transformers import CLIPImageProcessor, CLIPTextModel, CLIPTokenizer -from ...image_processor import VaeImageProcessor +from ...image_processor import ImageInput, VaeImageProcessor from ...loaders import FromSingleFileMixin, LoraLoaderMixin, TextualInversionLoaderMixin from ...models import AutoencoderKL, ControlNetModel, UNet2DConditionModel from ...schedulers import KarrasDiffusionSchedulers @@ -665,14 +665,7 @@ def prepare_latents(self, batch_size, num_channels_latents, height, width, dtype def __call__( self, prompt: Union[str, List[str]] = None, - image: Union[ - torch.FloatTensor, - PIL.Image.Image, - np.ndarray, - List[torch.FloatTensor], - List[PIL.Image.Image], - List[np.ndarray], - ] = None, + image: ImageInput = None, height: Optional[int] = None, width: Optional[int] = None, num_inference_steps: int = 50, diff --git a/src/diffusers/pipelines/controlnet/pipeline_controlnet_img2img.py b/src/diffusers/pipelines/controlnet/pipeline_controlnet_img2img.py index db57d556ad26..24d0acacac20 100644 --- a/src/diffusers/pipelines/controlnet/pipeline_controlnet_img2img.py +++ b/src/diffusers/pipelines/controlnet/pipeline_controlnet_img2img.py @@ -23,7 +23,7 @@ import torch.nn.functional as F from transformers import CLIPImageProcessor, CLIPTextModel, CLIPTokenizer -from ...image_processor import VaeImageProcessor +from ...image_processor import ImageInput, VaeImageProcessor from ...loaders import FromSingleFileMixin, LoraLoaderMixin, TextualInversionLoaderMixin from ...models import AutoencoderKL, ControlNetModel, UNet2DConditionModel from ...schedulers import KarrasDiffusionSchedulers @@ -743,22 +743,8 @@ def prepare_latents(self, image, timestep, batch_size, num_images_per_prompt, dt def __call__( self, prompt: Union[str, List[str]] = None, - image: Union[ - torch.FloatTensor, - PIL.Image.Image, - np.ndarray, - List[torch.FloatTensor], - List[PIL.Image.Image], - List[np.ndarray], - ] = None, - control_image: Union[ 
- torch.FloatTensor, - PIL.Image.Image, - np.ndarray, - List[torch.FloatTensor], - List[PIL.Image.Image], - List[np.ndarray], - ] = None, + image: ImageInput = None, + control_image: ImageInput = None, height: Optional[int] = None, width: Optional[int] = None, strength: float = 0.8, diff --git a/src/diffusers/pipelines/controlnet/pipeline_controlnet_inpaint.py b/src/diffusers/pipelines/controlnet/pipeline_controlnet_inpaint.py index 0590411a1f2d..1497d65fb3a9 100644 --- a/src/diffusers/pipelines/controlnet/pipeline_controlnet_inpaint.py +++ b/src/diffusers/pipelines/controlnet/pipeline_controlnet_inpaint.py @@ -24,7 +24,7 @@ import torch.nn.functional as F from transformers import CLIPImageProcessor, CLIPTextModel, CLIPTokenizer -from ...image_processor import VaeImageProcessor +from ...image_processor import ImageInput, VaeImageProcessor from ...loaders import FromSingleFileMixin, LoraLoaderMixin, TextualInversionLoaderMixin from ...models import AutoencoderKL, ControlNetModel, UNet2DConditionModel from ...schedulers import KarrasDiffusionSchedulers @@ -925,30 +925,9 @@ def _encode_vae_image(self, image: torch.Tensor, generator: torch.Generator): def __call__( self, prompt: Union[str, List[str]] = None, - image: Union[ - torch.FloatTensor, - PIL.Image.Image, - np.ndarray, - List[torch.FloatTensor], - List[PIL.Image.Image], - List[np.ndarray], - ] = None, - mask_image: Union[ - torch.FloatTensor, - PIL.Image.Image, - np.ndarray, - List[torch.FloatTensor], - List[PIL.Image.Image], - List[np.ndarray], - ] = None, - control_image: Union[ - torch.FloatTensor, - PIL.Image.Image, - np.ndarray, - List[torch.FloatTensor], - List[PIL.Image.Image], - List[np.ndarray], - ] = None, + image: ImageInput = None, + mask_image: ImageInput = None, + control_image: ImageInput = None, height: Optional[int] = None, width: Optional[int] = None, strength: float = 1.0, diff --git a/src/diffusers/pipelines/controlnet/pipeline_controlnet_sd_xl.py b/src/diffusers/pipelines/controlnet/pipeline_controlnet_sd_xl.py index 29d153ba0485..e07866095f20 100644 --- a/src/diffusers/pipelines/controlnet/pipeline_controlnet_sd_xl.py +++ b/src/diffusers/pipelines/controlnet/pipeline_controlnet_sd_xl.py @@ -24,7 +24,7 @@ from diffusers.utils.import_utils import is_invisible_watermark_available -from ...image_processor import VaeImageProcessor +from ...image_processor import ImageInput, VaeImageProcessor from ...loaders import LoraLoaderMixin, TextualInversionLoaderMixin from ...models import AutoencoderKL, ControlNetModel, UNet2DConditionModel from ...models.attention_processor import ( @@ -648,14 +648,7 @@ def __call__( self, prompt: Union[str, List[str]] = None, prompt_2: Optional[Union[str, List[str]]] = None, - image: Union[ - torch.FloatTensor, - PIL.Image.Image, - np.ndarray, - List[torch.FloatTensor], - List[PIL.Image.Image], - List[np.ndarray], - ] = None, + image: ImageInput = None, height: Optional[int] = None, width: Optional[int] = None, num_inference_steps: int = 50, diff --git a/src/diffusers/pipelines/stable_diffusion/pipeline_cycle_diffusion.py b/src/diffusers/pipelines/stable_diffusion/pipeline_cycle_diffusion.py index 29fd5a2df3d0..c22ab19f21fa 100644 --- a/src/diffusers/pipelines/stable_diffusion/pipeline_cycle_diffusion.py +++ b/src/diffusers/pipelines/stable_diffusion/pipeline_cycle_diffusion.py @@ -25,7 +25,7 @@ from diffusers.utils import is_accelerate_available, is_accelerate_version from ...configuration_utils import FrozenDict -from ...image_processor import VaeImageProcessor +from ...image_processor 
import ImageInput, VaeImageProcessor from ...loaders import LoraLoaderMixin, TextualInversionLoaderMixin from ...models import AutoencoderKL, UNet2DConditionModel from ...schedulers import DDIMScheduler @@ -571,14 +571,7 @@ def __call__( self, prompt: Union[str, List[str]], source_prompt: Union[str, List[str]], - image: Union[ - torch.FloatTensor, - PIL.Image.Image, - np.ndarray, - List[torch.FloatTensor], - List[PIL.Image.Image], - List[np.ndarray], - ] = None, + image: ImageInput = None, strength: float = 0.8, num_inference_steps: Optional[int] = 50, guidance_scale: Optional[float] = 7.5, diff --git a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_depth2img.py b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_depth2img.py index b976482a9165..96119971dbcc 100644 --- a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_depth2img.py +++ b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_depth2img.py @@ -24,7 +24,7 @@ from transformers import CLIPTextModel, CLIPTokenizer, DPTFeatureExtractor, DPTForDepthEstimation from ...configuration_utils import FrozenDict -from ...image_processor import VaeImageProcessor +from ...image_processor import ImageInput, VaeImageProcessor from ...loaders import LoraLoaderMixin, TextualInversionLoaderMixin from ...models import AutoencoderKL, UNet2DConditionModel from ...schedulers import KarrasDiffusionSchedulers @@ -492,14 +492,7 @@ def prepare_depth_map(self, image, depth_map, batch_size, do_classifier_free_gui def __call__( self, prompt: Union[str, List[str]] = None, - image: Union[ - torch.FloatTensor, - PIL.Image.Image, - np.ndarray, - List[torch.FloatTensor], - List[PIL.Image.Image], - List[np.ndarray], - ] = None, + image: ImageInput = None, depth_map: Optional[torch.FloatTensor] = None, strength: float = 0.8, num_inference_steps: Optional[int] = 50, diff --git a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_img2img.py b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_img2img.py index 355910747637..66eb9e728e3f 100644 --- a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_img2img.py +++ b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_img2img.py @@ -23,7 +23,7 @@ from transformers import CLIPImageProcessor, CLIPTextModel, CLIPTokenizer from ...configuration_utils import FrozenDict -from ...image_processor import VaeImageProcessor +from ...image_processor import ImageInput, VaeImageProcessor from ...loaders import FromSingleFileMixin, LoraLoaderMixin, TextualInversionLoaderMixin from ...models import AutoencoderKL, UNet2DConditionModel from ...schedulers import KarrasDiffusionSchedulers @@ -566,14 +566,7 @@ def prepare_latents(self, image, timestep, batch_size, num_images_per_prompt, dt def __call__( self, prompt: Union[str, List[str]] = None, - image: Union[ - torch.FloatTensor, - PIL.Image.Image, - np.ndarray, - List[torch.FloatTensor], - List[PIL.Image.Image], - List[np.ndarray], - ] = None, + image: ImageInput = None, strength: float = 0.8, num_inference_steps: Optional[int] = 50, guidance_scale: Optional[float] = 7.5, diff --git a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_inpaint.py b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_inpaint.py index bc3ad52e8d4b..4990396879d4 100644 --- a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_inpaint.py +++ b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_inpaint.py @@ -23,7 
+23,7 @@ from transformers import CLIPImageProcessor, CLIPTextModel, CLIPTokenizer from ...configuration_utils import FrozenDict -from ...image_processor import VaeImageProcessor +from ...image_processor import ImageInput, VaeImageProcessor from ...loaders import FromSingleFileMixin, LoraLoaderMixin, TextualInversionLoaderMixin from ...models import AsymmetricAutoencoderKL, AutoencoderKL, UNet2DConditionModel from ...schedulers import KarrasDiffusionSchedulers @@ -680,22 +680,8 @@ def get_timesteps(self, num_inference_steps, strength, device): def __call__( self, prompt: Union[str, List[str]] = None, - image: Union[ - torch.FloatTensor, - PIL.Image.Image, - np.ndarray, - List[torch.FloatTensor], - List[PIL.Image.Image], - List[np.ndarray], - ] = None, - mask_image: Union[ - torch.FloatTensor, - PIL.Image.Image, - np.ndarray, - List[torch.FloatTensor], - List[PIL.Image.Image], - List[np.ndarray], - ] = None, + image: ImageInput = None, + mask_image: ImageInput = None, height: Optional[int] = None, width: Optional[int] = None, strength: float = 1.0, diff --git a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_instruct_pix2pix.py b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_instruct_pix2pix.py index d27f8a21f369..02ebfcc54297 100644 --- a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_instruct_pix2pix.py +++ b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_instruct_pix2pix.py @@ -21,7 +21,7 @@ import torch from transformers import CLIPImageProcessor, CLIPTextModel, CLIPTokenizer -from ...image_processor import VaeImageProcessor +from ...image_processor import ImageInput, VaeImageProcessor from ...loaders import LoraLoaderMixin, TextualInversionLoaderMixin from ...models import AutoencoderKL, UNet2DConditionModel from ...schedulers import KarrasDiffusionSchedulers @@ -147,14 +147,7 @@ def __init__( def __call__( self, prompt: Union[str, List[str]] = None, - image: Union[ - torch.FloatTensor, - PIL.Image.Image, - np.ndarray, - List[torch.FloatTensor], - List[PIL.Image.Image], - List[np.ndarray], - ] = None, + image: ImageInput = None, num_inference_steps: int = 100, guidance_scale: float = 7.5, image_guidance_scale: float = 1.5, diff --git a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_latent_upscale.py b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_latent_upscale.py index cad82cb71940..fbc4a537c2e0 100644 --- a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_latent_upscale.py +++ b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_latent_upscale.py @@ -21,7 +21,7 @@ import torch.nn.functional as F from transformers import CLIPTextModel, CLIPTokenizer -from ...image_processor import VaeImageProcessor +from ...image_processor import ImageInput, VaeImageProcessor from ...models import AutoencoderKL, UNet2DConditionModel from ...schedulers import EulerDiscreteScheduler from ...utils import logging, randn_tensor @@ -257,14 +257,7 @@ def prepare_latents(self, batch_size, num_channels_latents, height, width, dtype def __call__( self, prompt: Union[str, List[str]], - image: Union[ - torch.FloatTensor, - PIL.Image.Image, - np.ndarray, - List[torch.FloatTensor], - List[PIL.Image.Image], - List[np.ndarray], - ] = None, + image: ImageInput = None, num_inference_steps: int = 75, guidance_scale: float = 9.0, negative_prompt: Optional[Union[str, List[str]]] = None, diff --git 
a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_pix2pix_zero.py b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_pix2pix_zero.py index 960c4369e45a..b3df3785d147 100644 --- a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_pix2pix_zero.py +++ b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_pix2pix_zero.py @@ -29,7 +29,7 @@ CLIPTokenizer, ) -from ...image_processor import VaeImageProcessor +from ...image_processor import ImageInput, VaeImageProcessor from ...loaders import LoraLoaderMixin, TextualInversionLoaderMixin from ...models import AutoencoderKL, UNet2DConditionModel from ...models.attention_processor import Attention @@ -1056,14 +1056,7 @@ def __call__( def invert( self, prompt: Optional[str] = None, - image: Union[ - torch.FloatTensor, - PIL.Image.Image, - np.ndarray, - List[torch.FloatTensor], - List[PIL.Image.Image], - List[np.ndarray], - ] = None, + image: ImageInput = None, num_inference_steps: int = 50, guidance_scale: float = 1, generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None, diff --git a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_upscale.py b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_upscale.py index 582bf6223d44..0542b254df52 100644 --- a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_upscale.py +++ b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_upscale.py @@ -21,7 +21,7 @@ import torch from transformers import CLIPImageProcessor, CLIPTextModel, CLIPTokenizer -from ...image_processor import VaeImageProcessor +from ...image_processor import ImageInput, VaeImageProcessor from ...loaders import LoraLoaderMixin, TextualInversionLoaderMixin from ...models import AutoencoderKL, UNet2DConditionModel from ...models.attention_processor import ( @@ -489,14 +489,7 @@ def upcast_vae(self): def __call__( self, prompt: Union[str, List[str]] = None, - image: Union[ - torch.FloatTensor, - PIL.Image.Image, - np.ndarray, - List[torch.FloatTensor], - List[PIL.Image.Image], - List[np.ndarray], - ] = None, + image: ImageInput = None, num_inference_steps: int = 75, guidance_scale: float = 9.0, noise_level: int = 20, diff --git a/src/diffusers/pipelines/stable_diffusion_xl/pipeline_stable_diffusion_xl_img2img.py b/src/diffusers/pipelines/stable_diffusion_xl/pipeline_stable_diffusion_xl_img2img.py index e69e4bc74d43..e14b26976fab 100644 --- a/src/diffusers/pipelines/stable_diffusion_xl/pipeline_stable_diffusion_xl_img2img.py +++ b/src/diffusers/pipelines/stable_diffusion_xl/pipeline_stable_diffusion_xl_img2img.py @@ -15,12 +15,11 @@ import inspect from typing import Any, Callable, Dict, List, Optional, Tuple, Union -import numpy as np import PIL.Image import torch from transformers import CLIPTextModel, CLIPTextModelWithProjection, CLIPTokenizer -from ...image_processor import VaeImageProcessor +from ...image_processor import ImageInput, VaeImageProcessor from ...loaders import FromSingleFileMixin, LoraLoaderMixin, TextualInversionLoaderMixin from ...models import AutoencoderKL, UNet2DConditionModel from ...models.attention_processor import ( @@ -656,14 +655,7 @@ def __call__( self, prompt: Union[str, List[str]] = None, prompt_2: Optional[Union[str, List[str]]] = None, - image: Union[ - torch.FloatTensor, - PIL.Image.Image, - np.ndarray, - List[torch.FloatTensor], - List[PIL.Image.Image], - List[np.ndarray], - ] = None, + image: ImageInput = None, strength: float = 0.3, num_inference_steps: int = 50, 
denoising_start: Optional[float] = None, From 8a225bce875f6f9c9574c21ef366efa455d93dfa Mon Sep 17 00:00:00 2001 From: yiyixuxu Date: Thu, 24 Aug 2023 01:12:53 +0000 Subject: [PATCH 17/23] fix --- src/diffusers/image_processor.py | 10 +++++----- .../alt_diffusion/pipeline_alt_diffusion_img2img.py | 4 ++-- .../pipelines/controlnet/pipeline_controlnet.py | 4 ++-- .../controlnet/pipeline_controlnet_img2img.py | 6 +++--- .../controlnet/pipeline_controlnet_inpaint.py | 8 ++++---- .../pipelines/controlnet/pipeline_controlnet_sd_xl.py | 4 ++-- .../stable_diffusion/pipeline_cycle_diffusion.py | 4 ++-- .../pipeline_stable_diffusion_depth2img.py | 4 ++-- .../pipeline_stable_diffusion_img2img.py | 4 ++-- .../pipeline_stable_diffusion_inpaint.py | 6 +++--- .../pipeline_stable_diffusion_instruct_pix2pix.py | 4 ++-- .../pipeline_stable_diffusion_latent_upscale.py | 4 ++-- .../pipeline_stable_diffusion_pix2pix_zero.py | 4 ++-- .../pipeline_stable_diffusion_upscale.py | 4 ++-- .../pipeline_stable_diffusion_xl_img2img.py | 4 ++-- 15 files changed, 37 insertions(+), 37 deletions(-) diff --git a/src/diffusers/image_processor.py b/src/diffusers/image_processor.py index a298e7b50ad8..097257c1a5a8 100644 --- a/src/diffusers/image_processor.py +++ b/src/diffusers/image_processor.py @@ -24,13 +24,13 @@ from .utils import CONFIG_NAME, PIL_INTERPOLATION, deprecate -ImageInput = Union[ - "PIL.Image.Image", +PipelineImageInput = Union[ + PIL.Image.Image, np.ndarray, - "torch.FloatTensor", - List["PIL.Image.Image"], + torch.FloatTensor, + List[PIL.Image.Image], List[np.ndarray], - List["torch.FloatTensor"], + List[torch.FloatTensor], ] diff --git a/src/diffusers/pipelines/alt_diffusion/pipeline_alt_diffusion_img2img.py b/src/diffusers/pipelines/alt_diffusion/pipeline_alt_diffusion_img2img.py index 6d164baeef96..0d8a34425fef 100644 --- a/src/diffusers/pipelines/alt_diffusion/pipeline_alt_diffusion_img2img.py +++ b/src/diffusers/pipelines/alt_diffusion/pipeline_alt_diffusion_img2img.py @@ -25,7 +25,7 @@ from diffusers.utils import is_accelerate_available, is_accelerate_version from ...configuration_utils import FrozenDict -from ...image_processor import ImageInput, VaeImageProcessor +from ...image_processor import PipelineImageInput, VaeImageProcessor from ...loaders import FromSingleFileMixin, LoraLoaderMixin, TextualInversionLoaderMixin from ...models import AutoencoderKL, UNet2DConditionModel from ...schedulers import KarrasDiffusionSchedulers @@ -560,7 +560,7 @@ def prepare_latents(self, image, timestep, batch_size, num_images_per_prompt, dt def __call__( self, prompt: Union[str, List[str]] = None, - image: ImageInput = None, + image: PipelineImageInput = None, strength: float = 0.8, num_inference_steps: Optional[int] = 50, guidance_scale: Optional[float] = 7.5, diff --git a/src/diffusers/pipelines/controlnet/pipeline_controlnet.py b/src/diffusers/pipelines/controlnet/pipeline_controlnet.py index 7d11e3da6a07..58ae74149134 100644 --- a/src/diffusers/pipelines/controlnet/pipeline_controlnet.py +++ b/src/diffusers/pipelines/controlnet/pipeline_controlnet.py @@ -23,7 +23,7 @@ import torch.nn.functional as F from transformers import CLIPImageProcessor, CLIPTextModel, CLIPTokenizer -from ...image_processor import ImageInput, VaeImageProcessor +from ...image_processor import PipelineImageInput, VaeImageProcessor from ...loaders import FromSingleFileMixin, LoraLoaderMixin, TextualInversionLoaderMixin from ...models import AutoencoderKL, ControlNetModel, UNet2DConditionModel from ...schedulers import 
KarrasDiffusionSchedulers @@ -665,7 +665,7 @@ def prepare_latents(self, batch_size, num_channels_latents, height, width, dtype def __call__( self, prompt: Union[str, List[str]] = None, - image: ImageInput = None, + image: PipelineImageInput = None, height: Optional[int] = None, width: Optional[int] = None, num_inference_steps: int = 50, diff --git a/src/diffusers/pipelines/controlnet/pipeline_controlnet_img2img.py b/src/diffusers/pipelines/controlnet/pipeline_controlnet_img2img.py index 24d0acacac20..4ed5d25b5bd5 100644 --- a/src/diffusers/pipelines/controlnet/pipeline_controlnet_img2img.py +++ b/src/diffusers/pipelines/controlnet/pipeline_controlnet_img2img.py @@ -23,7 +23,7 @@ import torch.nn.functional as F from transformers import CLIPImageProcessor, CLIPTextModel, CLIPTokenizer -from ...image_processor import ImageInput, VaeImageProcessor +from ...image_processor import PipelineImageInput, VaeImageProcessor from ...loaders import FromSingleFileMixin, LoraLoaderMixin, TextualInversionLoaderMixin from ...models import AutoencoderKL, ControlNetModel, UNet2DConditionModel from ...schedulers import KarrasDiffusionSchedulers @@ -743,8 +743,8 @@ def prepare_latents(self, image, timestep, batch_size, num_images_per_prompt, dt def __call__( self, prompt: Union[str, List[str]] = None, - image: ImageInput = None, - control_image: ImageInput = None, + image: PipelineImageInput = None, + control_image: PipelineImageInput = None, height: Optional[int] = None, width: Optional[int] = None, strength: float = 0.8, diff --git a/src/diffusers/pipelines/controlnet/pipeline_controlnet_inpaint.py b/src/diffusers/pipelines/controlnet/pipeline_controlnet_inpaint.py index 1497d65fb3a9..977c1bb0f80c 100644 --- a/src/diffusers/pipelines/controlnet/pipeline_controlnet_inpaint.py +++ b/src/diffusers/pipelines/controlnet/pipeline_controlnet_inpaint.py @@ -24,7 +24,7 @@ import torch.nn.functional as F from transformers import CLIPImageProcessor, CLIPTextModel, CLIPTokenizer -from ...image_processor import ImageInput, VaeImageProcessor +from ...image_processor import PipelineImageInput, VaeImageProcessor from ...loaders import FromSingleFileMixin, LoraLoaderMixin, TextualInversionLoaderMixin from ...models import AutoencoderKL, ControlNetModel, UNet2DConditionModel from ...schedulers import KarrasDiffusionSchedulers @@ -925,9 +925,9 @@ def _encode_vae_image(self, image: torch.Tensor, generator: torch.Generator): def __call__( self, prompt: Union[str, List[str]] = None, - image: ImageInput = None, - mask_image: ImageInput = None, - control_image: ImageInput = None, + image: PipelineImageInput = None, + mask_image: PipelineImageInput = None, + control_image: PipelineImageInput = None, height: Optional[int] = None, width: Optional[int] = None, strength: float = 1.0, diff --git a/src/diffusers/pipelines/controlnet/pipeline_controlnet_sd_xl.py b/src/diffusers/pipelines/controlnet/pipeline_controlnet_sd_xl.py index e07866095f20..fcc92a066e21 100644 --- a/src/diffusers/pipelines/controlnet/pipeline_controlnet_sd_xl.py +++ b/src/diffusers/pipelines/controlnet/pipeline_controlnet_sd_xl.py @@ -24,7 +24,7 @@ from diffusers.utils.import_utils import is_invisible_watermark_available -from ...image_processor import ImageInput, VaeImageProcessor +from ...image_processor import PipelineImageInput, VaeImageProcessor from ...loaders import LoraLoaderMixin, TextualInversionLoaderMixin from ...models import AutoencoderKL, ControlNetModel, UNet2DConditionModel from ...models.attention_processor import ( @@ -648,7 +648,7 @@ def __call__( 
self, prompt: Union[str, List[str]] = None, prompt_2: Optional[Union[str, List[str]]] = None, - image: ImageInput = None, + image: PipelineImageInput = None, height: Optional[int] = None, width: Optional[int] = None, num_inference_steps: int = 50, diff --git a/src/diffusers/pipelines/stable_diffusion/pipeline_cycle_diffusion.py b/src/diffusers/pipelines/stable_diffusion/pipeline_cycle_diffusion.py index c22ab19f21fa..cc64a4a019a2 100644 --- a/src/diffusers/pipelines/stable_diffusion/pipeline_cycle_diffusion.py +++ b/src/diffusers/pipelines/stable_diffusion/pipeline_cycle_diffusion.py @@ -25,7 +25,7 @@ from diffusers.utils import is_accelerate_available, is_accelerate_version from ...configuration_utils import FrozenDict -from ...image_processor import ImageInput, VaeImageProcessor +from ...image_processor import PipelineImageInput, VaeImageProcessor from ...loaders import LoraLoaderMixin, TextualInversionLoaderMixin from ...models import AutoencoderKL, UNet2DConditionModel from ...schedulers import DDIMScheduler @@ -571,7 +571,7 @@ def __call__( self, prompt: Union[str, List[str]], source_prompt: Union[str, List[str]], - image: ImageInput = None, + image: PipelineImageInput = None, strength: float = 0.8, num_inference_steps: Optional[int] = 50, guidance_scale: Optional[float] = 7.5, diff --git a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_depth2img.py b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_depth2img.py index 96119971dbcc..53dc28397933 100644 --- a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_depth2img.py +++ b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_depth2img.py @@ -24,7 +24,7 @@ from transformers import CLIPTextModel, CLIPTokenizer, DPTFeatureExtractor, DPTForDepthEstimation from ...configuration_utils import FrozenDict -from ...image_processor import ImageInput, VaeImageProcessor +from ...image_processor import PipelineImageInput, VaeImageProcessor from ...loaders import LoraLoaderMixin, TextualInversionLoaderMixin from ...models import AutoencoderKL, UNet2DConditionModel from ...schedulers import KarrasDiffusionSchedulers @@ -492,7 +492,7 @@ def prepare_depth_map(self, image, depth_map, batch_size, do_classifier_free_gui def __call__( self, prompt: Union[str, List[str]] = None, - image: ImageInput = None, + image: PipelineImageInput = None, depth_map: Optional[torch.FloatTensor] = None, strength: float = 0.8, num_inference_steps: Optional[int] = 50, diff --git a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_img2img.py b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_img2img.py index 66eb9e728e3f..f01cc0758e36 100644 --- a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_img2img.py +++ b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_img2img.py @@ -23,7 +23,7 @@ from transformers import CLIPImageProcessor, CLIPTextModel, CLIPTokenizer from ...configuration_utils import FrozenDict -from ...image_processor import ImageInput, VaeImageProcessor +from ...image_processor import PipelineImageInput, VaeImageProcessor from ...loaders import FromSingleFileMixin, LoraLoaderMixin, TextualInversionLoaderMixin from ...models import AutoencoderKL, UNet2DConditionModel from ...schedulers import KarrasDiffusionSchedulers @@ -566,7 +566,7 @@ def prepare_latents(self, image, timestep, batch_size, num_images_per_prompt, dt def __call__( self, prompt: Union[str, List[str]] = None, - image: ImageInput = None, + image: 
PipelineImageInput = None, strength: float = 0.8, num_inference_steps: Optional[int] = 50, guidance_scale: Optional[float] = 7.5, diff --git a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_inpaint.py b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_inpaint.py index 4990396879d4..c31d78ab50fb 100644 --- a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_inpaint.py +++ b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_inpaint.py @@ -23,7 +23,7 @@ from transformers import CLIPImageProcessor, CLIPTextModel, CLIPTokenizer from ...configuration_utils import FrozenDict -from ...image_processor import ImageInput, VaeImageProcessor +from ...image_processor import PipelineImageInput, VaeImageProcessor from ...loaders import FromSingleFileMixin, LoraLoaderMixin, TextualInversionLoaderMixin from ...models import AsymmetricAutoencoderKL, AutoencoderKL, UNet2DConditionModel from ...schedulers import KarrasDiffusionSchedulers @@ -680,8 +680,8 @@ def get_timesteps(self, num_inference_steps, strength, device): def __call__( self, prompt: Union[str, List[str]] = None, - image: ImageInput = None, - mask_image: ImageInput = None, + image: PipelineImageInput = None, + mask_image: PipelineImageInput = None, height: Optional[int] = None, width: Optional[int] = None, strength: float = 1.0, diff --git a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_instruct_pix2pix.py b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_instruct_pix2pix.py index 02ebfcc54297..3253c135d6e6 100644 --- a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_instruct_pix2pix.py +++ b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_instruct_pix2pix.py @@ -21,7 +21,7 @@ import torch from transformers import CLIPImageProcessor, CLIPTextModel, CLIPTokenizer -from ...image_processor import ImageInput, VaeImageProcessor +from ...image_processor import PipelineImageInput, VaeImageProcessor from ...loaders import LoraLoaderMixin, TextualInversionLoaderMixin from ...models import AutoencoderKL, UNet2DConditionModel from ...schedulers import KarrasDiffusionSchedulers @@ -147,7 +147,7 @@ def __init__( def __call__( self, prompt: Union[str, List[str]] = None, - image: ImageInput = None, + image: PipelineImageInput = None, num_inference_steps: int = 100, guidance_scale: float = 7.5, image_guidance_scale: float = 1.5, diff --git a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_latent_upscale.py b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_latent_upscale.py index fbc4a537c2e0..79501a78cdd1 100644 --- a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_latent_upscale.py +++ b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_latent_upscale.py @@ -21,7 +21,7 @@ import torch.nn.functional as F from transformers import CLIPTextModel, CLIPTokenizer -from ...image_processor import ImageInput, VaeImageProcessor +from ...image_processor import PipelineImageInput, VaeImageProcessor from ...models import AutoencoderKL, UNet2DConditionModel from ...schedulers import EulerDiscreteScheduler from ...utils import logging, randn_tensor @@ -257,7 +257,7 @@ def prepare_latents(self, batch_size, num_channels_latents, height, width, dtype def __call__( self, prompt: Union[str, List[str]], - image: ImageInput = None, + image: PipelineImageInput = None, num_inference_steps: int = 75, guidance_scale: float = 9.0, negative_prompt: Optional[Union[str, 
List[str]]] = None, diff --git a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_pix2pix_zero.py b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_pix2pix_zero.py index b3df3785d147..de42093478d8 100644 --- a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_pix2pix_zero.py +++ b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_pix2pix_zero.py @@ -29,7 +29,7 @@ CLIPTokenizer, ) -from ...image_processor import ImageInput, VaeImageProcessor +from ...image_processor import PipelineImageInput, VaeImageProcessor from ...loaders import LoraLoaderMixin, TextualInversionLoaderMixin from ...models import AutoencoderKL, UNet2DConditionModel from ...models.attention_processor import Attention @@ -1056,7 +1056,7 @@ def __call__( def invert( self, prompt: Optional[str] = None, - image: ImageInput = None, + image: PipelineImageInput = None, num_inference_steps: int = 50, guidance_scale: float = 1, generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None, diff --git a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_upscale.py b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_upscale.py index 0542b254df52..d2e61a1b7666 100644 --- a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_upscale.py +++ b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_upscale.py @@ -21,7 +21,7 @@ import torch from transformers import CLIPImageProcessor, CLIPTextModel, CLIPTokenizer -from ...image_processor import ImageInput, VaeImageProcessor +from ...image_processor import PipelineImageInput, VaeImageProcessor from ...loaders import LoraLoaderMixin, TextualInversionLoaderMixin from ...models import AutoencoderKL, UNet2DConditionModel from ...models.attention_processor import ( @@ -489,7 +489,7 @@ def upcast_vae(self): def __call__( self, prompt: Union[str, List[str]] = None, - image: ImageInput = None, + image: PipelineImageInput = None, num_inference_steps: int = 75, guidance_scale: float = 9.0, noise_level: int = 20, diff --git a/src/diffusers/pipelines/stable_diffusion_xl/pipeline_stable_diffusion_xl_img2img.py b/src/diffusers/pipelines/stable_diffusion_xl/pipeline_stable_diffusion_xl_img2img.py index e14b26976fab..e4364ee1679d 100644 --- a/src/diffusers/pipelines/stable_diffusion_xl/pipeline_stable_diffusion_xl_img2img.py +++ b/src/diffusers/pipelines/stable_diffusion_xl/pipeline_stable_diffusion_xl_img2img.py @@ -19,7 +19,7 @@ import torch from transformers import CLIPTextModel, CLIPTextModelWithProjection, CLIPTokenizer -from ...image_processor import ImageInput, VaeImageProcessor +from ...image_processor import PipelineImageInput, VaeImageProcessor from ...loaders import FromSingleFileMixin, LoraLoaderMixin, TextualInversionLoaderMixin from ...models import AutoencoderKL, UNet2DConditionModel from ...models.attention_processor import ( @@ -655,7 +655,7 @@ def __call__( self, prompt: Union[str, List[str]] = None, prompt_2: Optional[Union[str, List[str]]] = None, - image: ImageInput = None, + image: PipelineImageInput = None, strength: float = 0.3, num_inference_steps: int = 50, denoising_start: Optional[float] = None, From 7c920aca6cc3503312eed030d3548b7d839c5fc7 Mon Sep 17 00:00:00 2001 From: yiyixuxu Date: Thu, 24 Aug 2023 01:38:48 +0000 Subject: [PATCH 18/23] warning -> deprecate --- .../controlnet/pipeline_controlnet_inpaint.py | 12 ++++++++---- .../pipeline_stable_diffusion_inpaint.py | 12 +++++++----- 2 files changed, 15 insertions(+), 9 deletions(-) diff 
--git a/src/diffusers/pipelines/controlnet/pipeline_controlnet_inpaint.py b/src/diffusers/pipelines/controlnet/pipeline_controlnet_inpaint.py index 977c1bb0f80c..fdcf4e2e37db 100644 --- a/src/diffusers/pipelines/controlnet/pipeline_controlnet_inpaint.py +++ b/src/diffusers/pipelines/controlnet/pipeline_controlnet_inpaint.py @@ -29,6 +29,7 @@ from ...models import AutoencoderKL, ControlNetModel, UNet2DConditionModel from ...schedulers import KarrasDiffusionSchedulers from ...utils import ( + deprecate, is_accelerate_available, is_accelerate_version, is_compiled_module, @@ -133,10 +134,13 @@ def prepare_mask_and_masked_image(image, mask, height, width, return_image=False tuple[torch.Tensor]: The pair (mask, masked_image) as ``torch.Tensor`` with 4 dimensions: ``batch x channels x height x width``. """ - warnings.warn( - "The prepare_mask_and_masked_image method is deprecated and will be removed in a future version. Please" - " use VaeImageProcessor.preprocess instead", - FutureWarning, + deprecate( + "prepare_mask_and_masked_image", + "0.21.0", + message=( + "The prepare_mask_and_masked_image method is deprecated and will be removed in a future version. Please" + " use VaeImageProcessor.preprocess instead", + ), ) if image is None: raise ValueError("`image` input cannot be undefined.") diff --git a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_inpaint.py b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_inpaint.py index c31d78ab50fb..034f3fd3d972 100644 --- a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_inpaint.py +++ b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_inpaint.py @@ -13,7 +13,6 @@ # limitations under the License. import inspect -import warnings from typing import Any, Callable, Dict, List, Optional, Union import numpy as np @@ -64,10 +63,13 @@ def prepare_mask_and_masked_image(image, mask, height, width, return_image: bool tuple[torch.Tensor]: The pair (mask, masked_image) as ``torch.Tensor`` with 4 dimensions: ``batch x channels x height x width``. """ - warnings.warn( - "The prepare_mask_and_masked_image method is deprecated and will be removed in a future version. Please" - " use VaeImageProcessor.preprocess instead", - FutureWarning, + deprecate( + "prepare_mask_and_masked_image", + "0.21.0", + message=( + "The prepare_mask_and_masked_image method is deprecated and will be removed in a future version. Please" + " use VaeImageProcessor.preprocess instead", + ), ) if image is None: raise ValueError("`image` input cannot be undefined.") From 317b13012f30eab66a1eb9e6112df424802f5d2c Mon Sep 17 00:00:00 2001 From: yiyixuxu Date: Thu, 24 Aug 2023 02:03:45 +0000 Subject: [PATCH 19/23] fix --- .../pipelines/controlnet/pipeline_controlnet_inpaint.py | 6 ++---- .../stable_diffusion/pipeline_stable_diffusion_inpaint.py | 6 ++---- 2 files changed, 4 insertions(+), 8 deletions(-) diff --git a/src/diffusers/pipelines/controlnet/pipeline_controlnet_inpaint.py b/src/diffusers/pipelines/controlnet/pipeline_controlnet_inpaint.py index fdcf4e2e37db..6d01c3d22e26 100644 --- a/src/diffusers/pipelines/controlnet/pipeline_controlnet_inpaint.py +++ b/src/diffusers/pipelines/controlnet/pipeline_controlnet_inpaint.py @@ -134,13 +134,11 @@ def prepare_mask_and_masked_image(image, mask, height, width, return_image=False tuple[torch.Tensor]: The pair (mask, masked_image) as ``torch.Tensor`` with 4 dimensions: ``batch x channels x height x width``. 
""" + deprecation_message = "The prepare_mask_and_masked_image method is deprecated and will be removed in a future version. Please use VaeImageProcessor.preprocess instead" deprecate( "prepare_mask_and_masked_image", "0.21.0", - message=( - "The prepare_mask_and_masked_image method is deprecated and will be removed in a future version. Please" - " use VaeImageProcessor.preprocess instead", - ), + deprecation_message, ) if image is None: raise ValueError("`image` input cannot be undefined.") diff --git a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_inpaint.py b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_inpaint.py index 034f3fd3d972..0d5f7012e981 100644 --- a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_inpaint.py +++ b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_inpaint.py @@ -63,13 +63,11 @@ def prepare_mask_and_masked_image(image, mask, height, width, return_image: bool tuple[torch.Tensor]: The pair (mask, masked_image) as ``torch.Tensor`` with 4 dimensions: ``batch x channels x height x width``. """ + deprecation_message = "The prepare_mask_and_masked_image method is deprecated and will be removed in a future version. Please use VaeImageProcessor.preprocess instead" deprecate( "prepare_mask_and_masked_image", "0.21.0", - message=( - "The prepare_mask_and_masked_image method is deprecated and will be removed in a future version. Please" - " use VaeImageProcessor.preprocess instead", - ), + deprecation_message, ) if image is None: raise ValueError("`image` input cannot be undefined.") From 280709f2bd1ec4e65b72cc48d378859d9f5d98ab Mon Sep 17 00:00:00 2001 From: yiyixuxu Date: Thu, 24 Aug 2023 02:33:10 +0000 Subject: [PATCH 20/23] refator sdxl-inpaint --- .../pipeline_stable_diffusion_xl_inpaint.py | 24 ++++++++++++++----- 1 file changed, 18 insertions(+), 6 deletions(-) diff --git a/src/diffusers/pipelines/stable_diffusion_xl/pipeline_stable_diffusion_xl_inpaint.py b/src/diffusers/pipelines/stable_diffusion_xl/pipeline_stable_diffusion_xl_inpaint.py index 8b96b558ec7c..112dec518421 100644 --- a/src/diffusers/pipelines/stable_diffusion_xl/pipeline_stable_diffusion_xl_inpaint.py +++ b/src/diffusers/pipelines/stable_diffusion_xl/pipeline_stable_diffusion_xl_inpaint.py @@ -20,7 +20,7 @@ import torch from transformers import CLIPTextModel, CLIPTextModelWithProjection, CLIPTokenizer -from ...image_processor import VaeImageProcessor +from ...image_processor import PipelineImageInput, VaeImageProcessor from ...loaders import FromSingleFileMixin, LoraLoaderMixin, TextualInversionLoaderMixin from ...models import AutoencoderKL, UNet2DConditionModel from ...models.attention_processor import ( @@ -139,6 +139,12 @@ def prepare_mask_and_masked_image(image, mask, height, width, return_image: bool """ # checkpoint. TOD(Yiyi) - need to clean this up later + deprecation_message = "The prepare_mask_and_masked_image method is deprecated and will be removed in a future version. 
Please use VaeImageProcessor.preprocess instead" + deprecate( + "prepare_mask_and_masked_image", + "0.21.0", + deprecation_message, + ) if image is None: raise ValueError("`image` input cannot be undefined.") @@ -292,6 +298,9 @@ def __init__( self.register_to_config(requires_aesthetics_score=requires_aesthetics_score) self.vae_scale_factor = 2 ** (len(self.vae.config.block_out_channels) - 1) self.image_processor = VaeImageProcessor(vae_scale_factor=self.vae_scale_factor) + self.mask_processor = VaeImageProcessor( + vae_scale_factor=self.vae_scale_factor, do_normalize=False, do_binarize=True, do_convert_grayscale=True + ) add_watermarker = add_watermarker if add_watermarker is not None else is_invisible_watermark_available() @@ -859,8 +868,8 @@ def __call__( self, prompt: Union[str, List[str]] = None, prompt_2: Optional[Union[str, List[str]]] = None, - image: Union[torch.FloatTensor, PIL.Image.Image] = None, - mask_image: Union[torch.FloatTensor, PIL.Image.Image] = None, + image: PipelineImageInput = None, + mask_image: PipelineImageInput = None, height: Optional[int] = None, width: Optional[int] = None, strength: float = 1.0, @@ -1100,9 +1109,12 @@ def denoising_value_valid(dnv): is_strength_max = strength == 1.0 # 5. Preprocess mask and image - mask, masked_image, init_image = prepare_mask_and_masked_image( - image, mask_image, height, width, return_image=True - ) + init_image = self.image_processor.preprocess(image, height=height, width=width) + init_image = init_image.to(dtype=torch.float32) + + mask = self.mask_processor.preprocess(mask_image, height=height, width=width) + + masked_image = init_image * (mask < 0.5) # 6. Prepare latent variables num_channels_latents = self.vae.config.latent_channels From 51442dcec460a4145838d518c7245f36c6c272a0 Mon Sep 17 00:00:00 2001 From: yiyixuxu Date: Thu, 24 Aug 2023 03:16:14 +0000 Subject: [PATCH 21/23] fix image latent --- .../pipeline_stable_diffusion_xl_inpaint.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/src/diffusers/pipelines/stable_diffusion_xl/pipeline_stable_diffusion_xl_inpaint.py b/src/diffusers/pipelines/stable_diffusion_xl/pipeline_stable_diffusion_xl_inpaint.py index 112dec518421..78f75dcffac8 100644 --- a/src/diffusers/pipelines/stable_diffusion_xl/pipeline_stable_diffusion_xl_inpaint.py +++ b/src/diffusers/pipelines/stable_diffusion_xl/pipeline_stable_diffusion_xl_inpaint.py @@ -1114,7 +1114,11 @@ def denoising_value_valid(dnv): mask = self.mask_processor.preprocess(mask_image, height=height, width=width) - masked_image = init_image * (mask < 0.5) + if init_image.shape[1] ==4: + # if images are in latent space, we can't mask it + masked_image = None + else: + masked_image = init_image * (mask < 0.5) # 6. 
Prepare latent variables num_channels_latents = self.vae.config.latent_channels From c24c2ad24ae9391139c35667316a0f1d753fa7a7 Mon Sep 17 00:00:00 2001 From: yiyixuxu Date: Thu, 24 Aug 2023 03:18:01 +0000 Subject: [PATCH 22/23] style --- .../pipeline_stable_diffusion_xl_inpaint.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/src/diffusers/pipelines/stable_diffusion_xl/pipeline_stable_diffusion_xl_inpaint.py b/src/diffusers/pipelines/stable_diffusion_xl/pipeline_stable_diffusion_xl_inpaint.py index 78f75dcffac8..42464d79427e 100644 --- a/src/diffusers/pipelines/stable_diffusion_xl/pipeline_stable_diffusion_xl_inpaint.py +++ b/src/diffusers/pipelines/stable_diffusion_xl/pipeline_stable_diffusion_xl_inpaint.py @@ -31,6 +31,7 @@ ) from ...schedulers import KarrasDiffusionSchedulers from ...utils import ( + deprecate, is_accelerate_available, is_accelerate_version, is_invisible_watermark_available, @@ -1114,7 +1115,7 @@ def denoising_value_valid(dnv): mask = self.mask_processor.preprocess(mask_image, height=height, width=width) - if init_image.shape[1] ==4: + if init_image.shape[1] == 4: # if images are in latent space, we can't mask it masked_image = None else: From c1719124e644bd1542ba322b8fb2fa7a2881f459 Mon Sep 17 00:00:00 2001 From: yiyixuxu Date: Thu, 24 Aug 2023 03:41:30 +0000 Subject: [PATCH 23/23] fix --- .../controlnet/pipeline_controlnet_inpaint.py | 2 +- .../pipeline_stable_diffusion_inpaint.py | 2 +- .../pipeline_stable_diffusion_xl_inpaint.py | 2 +- .../pipeline_stable_diffusion_xl_instruct_pix2pix.py | 12 ++---------- 4 files changed, 5 insertions(+), 13 deletions(-) diff --git a/src/diffusers/pipelines/controlnet/pipeline_controlnet_inpaint.py b/src/diffusers/pipelines/controlnet/pipeline_controlnet_inpaint.py index 6d01c3d22e26..145ff9714c78 100644 --- a/src/diffusers/pipelines/controlnet/pipeline_controlnet_inpaint.py +++ b/src/diffusers/pipelines/controlnet/pipeline_controlnet_inpaint.py @@ -137,7 +137,7 @@ def prepare_mask_and_masked_image(image, mask, height, width, return_image=False deprecation_message = "The prepare_mask_and_masked_image method is deprecated and will be removed in a future version. Please use VaeImageProcessor.preprocess instead" deprecate( "prepare_mask_and_masked_image", - "0.21.0", + "0.30.0", deprecation_message, ) if image is None: diff --git a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_inpaint.py b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_inpaint.py index 0d5f7012e981..67c232b143c6 100644 --- a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_inpaint.py +++ b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_inpaint.py @@ -66,7 +66,7 @@ def prepare_mask_and_masked_image(image, mask, height, width, return_image: bool deprecation_message = "The prepare_mask_and_masked_image method is deprecated and will be removed in a future version. 
Please use VaeImageProcessor.preprocess instead" deprecate( "prepare_mask_and_masked_image", - "0.21.0", + "0.30.0", deprecation_message, ) if image is None: diff --git a/src/diffusers/pipelines/stable_diffusion_xl/pipeline_stable_diffusion_xl_inpaint.py b/src/diffusers/pipelines/stable_diffusion_xl/pipeline_stable_diffusion_xl_inpaint.py index 42464d79427e..c08f3f1d5993 100644 --- a/src/diffusers/pipelines/stable_diffusion_xl/pipeline_stable_diffusion_xl_inpaint.py +++ b/src/diffusers/pipelines/stable_diffusion_xl/pipeline_stable_diffusion_xl_inpaint.py @@ -143,7 +143,7 @@ def prepare_mask_and_masked_image(image, mask, height, width, return_image: bool deprecation_message = "The prepare_mask_and_masked_image method is deprecated and will be removed in a future version. Please use VaeImageProcessor.preprocess instead" deprecate( "prepare_mask_and_masked_image", - "0.21.0", + "0.30.0", deprecation_message, ) if image is None: diff --git a/src/diffusers/pipelines/stable_diffusion_xl/pipeline_stable_diffusion_xl_instruct_pix2pix.py b/src/diffusers/pipelines/stable_diffusion_xl/pipeline_stable_diffusion_xl_instruct_pix2pix.py index eec5f840277a..3b9bf1a17671 100644 --- a/src/diffusers/pipelines/stable_diffusion_xl/pipeline_stable_diffusion_xl_instruct_pix2pix.py +++ b/src/diffusers/pipelines/stable_diffusion_xl/pipeline_stable_diffusion_xl_instruct_pix2pix.py @@ -15,12 +15,11 @@ import inspect from typing import Any, Callable, Dict, List, Optional, Tuple, Union -import numpy as np import PIL.Image import torch from transformers import CLIPTextModel, CLIPTextModelWithProjection, CLIPTokenizer -from ...image_processor import VaeImageProcessor +from ...image_processor import PipelineImageInput, VaeImageProcessor from ...loaders import FromSingleFileMixin, LoraLoaderMixin, TextualInversionLoaderMixin from ...models import AutoencoderKL, UNet2DConditionModel from ...models.attention_processor import ( @@ -588,14 +587,7 @@ def upcast_vae(self): def __call__( self, prompt: Union[str, List[str]] = None, - image: Union[ - torch.FloatTensor, - PIL.Image.Image, - np.ndarray, - List[torch.FloatTensor], - List[PIL.Image.Image], - List[np.ndarray], - ] = None, + image: PipelineImageInput = None, num_inference_steps: int = 100, guidance_scale: float = 7.5, image_guidance_scale: float = 1.5,
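To make the SDXL inpaint refactor from earlier in this series concrete: once the new `mask_processor` (configured with do_normalize=False, do_binarize=True, do_convert_grayscale=True) has produced a single-channel binary mask, the pipeline builds `masked_image` by zeroing out the region to repaint, and skips masking entirely when the input is already 4-channel latents. A simplified standalone restatement of that logic with dummy tensors and a throwaway helper, not the pipeline code itself:

    import torch

    def prepare_masked_image(init_image: torch.Tensor, mask: torch.Tensor):
        # init_image: (B, C, H, W) pixels in [-1, 1]; mask: (B, 1, H, W), 1 = repaint, 0 = preserve
        if init_image.shape[1] == 4:
            # the caller passed VAE latents, so there is nothing to mask in pixel space
            return None
        # keep preserved pixels, zero out the region that will be repainted
        return init_image * (mask < 0.5)

    image = torch.rand(1, 3, 64, 64) * 2 - 1           # normalized image
    mask = (torch.rand(1, 1, 64, 64) > 0.5).float()    # binarized mask
    masked_image = prepare_masked_image(image, mask)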