From 7ca87594e23effbf34447ccc3139ad2b98850476 Mon Sep 17 00:00:00 2001 From: yiyixuxu Date: Sun, 5 May 2024 12:00:55 +0200 Subject: [PATCH 1/7] update --- src/diffusers/image_processor.py | 69 +++++++++++++++------ src/diffusers/video_processor.py | 91 ++++++---------------------- tests/others/test_video_processor.py | 4 +- 3 files changed, 71 insertions(+), 93 deletions(-) diff --git a/src/diffusers/image_processor.py b/src/diffusers/image_processor.py index 4ccb9d77d627..660368e98c38 100644 --- a/src/diffusers/image_processor.py +++ b/src/diffusers/image_processor.py @@ -29,15 +29,31 @@ PipelineImageInput = Union[ PIL.Image.Image, np.ndarray, - torch.FloatTensor, + torch.Tensor, List[PIL.Image.Image], List[np.ndarray], - List[torch.FloatTensor], + List[torch.Tensor], ] PipelineDepthInput = PipelineImageInput +def is_valid_image(image): + return isinstance(image, PIL.Image.Image) or isinstance(image, (np.ndarray, torch.Tensor)) and image.ndim in (2, 3) + + +def is_valid_image_input(images): + # check if the image input is one of the supported formats: + # it can be either a 4d pytorch tensor or numpy array, a valid image or a list of valid image + if isinstance(images, (np.ndarray, torch.Tensor)) and images.ndim == 4: + return True + elif is_valid_image(images): + return True + elif isinstance(images, list): + return all(is_valid_image(image) for image in images) + return False + + class VaeImageProcessor(ConfigMixin): """ Image processor for VAE. @@ -111,7 +127,7 @@ def pil_to_numpy(images: Union[List[PIL.Image.Image], PIL.Image.Image]) -> np.nd return images @staticmethod - def numpy_to_pt(images: np.ndarray) -> torch.FloatTensor: + def numpy_to_pt(images: np.ndarray) -> torch.Tensor: """ Convert a NumPy image to a PyTorch tensor. """ @@ -122,7 +138,7 @@ def numpy_to_pt(images: np.ndarray) -> torch.FloatTensor: return images @staticmethod - def pt_to_numpy(images: torch.FloatTensor) -> np.ndarray: + def pt_to_numpy(images: torch.Tensor) -> np.ndarray: """ Convert a PyTorch tensor to a NumPy image. """ @@ -498,9 +514,24 @@ def preprocess( else: image = np.expand_dims(image, axis=-1) - if isinstance(image, supported_formats): + if isinstance(image, list) and isinstance(image[0], np.ndarray) and image[0].ndim == 4: + warnings.warn( + "Passing `image` as list of 4-dimensional numpy array is deprecated." + "The expected numpy array input format for multiple images are either a single 4-d array or a list of 3-d arrays." + ) + image = np.concatenate(image, axis=0) + if isinstance(image, list) and isinstance(image[0], torch.Tensor) and image[0].ndim == 4: + warnings.warn( + "Passing `image` as list of 4-dimensional pytorch tensor is deprecated." + "The expected pytorch tensor input format for multiple images are either a single 4-d tensor or a list of 3-d tensors." + ) + image = torch.cat(image, axis=0) + + if isinstance(image, (np.ndarray, torch.Tensor)) and image.ndim == 4: + image = list(image) + if is_valid_image(image): image = [image] - elif not (isinstance(image, list) and all(isinstance(i, supported_formats) for i in image)): + if not all(is_valid_image(img) for img in image): raise ValueError( f"Input is in incorrect format: {[type(i) for i in image]}. 
Currently, we only support {', '.join(supported_formats)}" ) @@ -562,15 +593,15 @@ def preprocess( def postprocess( self, - image: torch.FloatTensor, + image: torch.Tensor, output_type: str = "pil", do_denormalize: Optional[List[bool]] = None, - ) -> Union[PIL.Image.Image, np.ndarray, torch.FloatTensor]: + ) -> Union[PIL.Image.Image, np.ndarray, torch.Tensor]: """ Postprocess the image output from tensor to `output_type`. Args: - image (`torch.FloatTensor`): + image (`torch.Tensor`): The image input, should be a pytorch tensor with shape `B x C x H x W`. output_type (`str`, *optional*, defaults to `pil`): The output type of the image, can be one of `pil`, `np`, `pt`, `latent`. @@ -579,7 +610,7 @@ def postprocess( `VaeImageProcessor` config. Returns: - `PIL.Image.Image`, `np.ndarray` or `torch.FloatTensor`: + `PIL.Image.Image`, `np.ndarray` or `torch.Tensor`: The postprocessed image. """ if not isinstance(image, torch.Tensor): @@ -739,15 +770,15 @@ def numpy_to_depth(self, images: np.ndarray) -> List[PIL.Image.Image]: def postprocess( self, - image: torch.FloatTensor, + image: torch.Tensor, output_type: str = "pil", do_denormalize: Optional[List[bool]] = None, - ) -> Union[PIL.Image.Image, np.ndarray, torch.FloatTensor]: + ) -> Union[PIL.Image.Image, np.ndarray, torch.Tensor]: """ Postprocess the image output from tensor to `output_type`. Args: - image (`torch.FloatTensor`): + image (`torch.Tensor`): The image input, should be a pytorch tensor with shape `B x C x H x W`. output_type (`str`, *optional*, defaults to `pil`): The output type of the image, can be one of `pil`, `np`, `pt`, `latent`. @@ -756,7 +787,7 @@ def postprocess( `VaeImageProcessor` config. Returns: - `PIL.Image.Image`, `np.ndarray` or `torch.FloatTensor`: + `PIL.Image.Image`, `np.ndarray` or `torch.Tensor`: The postprocessed image. """ if not isinstance(image, torch.Tensor): @@ -794,8 +825,8 @@ def postprocess( def preprocess( self, - rgb: Union[torch.FloatTensor, PIL.Image.Image, np.ndarray], - depth: Union[torch.FloatTensor, PIL.Image.Image, np.ndarray], + rgb: Union[torch.Tensor, PIL.Image.Image, np.ndarray], + depth: Union[torch.Tensor, PIL.Image.Image, np.ndarray], height: Optional[int] = None, width: Optional[int] = None, target_res: Optional[int] = None, @@ -934,13 +965,13 @@ def __init__( ) @staticmethod - def downsample(mask: torch.FloatTensor, batch_size: int, num_queries: int, value_embed_dim: int): + def downsample(mask: torch.Tensor, batch_size: int, num_queries: int, value_embed_dim: int): """ Downsamples the provided mask tensor to match the expected dimensions for scaled dot-product attention. If the aspect ratio of the mask does not match the aspect ratio of the output image, a warning is issued. Args: - mask (`torch.FloatTensor`): + mask (`torch.Tensor`): The input mask tensor generated with `IPAdapterMaskProcessor.preprocess()`. batch_size (`int`): The batch size. @@ -950,7 +981,7 @@ def downsample(mask: torch.FloatTensor, batch_size: int, num_queries: int, value The dimensionality of the value embeddings. Returns: - `torch.FloatTensor`: + `torch.Tensor`: The downsampled mask tensor. 
""" diff --git a/src/diffusers/video_processor.py b/src/diffusers/video_processor.py index ece646eccf6d..f375a9d0ebb2 100644 --- a/src/diffusers/video_processor.py +++ b/src/diffusers/video_processor.py @@ -18,20 +18,20 @@ import PIL import torch -from .image_processor import VaeImageProcessor +from .image_processor import VaeImageProcessor, is_valid_image, is_valid_image_input class VideoProcessor(VaeImageProcessor): r"""Simple video processor.""" def tensor2vid( - self, video: torch.FloatTensor, output_type: str = "np" - ) -> Union[np.ndarray, torch.FloatTensor, List[PIL.Image.Image]]: + self, video: torch.Tensor, output_type: str = "np" + ) -> Union[np.ndarray, torch.Tensor, List[PIL.Image.Image]]: r""" Converts a video tensor to a list of frames for export. Args: - video (`torch.FloatTensor`): The video as a tensor. + video (`torch.Tensor`): The video as a tensor. output_type (`str`, defaults to `"np"`): Output type of the postprocessed `video` tensor. """ batch_size = video.shape[0] @@ -50,7 +50,7 @@ def tensor2vid( return outputs - def preprocess_video(self, video) -> torch.FloatTensor: + def preprocess_video(self, video) -> torch.Tensor: r""" Preprocesses input video(s). @@ -58,86 +58,33 @@ def preprocess_video(self, video) -> torch.FloatTensor: video: The input video. It can be one of the following: * List of the PIL images. * List of list of PIL images. + * 4D Torch tensors (expected shape for each tensor: (num_frames, num_channels, height, width)). + * 4D NumPy arrays (expected shape for each array: (num_frames, height, width, num_channels)). * List of 4D Torch tensors (expected shape for each tensor: (num_frames, num_channels, height, width)). - * List of list of 4D Torch tensors (expected shape for tensor: (num_frames, num_channels, height, - width)). * List of 4D NumPy arrays (expected shape for each array: (num_frames, height, width, num_channels)). - * List of list of 4D NumPy arrays (expected shape for each array: (num_frames, height, width, - num_channels)). - * List of 5D NumPy arrays (expected shape for each array: (batch_size, num_frames, height, width, - num_channels). - * List of 5D Torch tensors (expected shape for each array: (batch_size, num_frames, num_channels, - height, width). * 5D NumPy arrays: expected shape for each array: (batch_size, num_frames, height, width, num_channels). * 5D Torch tensors: expected shape for each array: (batch_size, num_frames, num_channels, height, width). """ - supported_formats = (np.ndarray, torch.Tensor, PIL.Image.Image, list) - # Single-frame video. - if isinstance(video, supported_formats[:-1]): + # make sure video is either a list of 4-d array or a list of list images + if isinstance(video, (np.ndarray, torch.Tensor)) and video.ndim == 5: + video = list(video) + elif isinstance(video, (np.ndarray, torch.Tensor)) and video.ndim == 4: video = [video] - - # List of PIL images. - elif isinstance(video, list) and isinstance(video[0], PIL.Image.Image): + elif is_valid_image(video): + video = [[video]] + elif isinstance(video, list) and is_valid_image(video[0]): video = [video] - - elif not (isinstance(video, list) and all(isinstance(i, supported_formats) for i in video)): + elif isinstance(video, list) and is_valid_image_input(video[0]): + video = video + else: raise ValueError( - f"Input is in incorrect format: {[type(i) for i in video]}. Currently, we only support {', '.join(list(map(str, supported_formats)))}" + "Input is in incorrect format. 
Currently, we only support numpy.ndarray, torch.Tensor, PIL.Image.Image" ) - if isinstance(video[0], np.ndarray): - # When the number of dimension of the first element in `video` is 5, it means - # each element in the `video` list is a video. - video = np.concatenate(video, axis=0) if video[0].ndim == 5 else np.stack(video, axis=0) - - if video.ndim == 4: - video = video[None, ...] - - elif isinstance(video[0], torch.Tensor): - video = torch.cat(video, dim=0) if video[0].ndim == 5 else torch.stack(video, dim=0) - - # don't need any preprocess if the video is latents - channel = video.shape[1] - if channel == 4: - return video - - # List of 5d tensors/ndarrays. - elif isinstance(video[0], list): - if isinstance(video[0][0], (np.ndarray, torch.Tensor)): - all_frames = [] - for list_of_videos in video: - temp_frames = [] - for vid in list_of_videos: - if vid.ndim == 4: - current_vid_frames = np.stack(vid, axis=0) if isinstance(vid, np.ndarray) else vid - elif vid.ndim == 5: - current_vid_frames = ( - np.concatenate(vid, axis=0) if isinstance(vid, np.ndarray) else torch.cat(vid, dim=0) - ) - temp_frames.append(current_vid_frames) - - # Process inner list. - temp_frames = ( - np.stack(temp_frames, axis=0) - if isinstance(temp_frames[0], np.ndarray) - else torch.stack(temp_frames, axis=0) - ) - all_frames.append(temp_frames) - - # Process outer list. - video = ( - np.concatenate(all_frames, axis=0) - if isinstance(all_frames[0], np.ndarray) - else torch.cat(all_frames, dim=0) - ) - - # `preprocess()` here would return a PT tensor. - video = torch.stack([self.preprocess(f) for f in video], dim=0) - - # move channels before num_frames + video = torch.stack([self.preprocess(img) for img in video], dim=0) video = video.permute(0, 2, 1, 3, 4) return video diff --git a/tests/others/test_video_processor.py b/tests/others/test_video_processor.py index 71524b35904b..40f024fc9b2b 100644 --- a/tests/others/test_video_processor.py +++ b/tests/others/test_video_processor.py @@ -140,7 +140,7 @@ def test_video_processor_pil(self, input_type): input_np = self.to_np(input).astype("float32") / 255.0 if output_type != "pil" else self.to_np(input) assert np.abs(input_np - out_np).max() < 1e-6, f"Decoded output does not match input for {output_type=}" - @parameterized.expand(["list_4d_np", "list_list_4d_np", "list_5d_np", "5d_np"]) + @parameterized.expand(["list_4d_np", "list_5d_np", "5d_np"]) def test_video_processor_np(self, input_type): video_processor = VideoProcessor(do_resize=False, do_normalize=True) @@ -154,7 +154,7 @@ def test_video_processor_np(self, input_type): ) assert np.abs(input_np - out_np).max() < 1e-6, f"Decoded output does not match input for {output_type=}" - @parameterized.expand(["list_4d_pt", "list_list_4d_pt", "list_5d_pt", "5d_pt"]) + @parameterized.expand(["list_4d_pt", "list_5d_pt", "5d_pt"]) def test_video_processor_pt(self, input_type): video_processor = VideoProcessor(do_resize=False, do_normalize=True) From feb561f88d87a3d2e940a094e19ec7e91eca3973 Mon Sep 17 00:00:00 2001 From: yiyixuxu Date: Sun, 5 May 2024 20:23:41 +0200 Subject: [PATCH 2/7] update remove deprecate --- src/diffusers/image_processor.py | 16 +++++----------- src/diffusers/video_processor.py | 19 +++++++++++-------- 2 files changed, 16 insertions(+), 19 deletions(-) diff --git a/src/diffusers/image_processor.py b/src/diffusers/image_processor.py index 660368e98c38..bff61c2d5bea 100644 --- a/src/diffusers/image_processor.py +++ b/src/diffusers/image_processor.py @@ -42,9 +42,9 @@ def is_valid_image(image): return 
isinstance(image, PIL.Image.Image) or isinstance(image, (np.ndarray, torch.Tensor)) and image.ndim in (2, 3) -def is_valid_image_input(images): - # check if the image input is one of the supported formats: - # it can be either a 4d pytorch tensor or numpy array, a valid image or a list of valid image +def is_valid_image_imagelist(images): + # check if the image input is one of the supported formats for image and image list: + # it can be either (1) a 4d pytorch tensor or numpy array, (2) a valid image or (3) list of valid image if isinstance(images, (np.ndarray, torch.Tensor)) and images.ndim == 4: return True elif is_valid_image(images): @@ -514,17 +514,11 @@ def preprocess( else: image = np.expand_dims(image, axis=-1) + # image processor only accept image or a list of images or a batch of images (4d array/tenssors) as inputs, + # while we do accept a list of 4d array/tensors, we concatenate them to a single image batch if isinstance(image, list) and isinstance(image[0], np.ndarray) and image[0].ndim == 4: - warnings.warn( - "Passing `image` as list of 4-dimensional numpy array is deprecated." - "The expected numpy array input format for multiple images are either a single 4-d array or a list of 3-d arrays." - ) image = np.concatenate(image, axis=0) if isinstance(image, list) and isinstance(image[0], torch.Tensor) and image[0].ndim == 4: - warnings.warn( - "Passing `image` as list of 4-dimensional pytorch tensor is deprecated." - "The expected pytorch tensor input format for multiple images are either a single 4-d tensor or a list of 3-d tensors." - ) image = torch.cat(image, axis=0) if isinstance(image, (np.ndarray, torch.Tensor)) and image.ndim == 4: diff --git a/src/diffusers/video_processor.py b/src/diffusers/video_processor.py index f375a9d0ebb2..db4f42f9e392 100644 --- a/src/diffusers/video_processor.py +++ b/src/diffusers/video_processor.py @@ -18,7 +18,7 @@ import PIL import torch -from .image_processor import VaeImageProcessor, is_valid_image, is_valid_image_input +from .image_processor import VaeImageProcessor, is_valid_image, is_valid_image_imagelist class VideoProcessor(VaeImageProcessor): @@ -67,17 +67,20 @@ def preprocess_video(self, video) -> torch.Tensor: * 5D Torch tensors: expected shape for each array: (batch_size, num_frames, num_channels, height, width). """ + # video processor only accept video or a list of videos or a batch of videos (5d array/tenssors) as inputs, + # while we do accept a list of 5d array/tensors, we concatenate them to a single video batch + if isinstance(video, list) and isinstance(video[0], np.ndarray) and video[0].ndim == 5: + video = np.concatenate(video, axis=0) + if isinstance(video, list) and isinstance(video[0], torch.Tensor) and video[0].ndim == 5: + video = torch.cat(video, axis=0) - # make sure video is either a list of 4-d array or a list of list images + # ensure the input is a list of videos. if it is a batch of videos, it is converted to a list of videos + # If it is is a single video, it is convereted to a list of videos. 
if isinstance(video, (np.ndarray, torch.Tensor)) and video.ndim == 5: video = list(video) - elif isinstance(video, (np.ndarray, torch.Tensor)) and video.ndim == 4: + elif isinstance(video, list) and is_valid_image(video[0]) or is_valid_image_imagelist(video): video = [video] - elif is_valid_image(video): - video = [[video]] - elif isinstance(video, list) and is_valid_image(video[0]): - video = [video] - elif isinstance(video, list) and is_valid_image_input(video[0]): + elif isinstance(video, list) and is_valid_image_imagelist(video[0]): video = video else: raise ValueError( From 2721d9c526bac1c7cb9d09ac250e4037040d76e0 Mon Sep 17 00:00:00 2001 From: YiYi Xu Date: Sun, 5 May 2024 08:29:31 -1000 Subject: [PATCH 3/7] Update src/diffusers/video_processor.py --- src/diffusers/video_processor.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/diffusers/video_processor.py b/src/diffusers/video_processor.py index db4f42f9e392..e4f85d158074 100644 --- a/src/diffusers/video_processor.py +++ b/src/diffusers/video_processor.py @@ -75,7 +75,7 @@ def preprocess_video(self, video) -> torch.Tensor: video = torch.cat(video, axis=0) # ensure the input is a list of videos. if it is a batch of videos, it is converted to a list of videos - # If it is is a single video, it is convereted to a list of videos. + # If it is is a single video, it is convereted to a list of one video. if isinstance(video, (np.ndarray, torch.Tensor)) and video.ndim == 5: video = list(video) elif isinstance(video, list) and is_valid_image(video[0]) or is_valid_image_imagelist(video): From 964508b12c2d5c40a6bd68c5a25c645d3ff755ce Mon Sep 17 00:00:00 2001 From: yiyixuxu Date: Sun, 5 May 2024 20:34:50 +0200 Subject: [PATCH 4/7] update --- src/diffusers/image_processor.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/src/diffusers/image_processor.py b/src/diffusers/image_processor.py index bff61c2d5bea..39e80c27baef 100644 --- a/src/diffusers/image_processor.py +++ b/src/diffusers/image_processor.py @@ -514,13 +514,15 @@ def preprocess( else: image = np.expand_dims(image, axis=-1) - # image processor only accept image or a list of images or a batch of images (4d array/tenssors) as inputs, + # image processor only accept image or a list of images or a batch of images (4d array/tensors) as inputs, # while we do accept a list of 4d array/tensors, we concatenate them to a single image batch if isinstance(image, list) and isinstance(image[0], np.ndarray) and image[0].ndim == 4: image = np.concatenate(image, axis=0) if isinstance(image, list) and isinstance(image[0], torch.Tensor) and image[0].ndim == 4: image = torch.cat(image, axis=0) + # ensure the input is a list of images. 
if it is a batch of images, it is converted to a list of images
+        # if it is a single image, it is converted to a list of one image
         if isinstance(image, (np.ndarray, torch.Tensor)) and image.ndim == 4:
             image = list(image)
         if is_valid_image(image):

From d59a596fe4149583cab6acd632a03d821914e367 Mon Sep 17 00:00:00 2001
From: YiYi Xu
Date: Sun, 5 May 2024 08:38:21 -1000
Subject: [PATCH 5/7] Apply suggestions from code review

---
 src/diffusers/image_processor.py | 2 +-
 src/diffusers/video_processor.py | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/src/diffusers/image_processor.py b/src/diffusers/image_processor.py
index 39e80c27baef..474a2f28e916 100644
--- a/src/diffusers/image_processor.py
+++ b/src/diffusers/image_processor.py
@@ -514,7 +514,7 @@ def preprocess(
         else:
             image = np.expand_dims(image, axis=-1)
 
-        # image processor only accept image or a list of images or a batch of images (4d array/tensors) as inputs,
+        # image processor only accepts image or a list of images or a batch of images (4d array/tensors) as inputs,
         # while we do accept a list of 4d array/tensors, we concatenate them to a single image batch
         if isinstance(image, list) and isinstance(image[0], np.ndarray) and image[0].ndim == 4:
             image = np.concatenate(image, axis=0)
diff --git a/src/diffusers/video_processor.py b/src/diffusers/video_processor.py
index e4f85d158074..4645bfe59b11 100644
--- a/src/diffusers/video_processor.py
+++ b/src/diffusers/video_processor.py
@@ -67,7 +67,7 @@ def preprocess_video(self, video) -> torch.Tensor:
             * 5D Torch tensors: expected shape for each array: (batch_size, num_frames, num_channels, height,
               width).
         """
-        # video processor only accept video or a list of videos or a batch of videos (5d array/tenssors) as inputs,
+        # video processor only accepts video or a list of videos or a batch of videos (5d array/tensors) as inputs,
         # while we do accept a list of 5d array/tensors, we concatenate them to a single video batch
         if isinstance(video, list) and isinstance(video[0], np.ndarray) and video[0].ndim == 5:
             video = np.concatenate(video, axis=0)

From 90fac4fef2e4a89a2c93c2698181141c9436b5ad Mon Sep 17 00:00:00 2001
From: yiyixuxu
Date: Mon, 6 May 2024 20:07:38 +0200
Subject: [PATCH 6/7] deprecate list of 5d for video and list of 4d for image +
 apply other feedbacks

---
 src/diffusers/image_processor.py | 20 ++++++++++++++++----
 src/diffusers/video_processor.py | 16 +++++++++++++---
 2 files changed, 29 insertions(+), 7 deletions(-)

diff --git a/src/diffusers/image_processor.py b/src/diffusers/image_processor.py
index 39e80c27baef..702c5f3e6a10 100644
--- a/src/diffusers/image_processor.py
+++ b/src/diffusers/image_processor.py
@@ -44,7 +44,10 @@ def is_valid_image(image):
 
 def is_valid_image_imagelist(images):
     # check if the image input is one of the supported formats for image and image list:
-    # it can be either (1) a 4d pytorch tensor or numpy array, (2) a valid image or (3) list of valid image
+    # it can be one of the following three formats:
+    # (1) a 4d pytorch tensor or numpy array,
+    # (2) a valid image: PIL.Image.Image, 2-d np.ndarray or torch.Tensor (grayscale image), 3-d np.ndarray or torch.Tensor
+    # (3) a list of valid images
     if isinstance(images, (np.ndarray, torch.Tensor)) and images.ndim == 4:
         return True
     elif is_valid_image(images):
@@ -514,13 +517,23 @@ def preprocess(
         else:
             image = np.expand_dims(image, axis=-1)
 
-        # image processor only accept image or a list of images or a batch of images (4d array/tensors) as inputs,
-        # while we do accept a list of 4d array/tensors, we concatenate them to a single image batch
         if isinstance(image, list) and isinstance(image[0], np.ndarray) and image[0].ndim == 4:
+            warnings.warn(
+                "Passing `image` as a list of 4d np.ndarray is deprecated. "
+                "Please concatenate the list along the batch dimension and pass it as a single 4d np.ndarray",
+                FutureWarning,
+            )
             image = np.concatenate(image, axis=0)
         if isinstance(image, list) and isinstance(image[0], torch.Tensor) and image[0].ndim == 4:
+            warnings.warn(
+                "Passing `image` as a list of 4d torch.Tensor is deprecated. "
+                "Please concatenate the list along the batch dimension and pass it as a single 4d torch.Tensor",
+                FutureWarning,
+            )
             image = torch.cat(image, axis=0)
 
-        # ensure the input is a list of images. if it is a batch of images, it is converted to a list of images
+        # ensure the input is a list of images.
+        # if it is a batch of images (4d torch.Tensor or np.ndarray), it is converted to a list of images (a list of 3d torch.Tensor or np.ndarray)
         # if it is a single image, it is converted to a list of one image
         if isinstance(image, (np.ndarray, torch.Tensor)) and image.ndim == 4:
             image = list(image)
         if is_valid_image(image):
diff --git a/src/diffusers/video_processor.py b/src/diffusers/video_processor.py
index e4f85d158074..4043ac6e5e85 100644
--- a/src/diffusers/video_processor.py
+++ b/src/diffusers/video_processor.py
@@ -12,6 +12,7 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
+import warnings
 from typing import List, Union
 
 import numpy as np
@@ -67,14 +68,23 @@ def preprocess_video(self, video) -> torch.Tensor:
             * 5D Torch tensors: expected shape for each array: (batch_size, num_frames, num_channels, height,
               width).
         """
-        # video processor only accept video or a list of videos or a batch of videos (5d array/tenssors) as inputs,
-        # while we do accept a list of 5d array/tensors, we concatenate them to a single video batch
         if isinstance(video, list) and isinstance(video[0], np.ndarray) and video[0].ndim == 5:
+            warnings.warn(
+                "Passing `video` as a list of 5d np.ndarray is deprecated. "
+                "Please concatenate the list along the batch dimension and pass it as a single 5d np.ndarray",
+                FutureWarning,
+            )
             video = np.concatenate(video, axis=0)
         if isinstance(video, list) and isinstance(video[0], torch.Tensor) and video[0].ndim == 5:
+            warnings.warn(
+                "Passing `video` as a list of 5d torch.Tensor is deprecated. "
+                "Please concatenate the list along the batch dimension and pass it as a single 5d torch.Tensor",
+                FutureWarning,
+            )
             video = torch.cat(video, axis=0)
 
-        # ensure the input is a list of videos. if it is a batch of videos, it is converted to a list of videos
+        # ensure the input is a list of videos.
+        # if it is a batch of videos (5d torch.Tensor or np.ndarray), it is converted to a list of videos (a list of 4d torch.Tensor or np.ndarray)
         # If it is is a single video, it is convereted to a list of one video.
         if isinstance(video, (np.ndarray, torch.Tensor)) and video.ndim == 5:
             video = list(video)
         elif isinstance(video, list) and is_valid_image(video[0]) or is_valid_image_imagelist(video):

From 02594f201fe2213d5e1aed09e5adc356a333eda5 Mon Sep 17 00:00:00 2001
From: yiyixuxu
Date: Mon, 6 May 2024 20:12:53 +0200
Subject: [PATCH 7/7] up

---
 src/diffusers/image_processor.py | 6 +++---
 src/diffusers/video_processor.py | 6 +++---
 2 files changed, 6 insertions(+), 6 deletions(-)

diff --git a/src/diffusers/image_processor.py b/src/diffusers/image_processor.py
index 702c5f3e6a10..027691ad9f2f 100644
--- a/src/diffusers/image_processor.py
+++ b/src/diffusers/image_processor.py
@@ -532,9 +532,9 @@ def preprocess(
             )
             image = torch.cat(image, axis=0)
 
-        # ensure the input is a list of images.
-        # if it is a batch of images (4d torch.Tensor or np.ndarray), it is converted to a list of images (a list of 3d torch.Tensor or np.ndarray)
-        # if it is a single image, it is converted to a list of one image
+        # ensure the input is a list of images:
+        # - if it is a batch of images (4d torch.Tensor or np.ndarray), it is converted to a list of images (a list of 3d torch.Tensor or np.ndarray)
+        # - if it is a single image, it is converted to a list of one image
         if isinstance(image, (np.ndarray, torch.Tensor)) and image.ndim == 4:
             image = list(image)
         if is_valid_image(image):
diff --git a/src/diffusers/video_processor.py b/src/diffusers/video_processor.py
index 4043ac6e5e85..c03736c6398f 100644
--- a/src/diffusers/video_processor.py
+++ b/src/diffusers/video_processor.py
@@ -83,9 +83,9 @@ def preprocess_video(self, video) -> torch.Tensor:
             )
             video = torch.cat(video, axis=0)
 
-        # ensure the input is a list of videos.
-        # if it is a batch of videos (5d torch.Tensor or np.ndarray), it is converted to a list of videos (a list of 4d torch.Tensor or np.ndarray)
-        # If it is is a single video, it is convereted to a list of one video.
+        # ensure the input is a list of videos:
+        # - if it is a batch of videos (5d torch.Tensor or np.ndarray), it is converted to a list of videos (a list of 4d torch.Tensor or np.ndarray)
+        # - if it is a single video, it is converted to a list of one video.
         if isinstance(video, (np.ndarray, torch.Tensor)) and video.ndim == 5:
             video = list(video)
         elif isinstance(video, list) and is_valid_image(video[0]) or is_valid_image_imagelist(video):
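
Taken together, the seven patches settle on one input contract for both processors: pass a single image/video, a list of them, or a pre-batched 4d/5d array or tensor, and the output is always a normalized batched tensor. Below is a minimal usage sketch of that contract, assuming the module paths used in this series (diffusers.video_processor is new in PATCH 1); the sizes (2 videos, 8 frames, 32x32) are arbitrary illustration values, and the snippet is not part of the patches themselves.

    import numpy as np
    import PIL.Image

    from diffusers.image_processor import VaeImageProcessor
    from diffusers.video_processor import VideoProcessor

    processor = VideoProcessor(do_resize=False, do_normalize=True)

    # A batch of videos as a single 5d array: (batch_size, num_frames, height, width, num_channels).
    video_np = np.random.rand(2, 8, 32, 32, 3).astype("float32")
    out = processor.preprocess_video(video_np)
    print(out.shape)  # torch.Size([2, 3, 8, 32, 32]) -- channels are moved before num_frames

    # A single video as a list of PIL images; it is wrapped into a batch of one video.
    frames = [PIL.Image.new("RGB", (32, 32)) for _ in range(8)]
    out = processor.preprocess_video(frames)
    print(out.shape)  # torch.Size([1, 3, 8, 32, 32])

    # A list of 5d arrays is still accepted, but after PATCH 6 it raises a FutureWarning
    # and is concatenated along the batch dimension first.
    out = processor.preprocess_video([video_np, video_np])
    print(out.shape)  # torch.Size([4, 3, 8, 32, 32])

    # The image processor follows the same convention one dimension down:
    # a 4d array is treated as a batch of images and converted to a list internally.
    image_processor = VaeImageProcessor(do_resize=False, do_normalize=True)
    batch = np.random.rand(2, 32, 32, 3).astype("float32")
    print(image_processor.preprocess(batch).shape)  # torch.Size([2, 3, 32, 32])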