From f91f6bd1ef954eef1f8fadea995b42e3db1a39a3 Mon Sep 17 00:00:00 2001
From: Sayak Paul
Date: Tue, 4 Apr 2023 09:06:38 +0530
Subject: [PATCH 1/2] fix: norm group test for UNet3D.

---
 tests/models/test_models_unet_3d_condition.py | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/tests/models/test_models_unet_3d_condition.py b/tests/models/test_models_unet_3d_condition.py
index 729367a0c164..5a0d74a3ea5a 100644
--- a/tests/models/test_models_unet_3d_condition.py
+++ b/tests/models/test_models_unet_3d_condition.py
@@ -119,12 +119,11 @@ def test_xformers_enable_works(self):
             == "XFormersAttnProcessor"
         ), "xformers is not enabled"
 
-    # Overriding because `block_out_channels` needs to be different for this model.
+    # Overriding because `norm_num_groups` needs to be different for this model.
     def test_forward_with_norm_groups(self):
         init_dict, inputs_dict = self.prepare_init_args_and_inputs_for_common()
 
         init_dict["norm_num_groups"] = 32
-        init_dict["block_out_channels"] = (32, 64, 64, 64)
 
         model = self.model_class(**init_dict)
         model.to(torch_device)

From ee1016cd360423bab25d02b1e6b9787593519912 Mon Sep 17 00:00:00 2001
From: Sayak Paul
Date: Tue, 11 Apr 2023 09:44:44 +0530
Subject: [PATCH 2/2] refactor text-to-video zero docs.

---
 docs/source/en/api/pipelines/text_to_video_zero.mdx | 9 +++++++--
 .../pipeline_text_to_video_zero.py                  | 5 ++---
 2 files changed, 9 insertions(+), 5 deletions(-)

diff --git a/docs/source/en/api/pipelines/text_to_video_zero.mdx b/docs/source/en/api/pipelines/text_to_video_zero.mdx
index 86653ae1019b..3ee10f01c377 100644
--- a/docs/source/en/api/pipelines/text_to_video_zero.mdx
+++ b/docs/source/en/api/pipelines/text_to_video_zero.mdx
@@ -61,6 +61,7 @@ Resources:
 To generate a video from prompt, run the following python command
 ```python
 import torch
+import imageio
 from diffusers import TextToVideoZeroPipeline
 
 model_id = "runwayml/stable-diffusion-v1-5"
@@ -68,6 +69,7 @@ pipe = TextToVideoZeroPipeline.from_pretrained(model_id, torch_dtype=torch.float
 
 prompt = "A panda is playing guitar on times square"
 result = pipe(prompt=prompt).images
+result = [(r * 255).astype("uint8") for r in result]
 imageio.mimsave("video.mp4", result, fps=4)
 ```
 You can change these parameters in the pipeline call:
@@ -95,6 +97,7 @@ To generate a video from prompt with additional pose control
 
 2. Read video containing extracted pose images
     ```python
+    from PIL import Image
     import imageio
 
     reader = imageio.get_reader(video_path, "ffmpeg")
@@ -151,6 +154,7 @@ To perform text-guided video editing (with [InstructPix2Pix](./stable_diffusion/
 
 2. Read video from path
     ```python
+    from PIL import Image
     import imageio
 
     reader = imageio.get_reader(video_path, "ffmpeg")
@@ -174,14 +178,14 @@ To perform text-guided video editing (with [InstructPix2Pix](./stable_diffusion/
     ```
 
 
-### Dreambooth specialization
+### DreamBooth specialization
 
 Methods **Text-To-Video**, **Text-To-Video with Pose Control** and **Text-To-Video with Edge Control**
 can run with custom [DreamBooth](../training/dreambooth) models, as shown below for
 [Canny edge ControlNet model](https://huggingface.co/lllyasviel/sd-controlnet-canny) and
 [Avatar style DreamBooth](https://huggingface.co/PAIR/text2video-zero-controlnet-canny-avatar) model
 
-1. Download demo video from huggingface
+1. Download a demo video
 
     ```python
     from huggingface_hub import hf_hub_download
 
@@ -193,6 +197,7 @@
 
 2. Read video from path
    ```python
+    from PIL import Image
     import imageio
 
     reader = imageio.get_reader(video_path, "ffmpeg")
diff --git a/src/diffusers/pipelines/text_to_video_synthesis/pipeline_text_to_video_zero.py b/src/diffusers/pipelines/text_to_video_synthesis/pipeline_text_to_video_zero.py
index 6cf4b8544b01..35e3ae6a6d6c 100644
--- a/src/diffusers/pipelines/text_to_video_synthesis/pipeline_text_to_video_zero.py
+++ b/src/diffusers/pipelines/text_to_video_synthesis/pipeline_text_to_video_zero.py
@@ -374,9 +374,8 @@ def __call__(
                 Pre-generated noisy latents, sampled from a Gaussian distribution, to be used as inputs for image
                 generation. Can be used to tweak the same generation with different prompts. If not provided, a
                 latents tensor will ge generated by sampling using the supplied random `generator`.
-            output_type (`str`, *optional*, defaults to `"pil"`):
-                The output format of the generate image. Choose between
-                [PIL](https://pillow.readthedocs.io/en/stable/): `PIL.Image.Image` or `np.array`.
+            output_type (`str`, *optional*, defaults to `"numpy"`):
+                The output format of the generated image. Choose between `"latent"` and `"numpy"`.
             return_dict (`bool`, *optional*, defaults to `True`):
                 Whether or not to return a [`~pipelines.stable_diffusion.StableDiffusionPipelineOutput`] instead of a
                 plain tuple.
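A minimal end-to-end sketch of the usage documented by PATCH 2/2, assuming the pipeline's default `"numpy"` output (float frames in `[0, 1]`, which is why the uint8 conversion is added above); the read-back loop and the `frame_count` value are illustrative assumptions mirroring the steps that gain `from PIL import Image`, not lines from the patch.

```python
import torch
import imageio
from PIL import Image
from diffusers import TextToVideoZeroPipeline

# Generate frames; with the default "numpy" output they come back as float
# arrays in [0, 1], so rescale to uint8 before writing the mp4.
pipe = TextToVideoZeroPipeline.from_pretrained(
    "runwayml/stable-diffusion-v1-5", torch_dtype=torch.float16
).to("cuda")
result = pipe(prompt="A panda is playing guitar on times square").images
result = [(r * 255).astype("uint8") for r in result]
imageio.mimsave("video.mp4", result, fps=4)

# Read a video back as PIL images, as in the pose/edge/DreamBooth sections;
# frame_count here is an assumed value, not part of the patch.
reader = imageio.get_reader("video.mp4", "ffmpeg")
frame_count = 8
video = [Image.fromarray(reader.get_data(i)) for i in range(frame_count)]
```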