From 6430b80a9a116f9dc62e5f3760d4c636ea507390 Mon Sep 17 00:00:00 2001 From: Sayak Paul Date: Fri, 10 Jan 2025 16:27:26 +0530 Subject: [PATCH 1/8] Update hunyuan_video.md to rectify the checkpoint id --- docs/source/en/api/pipelines/hunyuan_video.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/docs/source/en/api/pipelines/hunyuan_video.md b/docs/source/en/api/pipelines/hunyuan_video.md index df43c7f8568d..bbfccb2a8c35 100644 --- a/docs/source/en/api/pipelines/hunyuan_video.md +++ b/docs/source/en/api/pipelines/hunyuan_video.md @@ -45,14 +45,14 @@ from diffusers.utils import export_to_video quant_config = DiffusersBitsAndBytesConfig(load_in_8bit=True) transformer_8bit = HunyuanVideoTransformer3DModel.from_pretrained( - "tencent/HunyuanVideo", + "hunyuanvideo-community/HunyuanVideo", subfolder="transformer", quantization_config=quant_config, torch_dtype=torch.float16, ) pipeline = HunyuanVideoPipeline.from_pretrained( - "tencent/HunyuanVideo", + "hunyuanvideo-community/HunyuanVideo", transformer=transformer_8bit, torch_dtype=torch.float16, device_map="balanced", From 8ca260dc1b86faf20eb6009ef6b7d95c4bdca4da Mon Sep 17 00:00:00 2001 From: sayakpaul Date: Fri, 10 Jan 2025 20:12:14 +0530 Subject: [PATCH 2/8] bfloat16 --- docs/source/en/api/pipelines/hunyuan_video.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/source/en/api/pipelines/hunyuan_video.md b/docs/source/en/api/pipelines/hunyuan_video.md index bbfccb2a8c35..25828a2d05c4 100644 --- a/docs/source/en/api/pipelines/hunyuan_video.md +++ b/docs/source/en/api/pipelines/hunyuan_video.md @@ -48,7 +48,7 @@ transformer_8bit = HunyuanVideoTransformer3DModel.from_pretrained( "hunyuanvideo-community/HunyuanVideo", subfolder="transformer", quantization_config=quant_config, - torch_dtype=torch.float16, + torch_dtype=torch.bfloat16, ) pipeline = HunyuanVideoPipeline.from_pretrained( From 86573404935ef1abd948516ba5a2122785de1faf Mon Sep 17 00:00:00 2001 From: sayakpaul Date: Sat, 11 Jan 2025 08:26:09 +0530 Subject: [PATCH 3/8] more fixes --- docs/source/en/api/models/autoencoder_kl_hunyuan_video.md | 2 +- docs/source/en/api/pipelines/hunyuan_video.md | 4 ++-- docs/source/en/using-diffusers/text-img2vid.md | 6 +++--- .../models/transformers/transformer_hunyuan_video.py | 3 ++- 4 files changed, 8 insertions(+), 7 deletions(-) diff --git a/docs/source/en/api/models/autoencoder_kl_hunyuan_video.md b/docs/source/en/api/models/autoencoder_kl_hunyuan_video.md index 33dff5b903cd..0d43bab2e283 100644 --- a/docs/source/en/api/models/autoencoder_kl_hunyuan_video.md +++ b/docs/source/en/api/models/autoencoder_kl_hunyuan_video.md @@ -11,7 +11,7 @@ specific language governing permissions and limitations under the License. --> # AutoencoderKLHunyuanVideo -The 3D variational autoencoder (VAE) model with KL loss used in [HunyuanVideo](https://github.com/Tencent/HunyuanVideo/), which was introduced in [HunyuanVideo: A Systematic Framework For Large Video Generative Models](https://huggingface.co/papers/2412.03603) by Tencent. +The 3D variational autoencoder (VAE) model with KL loss used in [HunyuanVideo](https://github.com/hunyuanvideo-community/HunyuanVideo/), which was introduced in [HunyuanVideo: A Systematic Framework For Large Video Generative Models](https://huggingface.co/papers/2412.03603) by Tencent. The model can be loaded with the following code snippet. 
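The snippet that context line refers to falls outside the hunk; as a rough sketch (assuming the `AutoencoderKLHunyuanVideo` class named in the heading and the `hunyuanvideo-community/HunyuanVideo` checkpoint id used by the code examples in this series), it loads the VAE like so:

```py
import torch
from diffusers import AutoencoderKLHunyuanVideo

# load only the 3D causal VAE, kept in float16 per the inference recommendations
vae = AutoencoderKLHunyuanVideo.from_pretrained(
    "hunyuanvideo-community/HunyuanVideo", subfolder="vae", torch_dtype=torch.float16
)
```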
diff --git a/docs/source/en/api/pipelines/hunyuan_video.md b/docs/source/en/api/pipelines/hunyuan_video.md index 25828a2d05c4..ec640526a17d 100644 --- a/docs/source/en/api/pipelines/hunyuan_video.md +++ b/docs/source/en/api/pipelines/hunyuan_video.md @@ -16,7 +16,7 @@ [HunyuanVideo](https://www.arxiv.org/abs/2412.03603) by Tencent. -*Recent advancements in video generation have significantly impacted daily life for both individuals and industries. However, the leading video generation models remain closed-source, resulting in a notable performance gap between industry capabilities and those available to the public. In this report, we introduce HunyuanVideo, an innovative open-source video foundation model that demonstrates performance in video generation comparable to, or even surpassing, that of leading closed-source models. HunyuanVideo encompasses a comprehensive framework that integrates several key elements, including data curation, advanced architectural design, progressive model scaling and training, and an efficient infrastructure tailored for large-scale model training and inference. As a result, we successfully trained a video generative model with over 13 billion parameters, making it the largest among all open-source models. We conducted extensive experiments and implemented a series of targeted designs to ensure high visual quality, motion dynamics, text-video alignment, and advanced filming techniques. According to evaluations by professionals, HunyuanVideo outperforms previous state-of-the-art models, including Runway Gen-3, Luma 1.6, and three top-performing Chinese video generative models. By releasing the code for the foundation model and its applications, we aim to bridge the gap between closed-source and open-source communities. This initiative will empower individuals within the community to experiment with their ideas, fostering a more dynamic and vibrant video generation ecosystem. The code is publicly available at [this https URL](https://github.com/Tencent/HunyuanVideo).* +*Recent advancements in video generation have significantly impacted daily life for both individuals and industries. However, the leading video generation models remain closed-source, resulting in a notable performance gap between industry capabilities and those available to the public. In this report, we introduce HunyuanVideo, an innovative open-source video foundation model that demonstrates performance in video generation comparable to, or even surpassing, that of leading closed-source models. HunyuanVideo encompasses a comprehensive framework that integrates several key elements, including data curation, advanced architectural design, progressive model scaling and training, and an efficient infrastructure tailored for large-scale model training and inference. As a result, we successfully trained a video generative model with over 13 billion parameters, making it the largest among all open-source models. We conducted extensive experiments and implemented a series of targeted designs to ensure high visual quality, motion dynamics, text-video alignment, and advanced filming techniques. According to evaluations by professionals, HunyuanVideo outperforms previous state-of-the-art models, including Runway Gen-3, Luma 1.6, and three top-performing Chinese video generative models. By releasing the code for the foundation model and its applications, we aim to bridge the gap between closed-source and open-source communities. 
This initiative will empower individuals within the community to experiment with their ideas, fostering a more dynamic and vibrant video generation ecosystem. The code is publicly available at [this https URL](https://github.com/hunyuanvideo-community/HunyuanVideo).*


@@ -30,7 +30,7 @@ Recommendations for inference:

- VAE should be in `torch.float16`.
- `num_frames` should be of the form `4 * k + 1`, for example `49` or `129`.
- For smaller resolution videos, try lower values of `shift` (between `2.0` and `5.0`) in the [Scheduler](https://huggingface.co/docs/diffusers/main/en/api/schedulers/flow_match_euler_discrete#diffusers.FlowMatchEulerDiscreteScheduler.shift). For larger resolution videos, try higher values (between `7.0` and `12.0`). The default value is `7.0` for HunyuanVideo.
-- For more information about supported resolutions and other details, please refer to the original repository [here](https://github.com/Tencent/HunyuanVideo/).
+- For more information about supported resolutions and other details, please refer to the original repository [here](https://github.com/hunyuanvideo-community/HunyuanVideo/).

## Quantization

diff --git a/docs/source/en/using-diffusers/text-img2vid.md b/docs/source/en/using-diffusers/text-img2vid.md
index 7b27a258f247..b02c8b723803 100644
--- a/docs/source/en/using-diffusers/text-img2vid.md
+++ b/docs/source/en/using-diffusers/text-img2vid.md
@@ -70,7 +70,7 @@ export_to_video(video, "output.mp4", fps=8)

> [!TIP]
> HunyuanVideo is a 13B parameter model and requires a lot of memory. Refer to the HunyuanVideo [Quantization](../api/pipelines/hunyuan_video#quantization) guide to learn how to quantize the model. CogVideoX and LTX-Video are more lightweight options that can still generate high-quality videos.

-[HunyuanVideo](https://huggingface.co/tencent/HunyuanVideo) features a dual-stream to single-stream diffusion transformer (DiT) for learning video and text tokens separately, and then concatenating the video and text tokens to combine their information. A single multimodal large language model (MLLM) serves as the text encoder, and videos are also spatio-temporally compressed with a 3D causal VAE.
+[HunyuanVideo](https://huggingface.co/hunyuanvideo-community/HunyuanVideo) features a dual-stream to single-stream diffusion transformer (DiT) for learning video and text tokens separately, and then concatenating the video and text tokens to combine their information. A single multimodal large language model (MLLM) serves as the text encoder, and videos are also spatio-temporally compressed with a 3D causal VAE.
```py import torch @@ -78,10 +78,10 @@ from diffusers import HunyuanVideoPipeline, HunyuanVideoTransformer3DModel from diffusers.utils import export_to_video transformer = HunyuanVideoTransformer3DModel.from_pretrained( - "tencent/HunyuanVideo", subfolder="transformer", torch_dtype=torch.bfloat16 + "hunyuanvideo-community/HunyuanVideo", subfolder="transformer", torch_dtype=torch.bfloat16 ) pipe = HunyuanVideoPipeline.from_pretrained( - "tencent/HunyuanVideo", transformer=transformer, torch_dtype=torch.float16 + "hunyuanvideo-community/HunyuanVideo", transformer=transformer, torch_dtype=torch.float16 ) # reduce memory requirements diff --git a/src/diffusers/models/transformers/transformer_hunyuan_video.py b/src/diffusers/models/transformers/transformer_hunyuan_video.py index 044f2048775f..910d24c8b0f9 100644 --- a/src/diffusers/models/transformers/transformer_hunyuan_video.py +++ b/src/diffusers/models/transformers/transformer_hunyuan_video.py @@ -504,7 +504,8 @@ def forward( class HunyuanVideoTransformer3DModel(ModelMixin, ConfigMixin, PeftAdapterMixin, FromOriginalModelMixin): r""" - A Transformer model for video-like data used in [HunyuanVideo](https://huggingface.co/tencent/HunyuanVideo). + A Transformer model for video-like data used in + [HunyuanVideo](https://huggingface.co/hunyuanvideo-community/HunyuanVideo). Args: in_channels (`int`, defaults to `16`): From b07d5883a2f042ed44b8a1dcb7ff5046ba6751c4 Mon Sep 17 00:00:00 2001 From: sayakpaul Date: Sat, 11 Jan 2025 08:48:01 +0530 Subject: [PATCH 4/8] don't update the checkpoint ids. --- docs/source/en/api/models/autoencoder_kl_hunyuan_video.md | 2 +- docs/source/en/api/pipelines/hunyuan_video.md | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/docs/source/en/api/models/autoencoder_kl_hunyuan_video.md b/docs/source/en/api/models/autoencoder_kl_hunyuan_video.md index 0d43bab2e283..7b96bef49480 100644 --- a/docs/source/en/api/models/autoencoder_kl_hunyuan_video.md +++ b/docs/source/en/api/models/autoencoder_kl_hunyuan_video.md @@ -11,7 +11,7 @@ specific language governing permissions and limitations under the License. --> # AutoencoderKLHunyuanVideo -The 3D variational autoencoder (VAE) model with KL loss used in [HunyuanVideo](https://github.com/hunyuanvideo-community/HunyuanVideo/), which was introduced in [HunyuanVideo: A Systematic Framework For Large Video Generative Models](https://huggingface.co/papers/2412.03603) by Tencent. +The 3D variational autoencoder (VAE) model with KL loss used in [HunyuanVideo](https://github.com/tencent/HunyuanVideo/), which was introduced in [HunyuanVideo: A Systematic Framework For Large Video Generative Models](https://huggingface.co/papers/2412.03603) by Tencent. The model can be loaded with the following code snippet. diff --git a/docs/source/en/api/pipelines/hunyuan_video.md b/docs/source/en/api/pipelines/hunyuan_video.md index ec640526a17d..6327f8cb7344 100644 --- a/docs/source/en/api/pipelines/hunyuan_video.md +++ b/docs/source/en/api/pipelines/hunyuan_video.md @@ -16,7 +16,7 @@ [HunyuanVideo](https://www.arxiv.org/abs/2412.03603) by Tencent. -*Recent advancements in video generation have significantly impacted daily life for both individuals and industries. However, the leading video generation models remain closed-source, resulting in a notable performance gap between industry capabilities and those available to the public. 
In this report, we introduce HunyuanVideo, an innovative open-source video foundation model that demonstrates performance in video generation comparable to, or even surpassing, that of leading closed-source models. HunyuanVideo encompasses a comprehensive framework that integrates several key elements, including data curation, advanced architectural design, progressive model scaling and training, and an efficient infrastructure tailored for large-scale model training and inference. As a result, we successfully trained a video generative model with over 13 billion parameters, making it the largest among all open-source models. We conducted extensive experiments and implemented a series of targeted designs to ensure high visual quality, motion dynamics, text-video alignment, and advanced filming techniques. According to evaluations by professionals, HunyuanVideo outperforms previous state-of-the-art models, including Runway Gen-3, Luma 1.6, and three top-performing Chinese video generative models. By releasing the code for the foundation model and its applications, we aim to bridge the gap between closed-source and open-source communities. This initiative will empower individuals within the community to experiment with their ideas, fostering a more dynamic and vibrant video generation ecosystem. The code is publicly available at [this https URL](https://github.com/hunyuanvideo-community/HunyuanVideo).* +*Recent advancements in video generation have significantly impacted daily life for both individuals and industries. However, the leading video generation models remain closed-source, resulting in a notable performance gap between industry capabilities and those available to the public. In this report, we introduce HunyuanVideo, an innovative open-source video foundation model that demonstrates performance in video generation comparable to, or even surpassing, that of leading closed-source models. HunyuanVideo encompasses a comprehensive framework that integrates several key elements, including data curation, advanced architectural design, progressive model scaling and training, and an efficient infrastructure tailored for large-scale model training and inference. As a result, we successfully trained a video generative model with over 13 billion parameters, making it the largest among all open-source models. We conducted extensive experiments and implemented a series of targeted designs to ensure high visual quality, motion dynamics, text-video alignment, and advanced filming techniques. According to evaluations by professionals, HunyuanVideo outperforms previous state-of-the-art models, including Runway Gen-3, Luma 1.6, and three top-performing Chinese video generative models. By releasing the code for the foundation model and its applications, we aim to bridge the gap between closed-source and open-source communities. This initiative will empower individuals within the community to experiment with their ideas, fostering a more dynamic and vibrant video generation ecosystem. 
The code is publicly available at [this https URL](https://github.com/tencent/HunyuanVideo).*

From 9c7969cf22c810733adb3677cd154e6680237016 Mon Sep 17 00:00:00 2001
From: sayakpaul
Date: Sat, 11 Jan 2025 08:49:29 +0530
Subject: [PATCH 5/8] update

---
docs/source/en/api/pipelines/hunyuan_video.md | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/docs/source/en/api/pipelines/hunyuan_video.md b/docs/source/en/api/pipelines/hunyuan_video.md
index 6327f8cb7344..a3b00a3c6835 100644
--- a/docs/source/en/api/pipelines/hunyuan_video.md
+++ b/docs/source/en/api/pipelines/hunyuan_video.md
@@ -30,7 +30,7 @@ Recommendations for inference:

- VAE should be in `torch.float16`.
- `num_frames` should be of the form `4 * k + 1`, for example `49` or `129`.
- For smaller resolution videos, try lower values of `shift` (between `2.0` and `5.0`) in the [Scheduler](https://huggingface.co/docs/diffusers/main/en/api/schedulers/flow_match_euler_discrete#diffusers.FlowMatchEulerDiscreteScheduler.shift). For larger resolution videos, try higher values (between `7.0` and `12.0`). The default value is `7.0` for HunyuanVideo.
-- For more information about supported resolutions and other details, please refer to the original repository [here](https://github.com/hunyuanvideo-community/HunyuanVideo/).
+- For more information about supported resolutions and other details, please refer to the original repository [here](https://github.com/tencent/HunyuanVideo/).

## Quantization

From fa15cca4e074724e3ca0aaeab18d80ec1fdb7a81 Mon Sep 17 00:00:00 2001
From: sayakpaul
Date: Sat, 11 Jan 2025 08:51:16 +0530
Subject: [PATCH 6/8] t -> T

---
docs/source/en/api/models/autoencoder_kl_hunyuan_video.md | 2 +-
docs/source/en/api/pipelines/hunyuan_video.md | 2 +-
2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/docs/source/en/api/models/autoencoder_kl_hunyuan_video.md b/docs/source/en/api/models/autoencoder_kl_hunyuan_video.md
index 7b96bef49480..33dff5b903cd 100644
--- a/docs/source/en/api/models/autoencoder_kl_hunyuan_video.md
+++ b/docs/source/en/api/models/autoencoder_kl_hunyuan_video.md
@@ -11,7 +11,7 @@ specific language governing permissions and limitations under the License. -->

# AutoencoderKLHunyuanVideo

-The 3D variational autoencoder (VAE) model with KL loss used in [HunyuanVideo](https://github.com/tencent/HunyuanVideo/), which was introduced in [HunyuanVideo: A Systematic Framework For Large Video Generative Models](https://huggingface.co/papers/2412.03603) by Tencent.
+The 3D variational autoencoder (VAE) model with KL loss used in [HunyuanVideo](https://github.com/Tencent/HunyuanVideo/), which was introduced in [HunyuanVideo: A Systematic Framework For Large Video Generative Models](https://huggingface.co/papers/2412.03603) by Tencent.

The model can be loaded with the following code snippet.

diff --git a/docs/source/en/api/pipelines/hunyuan_video.md b/docs/source/en/api/pipelines/hunyuan_video.md
index a3b00a3c6835..5148a97b754a 100644
--- a/docs/source/en/api/pipelines/hunyuan_video.md
+++ b/docs/source/en/api/pipelines/hunyuan_video.md
@@ -30,7 +30,7 @@ Recommendations for inference:

- VAE should be in `torch.float16`.
- `num_frames` should be of the form `4 * k + 1`, for example `49` or `129`.
- For smaller resolution videos, try lower values of `shift` (between `2.0` and `5.0`) in the [Scheduler](https://huggingface.co/docs/diffusers/main/en/api/schedulers/flow_match_euler_discrete#diffusers.FlowMatchEulerDiscreteScheduler.shift). 
For larger resolution videos, try higher values (between `7.0` and `12.0`). The default value is `7.0` for HunyuanVideo.
-- For more information about supported resolutions and other details, please refer to the original repository [here](https://github.com/tencent/HunyuanVideo/).
+- For more information about supported resolutions and other details, please refer to the original repository [here](https://github.com/Tencent/HunyuanVideo/).

## Quantization

From a7841141721dfb2bfac48db19f7b365ad7665d8b Mon Sep 17 00:00:00 2001
From: Sayak Paul
Date: Sat, 11 Jan 2025 08:55:07 +0530
Subject: [PATCH 7/8] Apply suggestions from code review

---
docs/source/en/using-diffusers/text-img2vid.md | 2 +-
src/diffusers/models/transformers/transformer_hunyuan_video.py | 2 +-
2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/docs/source/en/using-diffusers/text-img2vid.md b/docs/source/en/using-diffusers/text-img2vid.md
index b02c8b723803..92e740bb579d 100644
--- a/docs/source/en/using-diffusers/text-img2vid.md
+++ b/docs/source/en/using-diffusers/text-img2vid.md
@@ -70,7 +70,7 @@ export_to_video(video, "output.mp4", fps=8)

> [!TIP]
> HunyuanVideo is a 13B parameter model and requires a lot of memory. Refer to the HunyuanVideo [Quantization](../api/pipelines/hunyuan_video#quantization) guide to learn how to quantize the model. CogVideoX and LTX-Video are more lightweight options that can still generate high-quality videos.

-[HunyuanVideo](https://huggingface.co/hunyuanvideo-community/HunyuanVideo) features a dual-stream to single-stream diffusion transformer (DiT) for learning video and text tokens separately, and then concatenating the video and text tokens to combine their information. A single multimodal large language model (MLLM) serves as the text encoder, and videos are also spatio-temporally compressed with a 3D causal VAE.
+[HunyuanVideo](https://huggingface.co/tencent/HunyuanVideo) features a dual-stream to single-stream diffusion transformer (DiT) for learning video and text tokens separately, and then concatenating the video and text tokens to combine their information. A single multimodal large language model (MLLM) serves as the text encoder, and videos are also spatio-temporally compressed with a 3D causal VAE.

```py
import torch

diff --git a/src/diffusers/models/transformers/transformer_hunyuan_video.py b/src/diffusers/models/transformers/transformer_hunyuan_video.py
index 910d24c8b0f9..f9069411fbdb 100644
--- a/src/diffusers/models/transformers/transformer_hunyuan_video.py
+++ b/src/diffusers/models/transformers/transformer_hunyuan_video.py
@@ -505,7 +505,7 @@ def forward(
 class HunyuanVideoTransformer3DModel(ModelMixin, ConfigMixin, PeftAdapterMixin, FromOriginalModelMixin):
     r"""
     A Transformer model for video-like data used in
-    [HunyuanVideo](https://huggingface.co/hunyuanvideo-community/HunyuanVideo).
+    [HunyuanVideo](https://huggingface.co/tencent/HunyuanVideo).

Args: in_channels (`int`, defaults to `16`): From 99f86dfb2f509946a6a0c73ba25296d0af61a116 Mon Sep 17 00:00:00 2001 From: sayakpaul Date: Sat, 11 Jan 2025 08:55:32 +0530 Subject: [PATCH 8/8] fix --- src/diffusers/models/transformers/transformer_hunyuan_video.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/src/diffusers/models/transformers/transformer_hunyuan_video.py b/src/diffusers/models/transformers/transformer_hunyuan_video.py index f9069411fbdb..044f2048775f 100644 --- a/src/diffusers/models/transformers/transformer_hunyuan_video.py +++ b/src/diffusers/models/transformers/transformer_hunyuan_video.py @@ -504,8 +504,7 @@ def forward( class HunyuanVideoTransformer3DModel(ModelMixin, ConfigMixin, PeftAdapterMixin, FromOriginalModelMixin): r""" - A Transformer model for video-like data used in - [HunyuanVideo](https://huggingface.co/tencent/HunyuanVideo). + A Transformer model for video-like data used in [HunyuanVideo](https://huggingface.co/tencent/HunyuanVideo). Args: in_channels (`int`, defaults to `16`):
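Net effect of the series: the runnable examples load from the Diffusers-format `hunyuanvideo-community/HunyuanVideo` checkpoint, while prose and docstrings keep pointing at Tencent's original repositories. Stitching the patched snippets together, an end-to-end run looks roughly like the following (the prompt and output path are illustrative; `num_frames=61` satisfies the `4 * k + 1` rule from the inference recommendations):

```py
import torch
from diffusers import BitsAndBytesConfig as DiffusersBitsAndBytesConfig
from diffusers import HunyuanVideoPipeline, HunyuanVideoTransformer3DModel
from diffusers.utils import export_to_video

# 8-bit transformer in bfloat16, as in the patched quantization example
quant_config = DiffusersBitsAndBytesConfig(load_in_8bit=True)
transformer_8bit = HunyuanVideoTransformer3DModel.from_pretrained(
    "hunyuanvideo-community/HunyuanVideo",
    subfolder="transformer",
    quantization_config=quant_config,
    torch_dtype=torch.bfloat16,
)

# the rest of the pipeline (VAE included) stays in float16
pipeline = HunyuanVideoPipeline.from_pretrained(
    "hunyuanvideo-community/HunyuanVideo",
    transformer=transformer_8bit,
    torch_dtype=torch.float16,
    device_map="balanced",
)

prompt = "A cat walks on the grass, realistic style."  # illustrative prompt
video = pipeline(prompt=prompt, num_frames=61).frames[0]  # 61 = 4 * 15 + 1
export_to_video(video, "output.mp4", fps=15)
```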