From 6430b80a9a116f9dc62e5f3760d4c636ea507390 Mon Sep 17 00:00:00 2001 From: Sayak Paul Date: Fri, 10 Jan 2025 16:27:26 +0530 Subject: [PATCH 1/8] Update hunyuan_video.md to rectify the checkpoint id --- docs/source/en/api/pipelines/hunyuan_video.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/docs/source/en/api/pipelines/hunyuan_video.md b/docs/source/en/api/pipelines/hunyuan_video.md index df43c7f8568d..bbfccb2a8c35 100644 --- a/docs/source/en/api/pipelines/hunyuan_video.md +++ b/docs/source/en/api/pipelines/hunyuan_video.md @@ -45,14 +45,14 @@ from diffusers.utils import export_to_video quant_config = DiffusersBitsAndBytesConfig(load_in_8bit=True) transformer_8bit = HunyuanVideoTransformer3DModel.from_pretrained( - "tencent/HunyuanVideo", + "hunyuanvideo-community/HunyuanVideo", subfolder="transformer", quantization_config=quant_config, torch_dtype=torch.float16, ) pipeline = HunyuanVideoPipeline.from_pretrained( - "tencent/HunyuanVideo", + "hunyuanvideo-community/HunyuanVideo", transformer=transformer_8bit, torch_dtype=torch.float16, device_map="balanced", From 8ca260dc1b86faf20eb6009ef6b7d95c4bdca4da Mon Sep 17 00:00:00 2001 From: sayakpaul Date: Fri, 10 Jan 2025 20:12:14 +0530 Subject: [PATCH 2/8] bfloat16 --- docs/source/en/api/pipelines/hunyuan_video.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/source/en/api/pipelines/hunyuan_video.md b/docs/source/en/api/pipelines/hunyuan_video.md index bbfccb2a8c35..25828a2d05c4 100644 --- a/docs/source/en/api/pipelines/hunyuan_video.md +++ b/docs/source/en/api/pipelines/hunyuan_video.md @@ -48,7 +48,7 @@ transformer_8bit = HunyuanVideoTransformer3DModel.from_pretrained( "hunyuanvideo-community/HunyuanVideo", subfolder="transformer", quantization_config=quant_config, - torch_dtype=torch.float16, + torch_dtype=torch.bfloat16, ) pipeline = HunyuanVideoPipeline.from_pretrained( From 86573404935ef1abd948516ba5a2122785de1faf Mon Sep 17 00:00:00 2001 From: sayakpaul Date: Sat, 11 Jan 2025 08:26:09 +0530 Subject: [PATCH 3/8] more fixes --- docs/source/en/api/models/autoencoder_kl_hunyuan_video.md | 2 +- docs/source/en/api/pipelines/hunyuan_video.md | 4 ++-- docs/source/en/using-diffusers/text-img2vid.md | 6 +++--- .../models/transformers/transformer_hunyuan_video.py | 3 ++- 4 files changed, 8 insertions(+), 7 deletions(-) diff --git a/docs/source/en/api/models/autoencoder_kl_hunyuan_video.md b/docs/source/en/api/models/autoencoder_kl_hunyuan_video.md index 33dff5b903cd..0d43bab2e283 100644 --- a/docs/source/en/api/models/autoencoder_kl_hunyuan_video.md +++ b/docs/source/en/api/models/autoencoder_kl_hunyuan_video.md @@ -11,7 +11,7 @@ specific language governing permissions and limitations under the License. --> # AutoencoderKLHunyuanVideo -The 3D variational autoencoder (VAE) model with KL loss used in [HunyuanVideo](https://github.com/Tencent/HunyuanVideo/), which was introduced in [HunyuanVideo: A Systematic Framework For Large Video Generative Models](https://huggingface.co/papers/2412.03603) by Tencent. +The 3D variational autoencoder (VAE) model with KL loss used in [HunyuanVideo](https://github.com/hunyuanvideo-community/HunyuanVideo/), which was introduced in [HunyuanVideo: A Systematic Framework For Large Video Generative Models](https://huggingface.co/papers/2412.03603) by Tencent. The model can be loaded with the following code snippet. 
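The snippet that context line refers to falls outside the hunk; as a rough sketch (assuming the `AutoencoderKLHunyuanVideo` class named in the heading and the `hunyuanvideo-community/HunyuanVideo` checkpoint id used by the code examples in this series), it loads the VAE like so:

```py
import torch
from diffusers import AutoencoderKLHunyuanVideo

# load only the 3D causal VAE, kept in float16 per the inference recommendations
vae = AutoencoderKLHunyuanVideo.from_pretrained(
    "hunyuanvideo-community/HunyuanVideo", subfolder="vae", torch_dtype=torch.float16
)
```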
diff --git a/docs/source/en/api/pipelines/hunyuan_video.md b/docs/source/en/api/pipelines/hunyuan_video.md index 25828a2d05c4..ec640526a17d 100644 --- a/docs/source/en/api/pipelines/hunyuan_video.md +++ b/docs/source/en/api/pipelines/hunyuan_video.md @@ -16,7 +16,7 @@ [HunyuanVideo](https://www.arxiv.org/abs/2412.03603) by Tencent. -*Recent advancements in video generation have significantly impacted daily life for both individuals and industries. However, the leading video generation models remain closed-source, resulting in a notable performance gap between industry capabilities and those available to the public. In this report, we introduce HunyuanVideo, an innovative open-source video foundation model that demonstrates performance in video generation comparable to, or even surpassing, that of leading closed-source models. HunyuanVideo encompasses a comprehensive framework that integrates several key elements, including data curation, advanced architectural design, progressive model scaling and training, and an efficient infrastructure tailored for large-scale model training and inference. As a result, we successfully trained a video generative model with over 13 billion parameters, making it the largest among all open-source models. We conducted extensive experiments and implemented a series of targeted designs to ensure high visual quality, motion dynamics, text-video alignment, and advanced filming techniques. According to evaluations by professionals, HunyuanVideo outperforms previous state-of-the-art models, including Runway Gen-3, Luma 1.6, and three top-performing Chinese video generative models. By releasing the code for the foundation model and its applications, we aim to bridge the gap between closed-source and open-source communities. This initiative will empower individuals within the community to experiment with their ideas, fostering a more dynamic and vibrant video generation ecosystem. The code is publicly available at [this https URL](https://github.com/Tencent/HunyuanVideo).* +*Recent advancements in video generation have significantly impacted daily life for both individuals and industries. However, the leading video generation models remain closed-source, resulting in a notable performance gap between industry capabilities and those available to the public. In this report, we introduce HunyuanVideo, an innovative open-source video foundation model that demonstrates performance in video generation comparable to, or even surpassing, that of leading closed-source models. HunyuanVideo encompasses a comprehensive framework that integrates several key elements, including data curation, advanced architectural design, progressive model scaling and training, and an efficient infrastructure tailored for large-scale model training and inference. As a result, we successfully trained a video generative model with over 13 billion parameters, making it the largest among all open-source models. We conducted extensive experiments and implemented a series of targeted designs to ensure high visual quality, motion dynamics, text-video alignment, and advanced filming techniques. According to evaluations by professionals, HunyuanVideo outperforms previous state-of-the-art models, including Runway Gen-3, Luma 1.6, and three top-performing Chinese video generative models. By releasing the code for the foundation model and its applications, we aim to bridge the gap between closed-source and open-source communities. 
This initiative will empower individuals within the community to experiment with their ideas, fostering a more dynamic and vibrant video generation ecosystem. The code is publicly available at [this https URL](https://github.com/hunyuanvideo-community/HunyuanVideo).*


@@ -30,7 +30,7 @@ Recommendations for inference:

- VAE should be in `torch.float16`.
- `num_frames` should be of the form `4 * k + 1`, for example `49` or `129`.
- For smaller resolution videos, try lower values of `shift` (between `2.0` and `5.0`) in the [Scheduler](https://huggingface.co/docs/diffusers/main/en/api/schedulers/flow_match_euler_discrete#diffusers.FlowMatchEulerDiscreteScheduler.shift). For larger resolution videos, try higher values (between `7.0` and `12.0`). The default value is `7.0` for HunyuanVideo.
-- For more information about supported resolutions and other details, please refer to the original repository [here](https://github.com/Tencent/HunyuanVideo/).
+- For more information about supported resolutions and other details, please refer to the original repository [here](https://github.com/hunyuanvideo-community/HunyuanVideo/).

## Quantization

diff --git a/docs/source/en/using-diffusers/text-img2vid.md b/docs/source/en/using-diffusers/text-img2vid.md
index 7b27a258f247..b02c8b723803 100644
--- a/docs/source/en/using-diffusers/text-img2vid.md
+++ b/docs/source/en/using-diffusers/text-img2vid.md
@@ -70,7 +70,7 @@ export_to_video(video, "output.mp4", fps=8)

> [!TIP]
> HunyuanVideo is a 13B parameter model and requires a lot of memory. Refer to the HunyuanVideo [Quantization](../api/pipelines/hunyuan_video#quantization) guide to learn how to quantize the model. CogVideoX and LTX-Video are more lightweight options that can still generate high-quality videos.

-[HunyuanVideo](https://huggingface.co/tencent/HunyuanVideo) features a dual-stream to single-stream diffusion transformer (DiT) for learning video and text tokens separately, and then concatenating the video and text tokens to combine their information. A single multimodal large language model (MLLM) serves as the text encoder, and videos are also spatio-temporally compressed with a 3D causal VAE.
+[HunyuanVideo](https://huggingface.co/hunyuanvideo-community/HunyuanVideo) features a dual-stream to single-stream diffusion transformer (DiT) for learning video and text tokens separately, and then concatenating the video and text tokens to combine their information. A single multimodal large language model (MLLM) serves as the text encoder, and videos are also spatio-temporally compressed with a 3D causal VAE.
```py import torch @@ -78,10 +78,10 @@ from diffusers import HunyuanVideoPipeline, HunyuanVideoTransformer3DModel from diffusers.utils import export_to_video transformer = HunyuanVideoTransformer3DModel.from_pretrained( - "tencent/HunyuanVideo", subfolder="transformer", torch_dtype=torch.bfloat16 + "hunyuanvideo-community/HunyuanVideo", subfolder="transformer", torch_dtype=torch.bfloat16 ) pipe = HunyuanVideoPipeline.from_pretrained( - "tencent/HunyuanVideo", transformer=transformer, torch_dtype=torch.float16 + "hunyuanvideo-community/HunyuanVideo", transformer=transformer, torch_dtype=torch.float16 ) # reduce memory requirements diff --git a/src/diffusers/models/transformers/transformer_hunyuan_video.py b/src/diffusers/models/transformers/transformer_hunyuan_video.py index 044f2048775f..910d24c8b0f9 100644 --- a/src/diffusers/models/transformers/transformer_hunyuan_video.py +++ b/src/diffusers/models/transformers/transformer_hunyuan_video.py @@ -504,7 +504,8 @@ def forward( class HunyuanVideoTransformer3DModel(ModelMixin, ConfigMixin, PeftAdapterMixin, FromOriginalModelMixin): r""" - A Transformer model for video-like data used in [HunyuanVideo](https://huggingface.co/tencent/HunyuanVideo). + A Transformer model for video-like data used in + [HunyuanVideo](https://huggingface.co/hunyuanvideo-community/HunyuanVideo). Args: in_channels (`int`, defaults to `16`): From b07d5883a2f042ed44b8a1dcb7ff5046ba6751c4 Mon Sep 17 00:00:00 2001 From: sayakpaul Date: Sat, 11 Jan 2025 08:48:01 +0530 Subject: [PATCH 4/8] don't update the checkpoint ids. --- docs/source/en/api/models/autoencoder_kl_hunyuan_video.md | 2 +- docs/source/en/api/pipelines/hunyuan_video.md | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/docs/source/en/api/models/autoencoder_kl_hunyuan_video.md b/docs/source/en/api/models/autoencoder_kl_hunyuan_video.md index 0d43bab2e283..7b96bef49480 100644 --- a/docs/source/en/api/models/autoencoder_kl_hunyuan_video.md +++ b/docs/source/en/api/models/autoencoder_kl_hunyuan_video.md @@ -11,7 +11,7 @@ specific language governing permissions and limitations under the License. --> # AutoencoderKLHunyuanVideo -The 3D variational autoencoder (VAE) model with KL loss used in [HunyuanVideo](https://github.com/hunyuanvideo-community/HunyuanVideo/), which was introduced in [HunyuanVideo: A Systematic Framework For Large Video Generative Models](https://huggingface.co/papers/2412.03603) by Tencent. +The 3D variational autoencoder (VAE) model with KL loss used in [HunyuanVideo](https://github.com/tencent/HunyuanVideo/), which was introduced in [HunyuanVideo: A Systematic Framework For Large Video Generative Models](https://huggingface.co/papers/2412.03603) by Tencent. The model can be loaded with the following code snippet. diff --git a/docs/source/en/api/pipelines/hunyuan_video.md b/docs/source/en/api/pipelines/hunyuan_video.md index ec640526a17d..6327f8cb7344 100644 --- a/docs/source/en/api/pipelines/hunyuan_video.md +++ b/docs/source/en/api/pipelines/hunyuan_video.md @@ -16,7 +16,7 @@ [HunyuanVideo](https://www.arxiv.org/abs/2412.03603) by Tencent. -*Recent advancements in video generation have significantly impacted daily life for both individuals and industries. However, the leading video generation models remain closed-source, resulting in a notable performance gap between industry capabilities and those available to the public. 
In this report, we introduce HunyuanVideo, an innovative open-source video foundation model that demonstrates performance in video generation comparable to, or even surpassing, that of leading closed-source models. HunyuanVideo encompasses a comprehensive framework that integrates several key elements, including data curation, advanced architectural design, progressive model scaling and training, and an efficient infrastructure tailored for large-scale model training and inference. As a result, we successfully trained a video generative model with over 13 billion parameters, making it the largest among all open-source models. We conducted extensive experiments and implemented a series of targeted designs to ensure high visual quality, motion dynamics, text-video alignment, and advanced filming techniques. According to evaluations by professionals, HunyuanVideo outperforms previous state-of-the-art models, including Runway Gen-3, Luma 1.6, and three top-performing Chinese video generative models. By releasing the code for the foundation model and its applications, we aim to bridge the gap between closed-source and open-source communities. This initiative will empower individuals within the community to experiment with their ideas, fostering a more dynamic and vibrant video generation ecosystem. The code is publicly available at [this https URL](https://github.com/hunyuanvideo-community/HunyuanVideo).* +*Recent advancements in video generation have significantly impacted daily life for both individuals and industries. However, the leading video generation models remain closed-source, resulting in a notable performance gap between industry capabilities and those available to the public. In this report, we introduce HunyuanVideo, an innovative open-source video foundation model that demonstrates performance in video generation comparable to, or even surpassing, that of leading closed-source models. HunyuanVideo encompasses a comprehensive framework that integrates several key elements, including data curation, advanced architectural design, progressive model scaling and training, and an efficient infrastructure tailored for large-scale model training and inference. As a result, we successfully trained a video generative model with over 13 billion parameters, making it the largest among all open-source models. We conducted extensive experiments and implemented a series of targeted designs to ensure high visual quality, motion dynamics, text-video alignment, and advanced filming techniques. According to evaluations by professionals, HunyuanVideo outperforms previous state-of-the-art models, including Runway Gen-3, Luma 1.6, and three top-performing Chinese video generative models. By releasing the code for the foundation model and its applications, we aim to bridge the gap between closed-source and open-source communities. This initiative will empower individuals within the community to experiment with their ideas, fostering a more dynamic and vibrant video generation ecosystem. 
The code is publicly available at [this https URL](https://github.com/tencent/HunyuanVideo).*

From 9c7969cf22c810733adb3677cd154e6680237016 Mon Sep 17 00:00:00 2001
From: sayakpaul
Date: Sat, 11 Jan 2025 08:49:29 +0530
Subject: [PATCH 5/8] update

---
docs/source/en/api/pipelines/hunyuan_video.md | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/docs/source/en/api/pipelines/hunyuan_video.md b/docs/source/en/api/pipelines/hunyuan_video.md
index 6327f8cb7344..a3b00a3c6835 100644
--- a/docs/source/en/api/pipelines/hunyuan_video.md
+++ b/docs/source/en/api/pipelines/hunyuan_video.md
@@ -30,7 +30,7 @@ Recommendations for inference:

- VAE should be in `torch.float16`.
- `num_frames` should be of the form `4 * k + 1`, for example `49` or `129`.
- For smaller resolution videos, try lower values of `shift` (between `2.0` and `5.0`) in the [Scheduler](https://huggingface.co/docs/diffusers/main/en/api/schedulers/flow_match_euler_discrete#diffusers.FlowMatchEulerDiscreteScheduler.shift). For larger resolution videos, try higher values (between `7.0` and `12.0`). The default value is `7.0` for HunyuanVideo.
-- For more information about supported resolutions and other details, please refer to the original repository [here](https://github.com/hunyuanvideo-community/HunyuanVideo/).
+- For more information about supported resolutions and other details, please refer to the original repository [here](https://github.com/tencent/HunyuanVideo/).

## Quantization

From fa15cca4e074724e3ca0aaeab18d80ec1fdb7a81 Mon Sep 17 00:00:00 2001
From: sayakpaul
Date: Sat, 11 Jan 2025 08:51:16 +0530
Subject: [PATCH 6/8] t -> T

---
docs/source/en/api/models/autoencoder_kl_hunyuan_video.md | 2 +-
docs/source/en/api/pipelines/hunyuan_video.md | 2 +-
2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/docs/source/en/api/models/autoencoder_kl_hunyuan_video.md b/docs/source/en/api/models/autoencoder_kl_hunyuan_video.md
index 7b96bef49480..33dff5b903cd 100644
--- a/docs/source/en/api/models/autoencoder_kl_hunyuan_video.md
+++ b/docs/source/en/api/models/autoencoder_kl_hunyuan_video.md
@@ -11,7 +11,7 @@ specific language governing permissions and limitations under the License. -->

# AutoencoderKLHunyuanVideo

-The 3D variational autoencoder (VAE) model with KL loss used in [HunyuanVideo](https://github.com/tencent/HunyuanVideo/), which was introduced in [HunyuanVideo: A Systematic Framework For Large Video Generative Models](https://huggingface.co/papers/2412.03603) by Tencent.
+The 3D variational autoencoder (VAE) model with KL loss used in [HunyuanVideo](https://github.com/Tencent/HunyuanVideo/), which was introduced in [HunyuanVideo: A Systematic Framework For Large Video Generative Models](https://huggingface.co/papers/2412.03603) by Tencent.

The model can be loaded with the following code snippet.

diff --git a/docs/source/en/api/pipelines/hunyuan_video.md b/docs/source/en/api/pipelines/hunyuan_video.md
index a3b00a3c6835..5148a97b754a 100644
--- a/docs/source/en/api/pipelines/hunyuan_video.md
+++ b/docs/source/en/api/pipelines/hunyuan_video.md
@@ -30,7 +30,7 @@ Recommendations for inference:

- VAE should be in `torch.float16`.
- `num_frames` should be of the form `4 * k + 1`, for example `49` or `129`.
- For smaller resolution videos, try lower values of `shift` (between `2.0` and `5.0`) in the [Scheduler](https://huggingface.co/docs/diffusers/main/en/api/schedulers/flow_match_euler_discrete#diffusers.FlowMatchEulerDiscreteScheduler.shift). 
For larger resolution videos, try higher values (between `7.0` and `12.0`). The default value is `7.0` for HunyuanVideo.
-- For more information about supported resolutions and other details, please refer to the original repository [here](https://github.com/tencent/HunyuanVideo/).
+- For more information about supported resolutions and other details, please refer to the original repository [here](https://github.com/Tencent/HunyuanVideo/).

## Quantization

From a7841141721dfb2bfac48db19f7b365ad7665d8b Mon Sep 17 00:00:00 2001
From: Sayak Paul
Date: Sat, 11 Jan 2025 08:55:07 +0530
Subject: [PATCH 7/8] Apply suggestions from code review

---
docs/source/en/using-diffusers/text-img2vid.md | 2 +-
src/diffusers/models/transformers/transformer_hunyuan_video.py | 2 +-
2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/docs/source/en/using-diffusers/text-img2vid.md b/docs/source/en/using-diffusers/text-img2vid.md
index b02c8b723803..92e740bb579d 100644
--- a/docs/source/en/using-diffusers/text-img2vid.md
+++ b/docs/source/en/using-diffusers/text-img2vid.md
@@ -70,7 +70,7 @@ export_to_video(video, "output.mp4", fps=8)

> [!TIP]
> HunyuanVideo is a 13B parameter model and requires a lot of memory. Refer to the HunyuanVideo [Quantization](../api/pipelines/hunyuan_video#quantization) guide to learn how to quantize the model. CogVideoX and LTX-Video are more lightweight options that can still generate high-quality videos.

-[HunyuanVideo](https://huggingface.co/hunyuanvideo-community/HunyuanVideo) features a dual-stream to single-stream diffusion transformer (DiT) for learning video and text tokens separately, and then concatenating the video and text tokens to combine their information. A single multimodal large language model (MLLM) serves as the text encoder, and videos are also spatio-temporally compressed with a 3D causal VAE.
+[HunyuanVideo](https://huggingface.co/tencent/HunyuanVideo) features a dual-stream to single-stream diffusion transformer (DiT) for learning video and text tokens separately, and then concatenating the video and text tokens to combine their information. A single multimodal large language model (MLLM) serves as the text encoder, and videos are also spatio-temporally compressed with a 3D causal VAE.

```py
import torch

diff --git a/src/diffusers/models/transformers/transformer_hunyuan_video.py b/src/diffusers/models/transformers/transformer_hunyuan_video.py
index 910d24c8b0f9..f9069411fbdb 100644
--- a/src/diffusers/models/transformers/transformer_hunyuan_video.py
+++ b/src/diffusers/models/transformers/transformer_hunyuan_video.py
@@ -505,7 +505,7 @@ def forward(
 class HunyuanVideoTransformer3DModel(ModelMixin, ConfigMixin, PeftAdapterMixin, FromOriginalModelMixin):
     r"""
     A Transformer model for video-like data used in
-    [HunyuanVideo](https://huggingface.co/hunyuanvideo-community/HunyuanVideo).
+    [HunyuanVideo](https://huggingface.co/tencent/HunyuanVideo).

Args: in_channels (`int`, defaults to `16`): From 99f86dfb2f509946a6a0c73ba25296d0af61a116 Mon Sep 17 00:00:00 2001 From: sayakpaul Date: Sat, 11 Jan 2025 08:55:32 +0530 Subject: [PATCH 8/8] fix --- src/diffusers/models/transformers/transformer_hunyuan_video.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/src/diffusers/models/transformers/transformer_hunyuan_video.py b/src/diffusers/models/transformers/transformer_hunyuan_video.py index f9069411fbdb..044f2048775f 100644 --- a/src/diffusers/models/transformers/transformer_hunyuan_video.py +++ b/src/diffusers/models/transformers/transformer_hunyuan_video.py @@ -504,8 +504,7 @@ def forward( class HunyuanVideoTransformer3DModel(ModelMixin, ConfigMixin, PeftAdapterMixin, FromOriginalModelMixin): r""" - A Transformer model for video-like data used in - [HunyuanVideo](https://huggingface.co/tencent/HunyuanVideo). + A Transformer model for video-like data used in [HunyuanVideo](https://huggingface.co/tencent/HunyuanVideo). Args: in_channels (`int`, defaults to `16`):
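Net effect of the series: the runnable examples load from the Diffusers-format `hunyuanvideo-community/HunyuanVideo` checkpoint, while prose and docstrings keep pointing at Tencent's original repositories. Stitching the patched snippets together, an end-to-end run looks roughly like the following (the prompt and output path are illustrative; `num_frames=61` satisfies the `4 * k + 1` rule from the inference recommendations):

```py
import torch
from diffusers import BitsAndBytesConfig as DiffusersBitsAndBytesConfig
from diffusers import HunyuanVideoPipeline, HunyuanVideoTransformer3DModel
from diffusers.utils import export_to_video

# 8-bit transformer in bfloat16, as in the patched quantization example
quant_config = DiffusersBitsAndBytesConfig(load_in_8bit=True)
transformer_8bit = HunyuanVideoTransformer3DModel.from_pretrained(
    "hunyuanvideo-community/HunyuanVideo",
    subfolder="transformer",
    quantization_config=quant_config,
    torch_dtype=torch.bfloat16,
)

# the rest of the pipeline (VAE included) stays in float16
pipeline = HunyuanVideoPipeline.from_pretrained(
    "hunyuanvideo-community/HunyuanVideo",
    transformer=transformer_8bit,
    torch_dtype=torch.float16,
    device_map="balanced",
)

prompt = "A cat walks on the grass, realistic style."  # illustrative prompt
video = pipeline(prompt=prompt, num_frames=61).frames[0]  # 61 = 4 * 15 + 1
export_to_video(video, "output.mp4", fps=15)
```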