---

📌 **This notebook has been updated in [jhj0517/finetuning-notebooks](https://github.com/jhj0517/finetuning-notebooks) repository!**

## Version : 1.0.0
---

In [None]:
#@title #(Optional) Check GPU

#@markdown To train a Hunyuan Video LoRA, 24GB of VRAM is recommended.
#@markdown <br>You can check your GPU setup before you start.
# Show the GPU(s) visible to this runtime via the NVIDIA CLI tool.
!nvidia-smi

In [None]:
#@title #1. Install Dependencies
#@markdown This notebook is powered by https://github.com/tdrussell/diffusion-pipe
!git clone --recurse-submodules https://github.com/tdrussell/diffusion-pipe
%cd diffusion-pipe

!pip install deepspeed
!pip install datasets
!pip install torch-optimi
!pip install bitsandbytes
!pip install av
!pip install loguru
!pip install flash-attn

In [None]:
#@title # 2. (Optional) Mount Google Drive

#@markdown It's not mandatory, but it's recommended to mount Google Drive and use a Google Drive path for your training dataset.

#@markdown The dataset should have the following structure.

#@markdown Each video file should have a corresponding text file (`.txt`) with the same name. <br>
#@markdown **Each video must have a precise number of frames, matching what you will define later, at 24 frames per second.**

#@markdown The text file typically contains prompts associated with the video.


#@markdown ### Example Dataset Structure:
#@markdown ```
#@markdown your-dataset/
#@markdown ├── a (1).mp4         # Video file
#@markdown ├── a (1).txt         # Corresponding prompt for a (1).mp4
#@markdown ├── a (2).mp4         # Another video file
#@markdown ├── a (2).txt         # Corresponding prompt for a (2).mp4
#@markdown ```

from google.colab import drive
import os  # not used in this cell; later cells call os.path / os.makedirs
# Mount Google Drive at /content/drive; the default paths in the later cells
# all live under this mount point.
drive.mount('/content/drive')

In [None]:
#@title # 3. (Optional) Register Huggingface Token To Download Base Model

#@markdown This cell will download base models. If you don't already have the base model files in your google drive, run this.

#@markdown You need Huggingface token (Read permission) to run this.

#@markdown Get your token from https://huggingface.co/settings/tokens and register it in Colab's secrets as **`HF_TOKEN`**; you can then use it in any notebook. ('Read' permission is enough.)

#@markdown To register secrets in colab, click on the key-shaped icon in the left panel and enter your **`HF_TOKEN`** like this:

#@markdown ![image](https://miro.medium.com/v2/resize:fit:720/format:webp/1*kqKOGVkupS_R2FQ_049xLw.png)


#@markdown models will be downloaded from
#@markdown - hunyuan: https://huggingface.co/Kijai/HunyuanVideo_comfy/tree/main
#@markdown - clip: https://huggingface.co/openai/clip-vit-large-patch14
#@markdown - llm: https://huggingface.co/Kijai/llava-llama-3-8b-text-encoder-tokenizer

import huggingface_hub

# Set params
BASE_MODELS_DIR_PATH = "/content/drive/MyDrive/finetunings/hunyuan/base_models" # @param {type:"string"}
HUNYUAN_VIDEO = "hunyuan_video_720_cfgdistill_fp8_e4m3fn.safetensors" #@param ["hunyuan_video_720_cfgdistill_fp8_e4m3fn.safetensors", "hunyuan_video_720_cfgdistill_bf16.safetensors", "hunyuan_video_FastVideo_720_fp8_e4m3fn.safetensors"]
VAE = "hunyuan_video_vae_bf16.safetensors" #@param ["hunyuan_video_vae_bf16.safetensors", "hunyuan_video_vae_fp32.safetensors"]
CLIP = "openai/clip-vit-large-patch14" #@param ["openai/clip-vit-large-patch14"]
LLM = "Kijai/llava-llama-3-8b-text-encoder-tokenizer" #@param ["Kijai/llava-llama-3-8b-text-encoder-tokenizer"]

# Initialize dir paths
HUNYUAN_MODELS_DIR = os.path.join(BASE_MODELS_DIR_PATH, "hunyuan")
os.makedirs(HUNYUAN_MODELS_DIR, exist_ok=True)

CLIP_MODEL_DIR = os.path.join(BASE_MODELS_DIR_PATH, "clip")
os.makedirs(CLIP_MODEL_DIR, exist_ok=True)

LLM_MODEL_DIR = os.path.join(BASE_MODELS_DIR_PATH, "llm")
os.makedirs(LLM_MODEL_DIR, exist_ok=True)

HUNYUAN_VIDEO_URL = {
    "hunyuan_video_720_cfgdistill_fp8_e4m3fn.safetensors": "https://huggingface.co/Kijai/HunyuanVideo_comfy/resolve/main/hunyuan_video_720_cfgdistill_fp8_e4m3fn.safetensors",
    "hunyuan_video_720_cfgdistill_bf16.safetensors": "https://huggingface.co/Kijai/HunyuanVideo_comfy/resolve/main/hunyuan_video_720_cfgdistill_bf16.safetensors",
    "hunyuan_video_FastVideo_720_fp8_e4m3fn.safetensors": "https://huggingface.co/Kijai/HunyuanVideo_comfy/resolve/main/hunyuan_video_FastVideo_720_fp8_e4m3fn.safetensors",
}

HUNYUAN_VIDEO_VAE_URL = {
    "hunyuan_video_vae_bf16.safetensors": "https://huggingface.co/Kijai/HunyuanVideo_comfy/resolve/main/hunyuan_video_vae_bf16.safetensors",
    "hunyuan_video_vae_fp32.safetensors": "https://huggingface.co/Kijai/HunyuanVideo_comfy/resolve/main/hunyuan_video_vae_fp32.safetensors",
}

# Download models
video_url, vae_url = HUNYUAN_VIDEO_URL[HUNYUAN_VIDEO], HUNYUAN_VIDEO_VAE_URL[VAE]
clip_repo_id, llm_repo_id = CLIP, LLM

!wget {video_url} -P {HUNYUAN_MODELS_DIR}
!wget {vae_url} -P {HUNYUAN_MODELS_DIR}

huggingface_hub.snapshot_download(
    clip_repo_id,
    local_dir = CLIP_MODEL_DIR,
)
huggingface_hub.snapshot_download(
    llm_repo_id,
    local_dir = LLM_MODEL_DIR,
)


In [None]:
#@title # 4. Train with Parameters
import os
import sys
import toml

#@markdown Check this if you intend to resume LoRA training from a checkpoint.
RESUME_FROM_CHECKPOINT = False #@param {type:"boolean"}

#@markdown ## Paths Configuration
OUTPUT_LORA_NAME = "My-Hunyuan-Lora-V1" # @param {type:"string"}
OUTPUT_LORA_DIR_PATH = "/content/drive/MyDrive/finetunings/outputs"  # @param {type:"string"}
BASE_MODELS_DIR_PATH = "/content/drive/MyDrive/finetunings/hunyuan/base_models" # @param {type:"string"}
DATASET_PATH = "/content/drive/MyDrive/finetunings/dataset/dog" # @param {type:"string"}


# Every LoRA is written to its own sub-folder, named after the LoRA, inside the outputs folder.
OUTPUT_LORA_DIR_PATH = os.path.join(OUTPUT_LORA_DIR_PATH, OUTPUT_LORA_NAME)
# The transformer and VAE weights live side by side in the "hunyuan" sub-folder.
transformer_dir_path = vae_dir_path = os.path.join(BASE_MODELS_DIR_PATH, "hunyuan")
# The CLIP and LLM text encoders each have a sub-folder of their own.
clip_path, llm_path = (
    os.path.join(BASE_MODELS_DIR_PATH, sub_dir) for sub_dir in ("clip", "llm")
)

#@markdown ## Dataset Configuration
#@markdown - **`frame_buckets`** is the list of frame counts in your dataset.
#@markdown <br>For example, if your dataset contains videos of 30 and 60 frames, then use : [30, 60]
#@markdown <br>If your dataset also contains images, then use : [1, 30, 60]
#@markdown - **`resolutions`** is the list of resolutions to which **`diffusion-pipe`** will resize your dataset.
#@markdown <br>**`diffusion-pipe`** is smart enough to handle resizing of non-square data such as 1:2 or 2:1 images, etc.
#@markdown <br>If you have less than 24GB of VRAM, just set it to 512, then increase it according to your device.
## Frame Buckets Settings
frame_buckets = [1]  # @param {type:"raw"}
# You can use 1024 if you have more than 24 GB of VRAM.
resolutions = [512]  # @param {type:"raw"}
## Aspect Ratio Bucketing Settings
enable_ar_bucket = True  # @param {type:"boolean"}
min_ar = 0.5  # @param {type:"number"}
max_ar = 2.0  # @param {type:"number"}
num_ar_buckets = 7  # @param {type:"integer"}
# Written to dataset.toml as `num_repeats` for the dataset directory.
# Not exposed as a form field; reduce as necessary.
num_repeats = 5

# Write dataset.toml
# The single dataset directory entry; it is nested under "directory" below.
dataset_directories = [
    {"path": DATASET_PATH, "num_repeats": num_repeats},
]

# Key order is kept as-is because it determines the layout of the emitted file.
dataset_config = {
    "resolutions": resolutions,
    "frame_buckets": frame_buckets,
    "enable_ar_bucket": enable_ar_bucket,
    "min_ar": min_ar,
    "max_ar": max_ar,
    "num_ar_buckets": num_ar_buckets,
    "directory": dataset_directories,
}

# Both folders must exist before anything is written into them.
# NOTE(review): the config file is stored inside the dataset folder itself;
# confirm the trainer ignores non-media files found there.
for required_dir in (OUTPUT_LORA_DIR_PATH, DATASET_PATH):
    os.makedirs(required_dir, exist_ok=True)

dataset_config_file_path = os.path.join(DATASET_PATH, "dataset.toml")
with open(dataset_config_file_path, "w") as dataset_toml:
    toml.dump(dataset_config, dataset_toml)
print(f"dataset.toml is saved to {dataset_config_file_path}")

#@markdown ## Base Model Configuration
BASE_HUNYUAN_MODEL_NAME = "hunyuan_video_720_cfgdistill_fp8_e4m3fn.safetensors" #@param ["hunyuan_video_720_cfgdistill_fp8_e4m3fn.safetensors", "hunyuan_video_720_cfgdistill_bf16.safetensors", "hunyuan_video_FastVideo_720_fp8_e4m3fn.safetensors"]
BASE_VAE_MODEL_NAME = "hunyuan_video_vae_bf16.safetensors" #@param ["hunyuan_video_vae_bf16.safetensors", "hunyuan_video_vae_fp32.safetensors"]

# Full paths of the selected transformer and VAE weight files inside the base-models folder.
transformer_path = os.path.join(transformer_dir_path, BASE_HUNYUAN_MODEL_NAME)
vae_path = os.path.join(vae_dir_path, BASE_VAE_MODEL_NAME)

# Fixed value; not exposed as a form field.
model_type = 'hunyuan-video'
dtype = 'bfloat16'  # @param {type:"string"}
transformer_dtype = 'float8'  # @param {type:"string"}
timestep_sample_method = 'logit_normal'  # @param {type:"string"}

#@markdown ## Training Settings
epochs = 50  # @param {type:"integer"}
micro_batch_size_per_gpu = 1  # @param {type:"integer"}
pipeline_stages = 1  # @param {type:"integer"}
gradient_accumulation_steps = 4  # @param {type:"integer"}
gradient_clipping = 1.0  # @param {type:"number"}
warmup_steps = 100  # @param {type:"integer"}

#@markdown ## Eval Settings
eval_every_n_epochs = 5  # @param {type:"integer"}
eval_before_first_step = True  # @param {type:"boolean"}
eval_micro_batch_size_per_gpu = 1  # @param {type:"integer"}
eval_gradient_accumulation_steps = 1  # @param {type:"integer"}

#@markdown ## Lora Settings
# Fixed value; not exposed as a form field.
adapter_type = 'lora'
rank = 64  # @param {type:"integer"}
adapter_dtype = 'bfloat16'  # @param {type:"string"}

#@markdown ## Optimizer Settings
optimizer_type = 'adamw_optimi'  # @param {type:"string"}
lr = 5e-5  # @param {type:"number"}
betas = [0.9, 0.99]  # @param {type:"raw"}
weight_decay = 0.02  # @param {type:"number"}
eps = 1e-8  # @param {type:"number"}

#@markdown ## Misc Settings
save_every_n_epochs = 5  # @param {type:"integer"}
checkpoint_every_n_minutes = 30  # @param {type:"integer"}
activation_checkpointing = True  # @param {type:"boolean"}
partition_method = 'parameters'  # @param {type:"string"}
save_dtype = 'bfloat16'  # @param {type:"string"}
caching_batch_size = 1  # @param {type:"integer"}
steps_per_print = 1  # @param {type:"integer"}
video_clip_mode = 'single_middle'  # @param {type:"string"}


# Write config.toml
# Collect every setting chosen above into the training config: top-level keys
# are general training/eval/misc options, while "model", "adapter" and
# "optimizer" become their own tables in the TOML file.
train_config = {
    "output_dir": OUTPUT_LORA_DIR_PATH,
    # Path of the dataset.toml written earlier in this cell.
    "dataset": dataset_config_file_path,

    # Training Settings
    "epochs": epochs,
    "micro_batch_size_per_gpu": micro_batch_size_per_gpu,
    "pipeline_stages": pipeline_stages,
    "gradient_accumulation_steps": gradient_accumulation_steps,
    "gradient_clipping": gradient_clipping,
    "warmup_steps": warmup_steps,

    # Eval Settings
    "eval_every_n_epochs": eval_every_n_epochs,
    "eval_before_first_step": eval_before_first_step,
    "eval_micro_batch_size_per_gpu": eval_micro_batch_size_per_gpu,
    "eval_gradient_accumulation_steps": eval_gradient_accumulation_steps,

    # Misc Settings
    "save_every_n_epochs": save_every_n_epochs,
    "checkpoint_every_n_minutes": checkpoint_every_n_minutes,
    "activation_checkpointing": activation_checkpointing,
    "partition_method": partition_method,
    "save_dtype": save_dtype,
    "caching_batch_size": caching_batch_size,
    "steps_per_print": steps_per_print,
    "video_clip_mode": video_clip_mode,

    "model": {
        "type": model_type,
        "transformer_path": transformer_path,
        "vae_path": vae_path,
        "llm_path": llm_path,
        "clip_path": clip_path,
        "dtype": dtype,
        "transformer_dtype": transformer_dtype,
        "timestep_sample_method": timestep_sample_method,
    },

    "adapter": {
        # Use the `adapter_type` setting defined above instead of repeating the
        # literal, so the config always matches the value set in the settings.
        "type": adapter_type,
        "rank": rank,
        "dtype": adapter_dtype,
    },

    "optimizer": {
        "type": optimizer_type,
        "lr": lr,
        "betas": betas,
        "weight_decay": weight_decay,
        "eps": eps,
    },
}

# The training config is stored next to dataset.toml in the dataset folder.
train_config_file_path = os.path.join(DATASET_PATH, "config.toml")
with open(train_config_file_path, "w") as toml_file:
    toml.dump(train_config, toml_file)
print(f"config.toml is saved to {train_config_file_path}")


## Train
os.environ["NCCL_P2P_DISABLE"] = "1"
os.environ["NCCL_IB_DISABLE"] = "1"

if RESUME_FROM_CHECKPOINT:
  !deepspeed --num_gpus=1 train.py --deepspeed --config {train_config_file_path} --resume_from_checkpoint
else:
  !deepspeed --num_gpus=1 train.py --deepspeed --config {train_config_file_path}
