From 5dba7d4899ae7e57fe451338521fce041030ea90 Mon Sep 17 00:00:00 2001
From: Mengni Wang
Date: Tue, 18 Nov 2025 03:47:58 -0500
Subject: [PATCH 1/6] draft

Signed-off-by: Mengni Wang
---
 .../diffusers/framepack/README.md             |  54 +++
 .../diffusers/framepack/main.py               | 344 ++++++++++++++++++
 .../diffusers/framepack/requirements.txt      |   6 +
 .../diffusers/framepack/run_benchmark.sh      | 110 ++++++
 4 files changed, 514 insertions(+)
 create mode 100644 examples/pytorch/diffusion_model/diffusers/framepack/README.md
 create mode 100644 examples/pytorch/diffusion_model/diffusers/framepack/main.py
 create mode 100644 examples/pytorch/diffusion_model/diffusers/framepack/requirements.txt
 create mode 100644 examples/pytorch/diffusion_model/diffusers/framepack/run_benchmark.sh

diff --git a/examples/pytorch/diffusion_model/diffusers/framepack/README.md b/examples/pytorch/diffusion_model/diffusers/framepack/README.md
new file mode 100644
index 00000000000..496f8f3d4f7
--- /dev/null
+++ b/examples/pytorch/diffusion_model/diffusers/framepack/README.md
@@ -0,0 +1,54 @@
+# Step-by-Step
+
+This example quantizes and validates the accuracy of FramePack.
+
+# Prerequisite
+
+## 1. Environment
+
+```shell
+pip install -r requirements.txt
+pip install neural-compressor-pt==3.6
+pip install auto-round==0.8.0
+git clone https://github.com/Vchitect/VBench.git
+cd VBench
+pip install -r requirements.txt
+pip install vbench
+cd ..
+git clone https://github.com/lllyasviel/FramePack.git
+cd FramePack
+pip install -r requirements.txt
+cd ..
+```
+
+## 2. Prepare Dataset
+
+```shell
+cd VBench
+sh vbench2_beta_i2v/download_data.sh
+```
+
+# Run
+
+## BF16
+
+```bash
+CUDA_VISIBLE_DEVICES=0,1,2,3 \
+bash run_benchmark.sh \
+    --dataset_location=/path/to/VBench \
+    --output_video_path=bf16_video \
+    --dimension_list=subject_consistency i2v_background \
+```
+
+## MXFP8 or FP8
+
+```bash
+CUDA_VISIBLE_DEVICES=0,1,2,3 \
+bash run_benchmark.sh \
+    --scheme=MXFP8 \ # or FP8
+    --dataset_location=/path/to/VBench \
+    --output_video_path=mxfp8_video \
+    --dimension_list=subject_consistency i2v_background \
+```
+
+- CUDA_VISIBLE_DEVICES: split the evaluation file into the number of GPUs' subset to speed up the evaluation
diff --git a/examples/pytorch/diffusion_model/diffusers/framepack/main.py b/examples/pytorch/diffusion_model/diffusers/framepack/main.py
new file mode 100644
index 00000000000..3eb70e91abf
--- /dev/null
+++ b/examples/pytorch/diffusion_model/diffusers/framepack/main.py
@@ -0,0 +1,344 @@
+# Copyright (c) 2025 Intel Corporation
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+ +import json +import os +import sys +import argparse + +import torch + +from neural_compressor.torch.quantization import ( + AutoRoundConfig, + convert, + prepare, +) +from PIL import Image +from diffusers import AutoencoderKLHunyuanVideo +from transformers import LlamaModel, CLIPTextModel, LlamaTokenizerFast, CLIPTokenizer +from diffusers_helper.hunyuan import encode_prompt_conds, vae_decode, vae_encode, vae_decode_fake +from diffusers_helper.utils import save_bcthw_as_mp4, crop_or_pad_yield_mask, soft_append_bcthw, resize_and_center_crop, state_dict_weighted_merge, state_dict_offset_merge, generate_timestamp +from diffusers_helper.models.hunyuan_video_packed import HunyuanVideoTransformer3DModelPacked +from diffusers_helper.pipelines.k_diffusion_hunyuan import sample_hunyuan +from diffusers_helper.memory import cpu, gpu, get_cuda_free_memory_gb, move_model_to_device_with_memory_preservation, offload_model_from_device_for_memory_preservation, fake_diffusers_current_device, DynamicSwapInstaller, unload_complete_models, load_model_as_complete +from transformers import SiglipImageProcessor, SiglipVisionModel +from diffusers_helper.clip_vision import hf_clip_vision_encode +from diffusers_helper.bucket_tools import find_nearest_bucket +import torch +from diffusers_helper.models.hunyuan_video_packed import HunyuanVideoTransformer3DModelPacked +from auto_round import AutoRound +import json +import torchvision +import torch +import einops +import numpy as np +import argparse + + +parser = argparse.ArgumentParser( + description="FramePack quantization.", formatter_class=argparse.ArgumentDefaultsHelpFormatter +) +parser.add_argument("--scheme", default="MXFP8", type=str, help="quantizaion scheme.") +parser.add_argument("--quantize", action="store_true") +parser.add_argument("--inference", action="store_true") +parser.add_argument("--output_dir", "--quantized_model_path", default="./tmp_autoround", type=str, help="the directory to save quantized model") +parser.add_argument("--dataset_location", type=str, help="Path of cloned VBench repository which contains images and prompts for evaluation") +parser.add_argument("--output_video_path", default="./tmp_video", type=str, help="the directory to save generated videos") +parser.add_argument("--limit", default=-1, type=int, help="limit the number of prompts for evaluation") +parser.add_argument("--seed", default=31337, type=int, help="random seed") +parser.add_argument("--total_second_length", default=5, type=int, help="length of generated video") +parser.add_argument("--steps", default=25, type=float) +parser.add_argument("--cfg", default=1.0, type=float) +parser.add_argument("--gs", default=10.0, type=float) +parser.add_argument("--rs", default=0.0, type=float) +parser.add_argument("--gpu_memory_preservation", default=6, type=int) +parser.add_argument("--use_teacache", action="store_true") +parser.add_argument("--mp4_crf", default=16, type=int) +parser.add_argument( + "--dimension_list", + nargs="+", + choices=["subject_consistency", "background_consistency", "motion_smoothness", "dynamic_degree", "aesthetic_quality", "imaging_quality", "i2v_subject", "i2v_background", "camera_motion"], + help="list of evaluation dimensions, usage: --dimension_list ", +) +parser.add_argument("--limit", default=-1, type=int) +parser.add_argument("--ratio", default="16-9", type=str) + +args = parser.parse_args() +free_mem_gb = get_cuda_free_memory_gb(gpu) +high_vram = free_mem_gb > 60 + +@torch.no_grad() +def worker(input_image, prompt, seed, total_second_length, 
latent_window_size, steps, cfg, gs, rs, gpu_memory_preservation, use_teacache, mp4_crf): + input_image = Image.open(input_image).convert("RGB") + input_image = np.array(input_image) + total_latent_sections = (total_second_length * 30) / (latent_window_size * 4) + total_latent_sections = int(max(round(total_latent_sections), 1)) + + # Clean GPU + if not high_vram: + unload_complete_models( + text_encoder, text_encoder_2, image_encoder, vae, transformer + ) + + # Text encoding + + if not high_vram: + fake_diffusers_current_device(text_encoder, gpu) # since we only encode one text - that is one model move and one encode, offload is same time consumption since it is also one load and one encode. + load_model_as_complete(text_encoder_2, target_device=gpu) + + llama_vec, clip_l_pooler = encode_prompt_conds(prompt, text_encoder, text_encoder_2, tokenizer, tokenizer_2) + + llama_vec_n, clip_l_pooler_n = torch.zeros_like(llama_vec), torch.zeros_like(clip_l_pooler) + + llama_vec, llama_attention_mask = crop_or_pad_yield_mask(llama_vec, length=512) + llama_vec_n, llama_attention_mask_n = crop_or_pad_yield_mask(llama_vec_n, length=512) + # Processing input image + + H, W, C = input_image.shape + height, width = find_nearest_bucket(H, W, resolution=640) + input_image_np = resize_and_center_crop(input_image, target_width=width, target_height=height) + + input_image_pt = torch.from_numpy(input_image_np).float() / 127.5 - 1 + input_image_pt = input_image_pt.permute(2, 0, 1)[None, :, None] + + # VAE encoding + + if not high_vram: + load_model_as_complete(vae, target_device=gpu) + + start_latent = vae_encode(input_image_pt, vae) + + # CLIP Vision + + if not high_vram: + load_model_as_complete(image_encoder, target_device=gpu) + + image_encoder_output = hf_clip_vision_encode(input_image_np, feature_extractor, image_encoder) + image_encoder_last_hidden_state = image_encoder_output.last_hidden_state + + # Dtype + + llama_vec = llama_vec.to(transformer.dtype) + llama_vec_n = llama_vec_n.to(transformer.dtype) + clip_l_pooler = clip_l_pooler.to(transformer.dtype) + clip_l_pooler_n = clip_l_pooler_n.to(transformer.dtype) + image_encoder_last_hidden_state = image_encoder_last_hidden_state.to(transformer.dtype) + + # Sampling + + rnd = torch.Generator("cpu").manual_seed(seed) + num_frames = latent_window_size * 4 - 3 + + history_latents = torch.zeros(size=(1, 16, 1 + 2 + 16, height // 8, width // 8), dtype=torch.float32).cpu() + history_pixels = None + total_generated_latent_frames = 0 + + latent_paddings = reversed(range(total_latent_sections)) + + if total_latent_sections > 4: + # In theory the latent_paddings should follow the above sequence, but it seems that duplicating some + # items looks better than expanding it when total_latent_sections > 4 + # One can try to remove below trick and just + # use `latent_paddings = list(reversed(range(total_latent_sections)))` to compare + latent_paddings = [3] + [2] * (total_latent_sections - 3) + [1, 0] + + for latent_padding in latent_paddings: + is_last_section = latent_padding == 0 + latent_padding_size = latent_padding * latent_window_size + + print(f"latent_padding_size = {latent_padding_size}, is_last_section = {is_last_section}") + + indices = torch.arange(0, sum([1, latent_padding_size, latent_window_size, 1, 2, 16])).unsqueeze(0) + clean_latent_indices_pre, blank_indices, latent_indices, clean_latent_indices_post, clean_latent_2x_indices, clean_latent_4x_indices = indices.split([1, latent_padding_size, latent_window_size, 1, 2, 16], dim=1) + clean_latent_indices = 
torch.cat([clean_latent_indices_pre, clean_latent_indices_post], dim=1) + + clean_latents_pre = start_latent.to(history_latents) + clean_latents_post, clean_latents_2x, clean_latents_4x = history_latents[:, :, :1 + 2 + 16, :, :].split([1, 2, 16], dim=2) + clean_latents = torch.cat([clean_latents_pre, clean_latents_post], dim=2) + + if not high_vram: + unload_complete_models() + move_model_to_device_with_memory_preservation(transformer, target_device=gpu, preserved_memory_gb=gpu_memory_preservation) + + + if use_teacache: + transformer.initialize_teacache(enable_teacache=True, num_steps=steps) + else: + transformer.initialize_teacache(enable_teacache=False) + + def callback(d): + preview = d["denoised"] + preview = vae_decode_fake(preview) + + preview = (preview * 255.0).detach().cpu().numpy().clip(0, 255).astype(np.uint8) + preview = einops.rearrange(preview, "b c t h w -> (b h) (t w) c") + + current_step = d["i"] + 1 + hint = f"Sampling {current_step}/{steps}" + desc = f"Total generated frames: {int(max(0, total_generated_latent_frames * 4 - 3))}, Video length: {max(0, (total_generated_latent_frames * 4 - 3) / 30) :.2f} seconds (FPS-30). The video is being extended now ..." + print(hint, desc) + return + + generated_latents = sample_hunyuan( + transformer=transformer, + sampler="unipc", + width=width, + height=height, + frames=num_frames, + real_guidance_scale=cfg, + distilled_guidance_scale=gs, + guidance_rescale=rs, + # shift=3.0, + num_inference_steps=steps, + generator=rnd, + prompt_embeds=llama_vec, + prompt_embeds_mask=llama_attention_mask, + prompt_poolers=clip_l_pooler, + negative_prompt_embeds=llama_vec_n, + negative_prompt_embeds_mask=llama_attention_mask_n, + negative_prompt_poolers=clip_l_pooler_n, + device=gpu, + dtype=torch.bfloat16, + image_embeddings=image_encoder_last_hidden_state, + latent_indices=latent_indices, + clean_latents=clean_latents, + clean_latent_indices=clean_latent_indices, + clean_latents_2x=clean_latents_2x, + clean_latent_2x_indices=clean_latent_2x_indices, + clean_latents_4x=clean_latents_4x, + clean_latent_4x_indices=clean_latent_4x_indices, + callback=callback, + ) + if is_last_section: + generated_latents = torch.cat([start_latent.to(generated_latents), generated_latents], dim=2) + + total_generated_latent_frames += int(generated_latents.shape[2]) + history_latents = torch.cat([generated_latents.to(history_latents), history_latents], dim=2) + + if not high_vram: + offload_model_from_device_for_memory_preservation(transformer, target_device=gpu, preserved_memory_gb=8) + load_model_as_complete(vae, target_device=gpu) + + real_history_latents = history_latents[:, :, :total_generated_latent_frames, :, :] + + if history_pixels is None: + history_pixels = vae_decode(real_history_latents, vae).cpu() + else: + section_latent_frames = (latent_window_size * 2 + 1) if is_last_section else (latent_window_size * 2) + overlapped_frames = latent_window_size * 4 - 3 + + current_pixels = vae_decode(real_history_latents[:, :, :section_latent_frames], vae).cpu() + history_pixels = soft_append_bcthw(current_pixels, history_pixels, overlapped_frames) + + if not high_vram: + unload_complete_models() + + print(f"Decoded. 
Current latent shape {real_history_latents.shape}; pixel shape {history_pixels.shape}") + + if is_last_section: + break + return history_pixels + +if __name__ == "__main__": + transformer = HunyuanVideoTransformer3DModelPacked.from_pretrained("lllyasviel/FramePackI2V_HY", torch_dtype=torch.bfloat16).cpu() + if args.quantize: + print(f"Start to quantize {args.model}.") + setattr(transformer, "name_or_path", "lllyasviel/FramePackI2V_HY") + + qconfig = AutoRoundConfig( + scheme=args.scheme, + iters=0, + export_format="fake", + output_dir=args.output_dir, + ) + transformer = prepare(transformer, qconfig) + transformer = convert(transformer, qconfig) + + if args.inference: + text_encoder = LlamaModel.from_pretrained("hunyuanvideo-community/HunyuanVideo", subfolder="text_encoder", torch_dtype=torch.float16).cpu() + text_encoder_2 = CLIPTextModel.from_pretrained("hunyuanvideo-community/HunyuanVideo", subfolder="text_encoder_2", torch_dtype=torch.float16).cpu() + tokenizer = LlamaTokenizerFast.from_pretrained("hunyuanvideo-community/HunyuanVideo", subfolder="tokenizer") + tokenizer_2 = CLIPTokenizer.from_pretrained("hunyuanvideo-community/HunyuanVideo", subfolder="tokenizer_2") + vae = AutoencoderKLHunyuanVideo.from_pretrained("hunyuanvideo-community/HunyuanVideo", subfolder="vae", torch_dtype=torch.float16).cpu() + + feature_extractor = SiglipImageProcessor.from_pretrained("lllyasviel/flux_redux_bfl", subfolder="feature_extractor") + image_encoder = SiglipVisionModel.from_pretrained("lllyasviel/flux_redux_bfl", subfolder="image_encoder", torch_dtype=torch.float16).cpu() + + vae.eval() + text_encoder.eval() + text_encoder_2.eval() + image_encoder.eval() + transformer.eval() + + if not high_vram: + vae.enable_slicing() + vae.enable_tiling() + + transformer.high_quality_fp32_output_for_inference = True + print("transformer.high_quality_fp32_output_for_inference = True") + + transformer.to(dtype=torch.bfloat16) + vae.to(dtype=torch.float16) + image_encoder.to(dtype=torch.float16) + text_encoder.to(dtype=torch.float16) + text_encoder_2.to(dtype=torch.float16) + + vae.requires_grad_(False) + text_encoder.requires_grad_(False) + text_encoder_2.requires_grad_(False) + image_encoder.requires_grad_(False) + transformer.requires_grad_(False) + + if not high_vram: + # DynamicSwapInstaller is same as huggingface"s enable_sequential_offload but 3x faster + DynamicSwapInstaller.install_model(transformer, device=gpu) + DynamicSwapInstaller.install_model(text_encoder, device=gpu) + else: + text_encoder.to(gpu) + text_encoder_2.to(gpu) + image_encoder.to(gpu) + vae.to(gpu) + transformer.to(gpu) + + idx = 0 + for dimension in args.dimension_list: + # prepare inputs + + image_folder = f"{args.dataset_location}/vbench2_beta_i2v/data/crop/{args.ratio}" + info_list = json.load(open(f"{args.dataset_location}/vbench2_beta_i2v/vbench2_i2v_full_info.json", "r")) + inputs = [(os.path.join(image_folder, info["image_name"]), info["prompt_en"]) for info in info_list if dimension in info["dimension"]] + for image_path, prompt in inputs: + if args.limit > 0 and idx >= args.limit: + break + + # only sample 1 video for each prompt to evalute quickly + cur_save_path = f"{args.output_video_path}/{prompt}-0.mp4" + + if os.path.exists(cur_save_path): + continue + idx += 1 + # perform sampling + x = worker(image_path, prompt, args.seed, args.total_second_length, args.latent_window_size, args.steps, args.cfg, args.gs, args.rs, args.gpu_memory_preservation, args.use_teacache, args.mp4_crf) + b, c, t, h, w = x.shape + + per_row = b + for 
p in [6, 5, 4, 3, 2]: + if b % p == 0: + per_row = p + break + + x = torch.clamp(x.float(), -1., 1.) * 127.5 + 127.5 + x = x.detach().cpu().to(torch.uint8) + video = einops.rearrange(x, "(m n) c t h w -> t (m h) (n w) c", n=per_row) + torchvision.io.write_video(cur_save_path, video, fps=30, video_codec="h264", options={"crf": "10"}) diff --git a/examples/pytorch/diffusion_model/diffusers/framepack/requirements.txt b/examples/pytorch/diffusion_model/diffusers/framepack/requirements.txt new file mode 100644 index 00000000000..1d6637869b3 --- /dev/null +++ b/examples/pytorch/diffusion_model/diffusers/framepack/requirements.txt @@ -0,0 +1,6 @@ +diffusers==0.35.1 +pandas==2.2.2 +clip==0.2.0 +image-reward==1.5 +torchmetrics==1.8.2 +transformers==4.55.0 diff --git a/examples/pytorch/diffusion_model/diffusers/framepack/run_benchmark.sh b/examples/pytorch/diffusion_model/diffusers/framepack/run_benchmark.sh new file mode 100644 index 00000000000..4ef41900c1e --- /dev/null +++ b/examples/pytorch/diffusion_model/diffusers/framepack/run_benchmark.sh @@ -0,0 +1,110 @@ +#!/bin/bash +set -x + +function main { + + init_params "$@" + run_benchmark + +} + +# init params +function init_params { + for var in "$@" + do + case $var in + --topology=*) + topology=$(echo $var |cut -f2 -d=) + ;; + --dataset_location=*) + dataset_location=$(echo $var |cut -f2 -d=) + ;; + --ratio=*) + ratio=$(echo $var |cut -f2 -d=) + ;; + --input_model=*) + input_model=$(echo $var |cut -f2 -d=) + ;; + --limit=*) + limit=$(echo $var |cut -f2 -d=) + ;; + --output_video_path=*) + output_video_path=$(echo $var |cut -f2 -d=) + ;; + --dimension_list=*) + dimension_list=$(echo $var |cut -f2 -d=) + ;; + *) + echo "Error: No such parameter: ${var}" + exit 1 + ;; + esac + done + +} + + +# run_benchmark +function run_benchmark { + limit=${limit:=-1} + ratio=${ratio:="16-9"} + output_video_path=${output_video_path:="./tmp_videos"} + + if [ "${topology}" = "FP8" ]; then + extra_cmd="--scheme FP8 --quantize --inference" + elif [ "${topology}" = "MXFP8" ]; then + extra_cmd="--scheme MXFP8 --quantize --inference" + elif [ "${topology}" = "BF16" ]; then + extra_cmd="--inference" + fi + + if [ -n "$CUDA_VISIBLE_DEVICES" ]; then + gpu_list="${CUDA_VISIBLE_DEVICES:-}" + IFS=',' read -ra gpu_ids <<< "$gpu_list" + visible_gpus=${#gpu_ids[@]} + echo "visible_gpus: ${visible_gpus}" + + IFS=' ' read -ra dimensions <<< "$dimension_list" + dimension_num=${#dimensions[@]} + if [ "${visible_gpus}" > "${dimension_num}" ]; then + count=${dimension_num} + else + count=${visible_gpus} + sliced=("${dimensions[@]:count-1:dimension_num-visible_gpus}") + dimensions="${sliced[*]}" + fi + + for ((i=0; i Date: Fri, 21 Nov 2025 06:46:51 +0000 Subject: [PATCH 2/6] fix bug and enhance script Signed-off-by: Mengni Wang --- .../diffusers/framepack/README.md | 29 ++++++----- .../diffusers/framepack/main.py | 37 +++++++------ .../diffusers/framepack/requirements.txt | 41 ++++++++++++--- .../diffusers/framepack/run_benchmark.sh | 52 +++++++++++++------ .../torch/algorithms/weight_only/autoround.py | 1 - 5 files changed, 106 insertions(+), 54 deletions(-) diff --git a/examples/pytorch/diffusion_model/diffusers/framepack/README.md b/examples/pytorch/diffusion_model/diffusers/framepack/README.md index 496f8f3d4f7..042b8e109bc 100644 --- a/examples/pytorch/diffusion_model/diffusers/framepack/README.md +++ b/examples/pytorch/diffusion_model/diffusers/framepack/README.md @@ -7,23 +7,23 @@ This example quantizes and validates the accuracy of Flux. ## 1. 
Environment ```shell +# install zip according to your system +sudo apt update && sudo apt install zip + pip install -r requirements.txt -pip install neural-compressor-pt==3.6 -pip install auto-round==0.8.0 -git clone https://github.com/Vchitect/VBench.git -cd VBench -pip install -r requirements.txt -pip install vbench -cd .. -git clone https://github.com/lllyasviel/FramePack.git -cd FramePack -pip install -r requirements.txt -cd .. +pip install --update neural-compressor-pt +pip install --update auto-round +git clone --depth 1 https://github.com/lllyasviel/FramePack.git +cp -r FramePack/diffusers_helper/ . + +# several models will be downloaded automatically into HF_HOME +export HF_HOME=/path/to/save/model ``` ## 2. Prepare Dataset ```shell +git clone --depth 1 https://github.com/Vchitect/VBench.git cd VBench sh vbench2_beta_i2v/download_data.sh ``` @@ -35,9 +35,11 @@ sh vbench2_beta_i2v/download_data.sh ```bash CUDA_VISIBLE_DEVICES=0,1,2,3 \ bash run_benchmark.sh \ + --topology=BF16 \ --dataset_location=/path/to/VBench \ --output_video_path=bf16_video \ --dimension_list=subject_consistency i2v_background \ + --result_path=bf16_result ``` ## MXFP8 or FP8 @@ -45,10 +47,11 @@ bash run_benchmark.sh \ ```bash CUDA_VISIBLE_DEVICES=0,1,2,3 \ bash run_benchmark.sh \ - --scheme=MXFP8 \ # or FP8 + --topology=MXFP8 \ # or FP8 --dataset_location=/path/to/VBench \ --output_video_path=mxfp8_video \ --dimension_list=subject_consistency i2v_background \ + --result_path=mxfp8_result ``` -- CUDA_VISIBLE_DEVICES: split the evaluation file into the number of GPUs' subset to speed up the evaluation +- CUDA_VISIBLE_DEVICES: distribute the dimension_list to different visible GPUs to speed up the evaluation diff --git a/examples/pytorch/diffusion_model/diffusers/framepack/main.py b/examples/pytorch/diffusion_model/diffusers/framepack/main.py index 3eb70e91abf..686d11b5da6 100644 --- a/examples/pytorch/diffusion_model/diffusers/framepack/main.py +++ b/examples/pytorch/diffusion_model/diffusers/framepack/main.py @@ -53,26 +53,26 @@ parser.add_argument("--quantize", action="store_true") parser.add_argument("--inference", action="store_true") parser.add_argument("--output_dir", "--quantized_model_path", default="./tmp_autoround", type=str, help="the directory to save quantized model") -parser.add_argument("--dataset_location", type=str, help="Path of cloned VBench repository which contains images and prompts for evaluation") +parser.add_argument("--dataset_location", type=str, help="path of cloned VBench repository which contains images and prompts for evaluation") parser.add_argument("--output_video_path", default="./tmp_video", type=str, help="the directory to save generated videos") parser.add_argument("--limit", default=-1, type=int, help="limit the number of prompts for evaluation") parser.add_argument("--seed", default=31337, type=int, help="random seed") parser.add_argument("--total_second_length", default=5, type=int, help="length of generated video") -parser.add_argument("--steps", default=25, type=float) -parser.add_argument("--cfg", default=1.0, type=float) -parser.add_argument("--gs", default=10.0, type=float) -parser.add_argument("--rs", default=0.0, type=float) +parser.add_argument("--latent_window_size", default=9, type=int) +parser.add_argument("--steps", default=25, type=float, help="number of inference step") +parser.add_argument("--cfg", default=1.0, type=float, help="real guidance scale") +parser.add_argument("--gs", default=10.0, type=float, help="distilled guidance scale") 
+parser.add_argument("--rs", default=0.0, type=float, help="guidance rescale") parser.add_argument("--gpu_memory_preservation", default=6, type=int) -parser.add_argument("--use_teacache", action="store_true") -parser.add_argument("--mp4_crf", default=16, type=int) +parser.add_argument("--use_teacache", action="store_true", help="faster speed, but often makes hands and fingers slightly worse") +parser.add_argument("--mp4_crf", default=16, type=int, help="MP4 compression. Lower means better quality. 0 is uncompressed. Change to 16 if you get black outputs.") parser.add_argument( "--dimension_list", nargs="+", choices=["subject_consistency", "background_consistency", "motion_smoothness", "dynamic_degree", "aesthetic_quality", "imaging_quality", "i2v_subject", "i2v_background", "camera_motion"], help="list of evaluation dimensions, usage: --dimension_list ", ) -parser.add_argument("--limit", default=-1, type=int) -parser.add_argument("--ratio", default="16-9", type=str) +parser.add_argument("--ratio", default="16-9", type=str, help="aspect ratio of image") args = parser.parse_args() free_mem_gb = get_cuda_free_memory_gb(gpu) @@ -252,8 +252,11 @@ def callback(d): if __name__ == "__main__": transformer = HunyuanVideoTransformer3DModelPacked.from_pretrained("lllyasviel/FramePackI2V_HY", torch_dtype=torch.bfloat16).cpu() + transformer.to(dtype=torch.bfloat16) + transformer.requires_grad_(False) + transformer.eval() + if args.quantize: - print(f"Start to quantize {args.model}.") setattr(transformer, "name_or_path", "lllyasviel/FramePackI2V_HY") qconfig = AutoRoundConfig( @@ -279,7 +282,6 @@ def callback(d): text_encoder.eval() text_encoder_2.eval() image_encoder.eval() - transformer.eval() if not high_vram: vae.enable_slicing() @@ -288,7 +290,6 @@ def callback(d): transformer.high_quality_fp32_output_for_inference = True print("transformer.high_quality_fp32_output_for_inference = True") - transformer.to(dtype=torch.bfloat16) vae.to(dtype=torch.float16) image_encoder.to(dtype=torch.float16) text_encoder.to(dtype=torch.float16) @@ -298,12 +299,11 @@ def callback(d): text_encoder.requires_grad_(False) text_encoder_2.requires_grad_(False) image_encoder.requires_grad_(False) - transformer.requires_grad_(False) if not high_vram: # DynamicSwapInstaller is same as huggingface"s enable_sequential_offload but 3x faster - DynamicSwapInstaller.install_model(transformer, device=gpu) DynamicSwapInstaller.install_model(text_encoder, device=gpu) + DynamicSwapInstaller.install_model(transformer, device=gpu) else: text_encoder.to(gpu) text_encoder_2.to(gpu) @@ -311,14 +311,18 @@ def callback(d): vae.to(gpu) transformer.to(gpu) + if not os.path.exists(args.output_video_path): + os.makedirs(args.output_video_path) + idx = 0 for dimension in args.dimension_list: # prepare inputs - image_folder = f"{args.dataset_location}/vbench2_beta_i2v/data/crop/{args.ratio}" - info_list = json.load(open(f"{args.dataset_location}/vbench2_beta_i2v/vbench2_i2v_full_info.json", "r")) + image_folder = os.path.join(args.dataset_location, f"vbench2_beta_i2v/data/crop/{args.ratio}") + info_list = json.load(open(os.path.join(args.dataset_location, "vbench2_beta_i2v/vbench2_i2v_full_info.json"), "r")) inputs = [(os.path.join(image_folder, info["image_name"]), info["prompt_en"]) for info in info_list if dimension in info["dimension"]] for image_path, prompt in inputs: + idx += 1 if args.limit > 0 and idx >= args.limit: break @@ -327,7 +331,6 @@ def callback(d): if os.path.exists(cur_save_path): continue - idx += 1 # perform sampling x = 
worker(image_path, prompt, args.seed, args.total_second_length, args.latent_window_size, args.steps, args.cfg, args.gs, args.rs, args.gpu_memory_preservation, args.use_teacache, args.mp4_crf) b, c, t, h, w = x.shape diff --git a/examples/pytorch/diffusion_model/diffusers/framepack/requirements.txt b/examples/pytorch/diffusion_model/diffusers/framepack/requirements.txt index 1d6637869b3..06e80a0af7f 100644 --- a/examples/pytorch/diffusion_model/diffusers/framepack/requirements.txt +++ b/examples/pytorch/diffusion_model/diffusers/framepack/requirements.txt @@ -1,6 +1,35 @@ -diffusers==0.35.1 -pandas==2.2.2 -clip==0.2.0 -image-reward==1.5 -torchmetrics==1.8.2 -transformers==4.55.0 +Pillow +matplotlib +timm>=0.9,<=1.0.12 +wheel +cython +tensorboard +scipy +scikit-learn +scikit-image +openai-clip +decord +requests +pyyaml +easydict +pyiqa +lvis +fairscale>=0.4.4 +fvcore +easydict +urllib3 +boto3 +omegaconf +transformers +pycocoevalcap +detectron2@git+https://github.com/facebookresearch/detectron2.git +accelerate +diffusers +sentencepiece==0.2.0 +av==12.1.0 +torchsde==0.2.6 +einops +safetensors +opencv-python-headless +dreamsim +numpy<2.0.0 diff --git a/examples/pytorch/diffusion_model/diffusers/framepack/run_benchmark.sh b/examples/pytorch/diffusion_model/diffusers/framepack/run_benchmark.sh index 4ef41900c1e..91f0babe92e 100644 --- a/examples/pytorch/diffusion_model/diffusers/framepack/run_benchmark.sh +++ b/examples/pytorch/diffusion_model/diffusers/framepack/run_benchmark.sh @@ -31,6 +31,9 @@ function init_params { --output_video_path=*) output_video_path=$(echo $var |cut -f2 -d=) ;; + --result_path=*) + result_path=$(echo $var |cut -f2 -d=) + ;; --dimension_list=*) dimension_list=$(echo $var |cut -f2 -d=) ;; @@ -49,6 +52,15 @@ function run_benchmark { limit=${limit:=-1} ratio=${ratio:="16-9"} output_video_path=${output_video_path:="./tmp_videos"} + result_path=${result_path:="./eval_result"} + + if [[ ! "${result_path}" = /* ]]; then + result_path=$(realpath -s "$(pwd)/$result_path") + fi + + if [[ ! 
"${output_video_path}" = /* ]]; then + output_video_path=$(realpath -s "$(pwd)/$output_video_path") + fi if [ "${topology}" = "FP8" ]; then extra_cmd="--scheme FP8 --quantize --inference" @@ -60,51 +72,57 @@ function run_benchmark { if [ -n "$CUDA_VISIBLE_DEVICES" ]; then gpu_list="${CUDA_VISIBLE_DEVICES:-}" - IFS=',' read -ra gpu_ids <<< "$gpu_list" - visible_gpus=${#gpu_ids[@]} - echo "visible_gpus: ${visible_gpus}" + IFS=',' read -ra gpu_ids <<< "$gpu_list" + visible_gpus=${#gpu_ids[@]} + echo "visible_gpus: ${visible_gpus}" IFS=' ' read -ra dimensions <<< "$dimension_list" dimension_num=${#dimensions[@]} - if [ "${visible_gpus}" > "${dimension_num}" ]; then + if [ "${visible_gpus}" -gt "${dimension_num}" ]; then count=${dimension_num} else count=${visible_gpus} - sliced=("${dimensions[@]:count-1:dimension_num-visible_gpus}") - dimensions="${sliced[*]}" + left=${dimensions[@]:count-1:dimension_num} + dimensions=("${dimensions[@]:0:count-1}" "$left") fi for ((i=0; i&1) + result_file=$(echo "$output" | grep -i "Evaluation results saved to " | awk '{print $NF}') + + echo "Evaluation results saved to ${result_file}" + zip -r "${result_path}.zip" ${result_path} + python scripts/cal_i2v_final_score.py --zip_file "${result_path}.zip" --model_name "framepack" + } main "$@" + diff --git a/neural_compressor/torch/algorithms/weight_only/autoround.py b/neural_compressor/torch/algorithms/weight_only/autoround.py index 5fa3b253cfa..2342f9f5b84 100644 --- a/neural_compressor/torch/algorithms/weight_only/autoround.py +++ b/neural_compressor/torch/algorithms/weight_only/autoround.py @@ -268,7 +268,6 @@ def convert(self, model: torch.nn.Module, *args, **kwargs): if tokenizer is not None: delattr(model.orig_model, "tokenizer") elif pipe is None: - tokenizer = "Placeholder" self.dataset = CapturedDataloader(model.args_list, model.kwargs_list) model = model.orig_model if pipe is not None: From a0a80cfcf4e000d62bc3632161f3c5f8eaaf20de Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Fri, 21 Nov 2025 06:49:39 +0000 Subject: [PATCH 3/6] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- examples/pytorch/diffusion_model/diffusers/framepack/main.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples/pytorch/diffusion_model/diffusers/framepack/main.py b/examples/pytorch/diffusion_model/diffusers/framepack/main.py index 686d11b5da6..51584f2133e 100644 --- a/examples/pytorch/diffusion_model/diffusers/framepack/main.py +++ b/examples/pytorch/diffusion_model/diffusers/framepack/main.py @@ -326,7 +326,7 @@ def callback(d): if args.limit > 0 and idx >= args.limit: break - # only sample 1 video for each prompt to evalute quickly + # only sample 1 video for each prompt to evaluate quickly cur_save_path = f"{args.output_video_path}/{prompt}-0.mp4" if os.path.exists(cur_save_path): From b37f428ef0a5231067997c4b81d3e46f2450e599 Mon Sep 17 00:00:00 2001 From: Mengni Wang Date: Fri, 21 Nov 2025 08:31:53 +0000 Subject: [PATCH 4/6] update script Signed-off-by: Mengni Wang --- .../diffusers/framepack/run_benchmark.sh | 34 ++++++++++--------- 1 file changed, 18 insertions(+), 16 deletions(-) diff --git a/examples/pytorch/diffusion_model/diffusers/framepack/run_benchmark.sh b/examples/pytorch/diffusion_model/diffusers/framepack/run_benchmark.sh index 91f0babe92e..d22d03be70c 100644 --- a/examples/pytorch/diffusion_model/diffusers/framepack/run_benchmark.sh +++ 
b/examples/pytorch/diffusion_model/diffusers/framepack/run_benchmark.sh @@ -80,20 +80,22 @@ function run_benchmark { dimension_num=${#dimensions[@]} if [ "${visible_gpus}" -gt "${dimension_num}" ]; then count=${dimension_num} + step=1 else count=${visible_gpus} - left=${dimensions[@]:count-1:dimension_num} - dimensions=("${dimensions[@]:0:count-1}" "$left") + step=$((dimension_num/visible_gpus)) + left=${dimensions[@]:step*count-1:dimension_num} + dimensions=("${dimensions[@]:0:step*count-1}" "$left") fi for ((i=0; i&1) - result_file=$(echo "$output" | grep -i "Evaluation results saved to " | awk '{print $NF}') - - echo "Evaluation results saved to ${result_file}" - zip -r "${result_path}.zip" ${result_path} - python scripts/cal_i2v_final_score.py --zip_file "${result_path}.zip" --model_name "framepack" + #echo "Start calculating final score..." + #cd ${dataset_location} + #output=$(python evaluate_i2v.py \ + # --videos_path ${output_video_path} \ + # --dimension ${dimension_list} \ + # --output_path ${result_path} \ + # --ratio ${ratio} 2>&1) + #result_file=$(echo "$output" | grep -i "Evaluation results saved to " | awk '{print $NF}') + + #echo "Evaluation results saved to ${result_file}" + #zip -r "${result_path}.zip" ${result_path} + #python scripts/cal_i2v_final_score.py --zip_file "${result_path}.zip" --model_name "framepack" } From c657c0b86d69692780b48f736a1c608d5a48f353 Mon Sep 17 00:00:00 2001 From: "Wang, Mengni" Date: Fri, 21 Nov 2025 16:34:05 +0800 Subject: [PATCH 5/6] Update run_benchmark.sh --- .../diffusion_model/diffusers/framepack/run_benchmark.sh | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/examples/pytorch/diffusion_model/diffusers/framepack/run_benchmark.sh b/examples/pytorch/diffusion_model/diffusers/framepack/run_benchmark.sh index d22d03be70c..8b090297ac8 100644 --- a/examples/pytorch/diffusion_model/diffusers/framepack/run_benchmark.sh +++ b/examples/pytorch/diffusion_model/diffusers/framepack/run_benchmark.sh @@ -80,10 +80,10 @@ function run_benchmark { dimension_num=${#dimensions[@]} if [ "${visible_gpus}" -gt "${dimension_num}" ]; then count=${dimension_num} - step=1 + step=1 else count=${visible_gpus} - step=$((dimension_num/visible_gpus)) + step=$((dimension_num/visible_gpus)) left=${dimensions[@]:step*count-1:dimension_num} dimensions=("${dimensions[@]:0:step*count-1}" "$left") fi From a231251ba74d3ddf55ae9b6027f42da4998140ab Mon Sep 17 00:00:00 2001 From: "Wang, Mengni" Date: Fri, 21 Nov 2025 16:35:01 +0800 Subject: [PATCH 6/6] Update run_benchmark.sh --- .../diffusers/framepack/run_benchmark.sh | 24 +++++++++---------- 1 file changed, 12 insertions(+), 12 deletions(-) diff --git a/examples/pytorch/diffusion_model/diffusers/framepack/run_benchmark.sh b/examples/pytorch/diffusion_model/diffusers/framepack/run_benchmark.sh index 8b090297ac8..71dc61b38d9 100644 --- a/examples/pytorch/diffusion_model/diffusers/framepack/run_benchmark.sh +++ b/examples/pytorch/diffusion_model/diffusers/framepack/run_benchmark.sh @@ -111,18 +111,18 @@ function run_benchmark { ${extra_cmd} fi - #echo "Start calculating final score..." 
- #cd ${dataset_location} - #output=$(python evaluate_i2v.py \ - # --videos_path ${output_video_path} \ - # --dimension ${dimension_list} \ - # --output_path ${result_path} \ - # --ratio ${ratio} 2>&1) - #result_file=$(echo "$output" | grep -i "Evaluation results saved to " | awk '{print $NF}') - - #echo "Evaluation results saved to ${result_file}" - #zip -r "${result_path}.zip" ${result_path} - #python scripts/cal_i2v_final_score.py --zip_file "${result_path}.zip" --model_name "framepack" + echo "Start calculating final score..." + cd ${dataset_location} + output=$(python evaluate_i2v.py \ + --videos_path ${output_video_path} \ + --dimension ${dimension_list} \ + --output_path ${result_path} \ + --ratio ${ratio} 2>&1) + result_file=$(echo "$output" | grep -i "Evaluation results saved to " | awk '{print $NF}') + + echo "Evaluation results saved to ${result_file}" + zip -r "${result_path}.zip" ${result_path} + python scripts/cal_i2v_final_score.py --zip_file "${result_path}.zip" --model_name "framepack" }
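
For readers who want the quantization step without the full video-generation pipeline, the sketch below condenses what `main.py` does when `run_benchmark.sh` selects the FP8 or MXFP8 topology (`--scheme ... --quantize --inference`). It is a minimal sketch, not part of the patch: it assumes `neural-compressor-pt` and `auto-round` are installed and that FramePack's `diffusers_helper` package has been copied into the working directory as the README describes, and the argument values simply mirror the defaults used in `main.py`.

```python
# Condensed quantization path from main.py: load the FramePack transformer and
# apply AutoRound fake quantization via neural-compressor's prepare/convert.
import torch
from diffusers_helper.models.hunyuan_video_packed import HunyuanVideoTransformer3DModelPacked
from neural_compressor.torch.quantization import AutoRoundConfig, convert, prepare

transformer = HunyuanVideoTransformer3DModelPacked.from_pretrained(
    "lllyasviel/FramePackI2V_HY", torch_dtype=torch.bfloat16
).cpu()
transformer.requires_grad_(False)
transformer.eval()
setattr(transformer, "name_or_path", "lllyasviel/FramePackI2V_HY")  # set explicitly, as in the patch

qconfig = AutoRoundConfig(
    scheme="MXFP8",        # or "FP8"
    iters=0,               # round-to-nearest, no tuning iterations
    export_format="fake",  # keep fake-quantized weights so the model can be sampled from directly
    output_dir="./tmp_autoround",
)
transformer = prepare(transformer, qconfig)
transformer = convert(transformer, qconfig)
# worker() then uses this transformer for sampling, exactly as the BF16 path does.
```

Everything else in the series (model offloading, VBench prompt iteration, and the evaluation/zip scoring step re-enabled in the final patch) is orthogonal to this call sequence.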