From 5dba7d4899ae7e57fe451338521fce041030ea90 Mon Sep 17 00:00:00 2001
From: Mengni Wang
Date: Tue, 18 Nov 2025 03:47:58 -0500
Subject: [PATCH 1/6] draft

Signed-off-by: Mengni Wang
---
 .../diffusers/framepack/README.md             |  54 +++
 .../diffusers/framepack/main.py               | 344 ++++++++++++++++++
 .../diffusers/framepack/requirements.txt      |   6 +
 .../diffusers/framepack/run_benchmark.sh      | 110 ++++++
 4 files changed, 514 insertions(+)
 create mode 100644 examples/pytorch/diffusion_model/diffusers/framepack/README.md
 create mode 100644 examples/pytorch/diffusion_model/diffusers/framepack/main.py
 create mode 100644 examples/pytorch/diffusion_model/diffusers/framepack/requirements.txt
 create mode 100644 examples/pytorch/diffusion_model/diffusers/framepack/run_benchmark.sh

diff --git a/examples/pytorch/diffusion_model/diffusers/framepack/README.md b/examples/pytorch/diffusion_model/diffusers/framepack/README.md
new file mode 100644
index 00000000000..496f8f3d4f7
--- /dev/null
+++ b/examples/pytorch/diffusion_model/diffusers/framepack/README.md
@@ -0,0 +1,54 @@
+# Step-by-Step
+
+This example quantizes and validates the accuracy of FramePack.
+
+# Prerequisite
+
+## 1. Environment
+
+```shell
+pip install -r requirements.txt
+pip install neural-compressor-pt==3.6
+pip install auto-round==0.8.0
+git clone https://github.com/Vchitect/VBench.git
+cd VBench
+pip install -r requirements.txt
+pip install vbench
+cd ..
+git clone https://github.com/lllyasviel/FramePack.git
+cd FramePack
+pip install -r requirements.txt
+cd ..
+```
+
+## 2. Prepare Dataset
+
+```shell
+cd VBench
+sh vbench2_beta_i2v/download_data.sh
+```
+
+# Run
+
+## BF16
+
+```bash
+CUDA_VISIBLE_DEVICES=0,1,2,3 \
+bash run_benchmark.sh \
+    --dataset_location=/path/to/VBench \
+    --output_video_path=bf16_video \
+    --dimension_list=subject_consistency i2v_background \
+```
+
+## MXFP8 or FP8
+
+```bash
+CUDA_VISIBLE_DEVICES=0,1,2,3 \
+bash run_benchmark.sh \
+    --scheme=MXFP8 \ # or FP8
+    --dataset_location=/path/to/VBench \
+    --output_video_path=mxfp8_video \
+    --dimension_list=subject_consistency i2v_background \
+```
+
+- CUDA_VISIBLE_DEVICES: split the evaluation file into the number of GPUs' subset to speed up the evaluation
diff --git a/examples/pytorch/diffusion_model/diffusers/framepack/main.py b/examples/pytorch/diffusion_model/diffusers/framepack/main.py
new file mode 100644
index 00000000000..3eb70e91abf
--- /dev/null
+++ b/examples/pytorch/diffusion_model/diffusers/framepack/main.py
@@ -0,0 +1,344 @@
+# Copyright (c) 2025 Intel Corporation
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+ +import json +import os +import sys +import argparse + +import torch + +from neural_compressor.torch.quantization import ( + AutoRoundConfig, + convert, + prepare, +) +from PIL import Image +from diffusers import AutoencoderKLHunyuanVideo +from transformers import LlamaModel, CLIPTextModel, LlamaTokenizerFast, CLIPTokenizer +from diffusers_helper.hunyuan import encode_prompt_conds, vae_decode, vae_encode, vae_decode_fake +from diffusers_helper.utils import save_bcthw_as_mp4, crop_or_pad_yield_mask, soft_append_bcthw, resize_and_center_crop, state_dict_weighted_merge, state_dict_offset_merge, generate_timestamp +from diffusers_helper.models.hunyuan_video_packed import HunyuanVideoTransformer3DModelPacked +from diffusers_helper.pipelines.k_diffusion_hunyuan import sample_hunyuan +from diffusers_helper.memory import cpu, gpu, get_cuda_free_memory_gb, move_model_to_device_with_memory_preservation, offload_model_from_device_for_memory_preservation, fake_diffusers_current_device, DynamicSwapInstaller, unload_complete_models, load_model_as_complete +from transformers import SiglipImageProcessor, SiglipVisionModel +from diffusers_helper.clip_vision import hf_clip_vision_encode +from diffusers_helper.bucket_tools import find_nearest_bucket +import torch +from diffusers_helper.models.hunyuan_video_packed import HunyuanVideoTransformer3DModelPacked +from auto_round import AutoRound +import json +import torchvision +import torch +import einops +import numpy as np +import argparse + + +parser = argparse.ArgumentParser( + description="FramePack quantization.", formatter_class=argparse.ArgumentDefaultsHelpFormatter +) +parser.add_argument("--scheme", default="MXFP8", type=str, help="quantizaion scheme.") +parser.add_argument("--quantize", action="store_true") +parser.add_argument("--inference", action="store_true") +parser.add_argument("--output_dir", "--quantized_model_path", default="./tmp_autoround", type=str, help="the directory to save quantized model") +parser.add_argument("--dataset_location", type=str, help="Path of cloned VBench repository which contains images and prompts for evaluation") +parser.add_argument("--output_video_path", default="./tmp_video", type=str, help="the directory to save generated videos") +parser.add_argument("--limit", default=-1, type=int, help="limit the number of prompts for evaluation") +parser.add_argument("--seed", default=31337, type=int, help="random seed") +parser.add_argument("--total_second_length", default=5, type=int, help="length of generated video") +parser.add_argument("--steps", default=25, type=float) +parser.add_argument("--cfg", default=1.0, type=float) +parser.add_argument("--gs", default=10.0, type=float) +parser.add_argument("--rs", default=0.0, type=float) +parser.add_argument("--gpu_memory_preservation", default=6, type=int) +parser.add_argument("--use_teacache", action="store_true") +parser.add_argument("--mp4_crf", default=16, type=int) +parser.add_argument( + "--dimension_list", + nargs="+", + choices=["subject_consistency", "background_consistency", "motion_smoothness", "dynamic_degree", "aesthetic_quality", "imaging_quality", "i2v_subject", "i2v_background", "camera_motion"], + help="list of evaluation dimensions, usage: --dimension_list ", +) +parser.add_argument("--limit", default=-1, type=int) +parser.add_argument("--ratio", default="16-9", type=str) + +args = parser.parse_args() +free_mem_gb = get_cuda_free_memory_gb(gpu) +high_vram = free_mem_gb > 60 + +@torch.no_grad() +def worker(input_image, prompt, seed, total_second_length, 
latent_window_size, steps, cfg, gs, rs, gpu_memory_preservation, use_teacache, mp4_crf): + input_image = Image.open(input_image).convert("RGB") + input_image = np.array(input_image) + total_latent_sections = (total_second_length * 30) / (latent_window_size * 4) + total_latent_sections = int(max(round(total_latent_sections), 1)) + + # Clean GPU + if not high_vram: + unload_complete_models( + text_encoder, text_encoder_2, image_encoder, vae, transformer + ) + + # Text encoding + + if not high_vram: + fake_diffusers_current_device(text_encoder, gpu) # since we only encode one text - that is one model move and one encode, offload is same time consumption since it is also one load and one encode. + load_model_as_complete(text_encoder_2, target_device=gpu) + + llama_vec, clip_l_pooler = encode_prompt_conds(prompt, text_encoder, text_encoder_2, tokenizer, tokenizer_2) + + llama_vec_n, clip_l_pooler_n = torch.zeros_like(llama_vec), torch.zeros_like(clip_l_pooler) + + llama_vec, llama_attention_mask = crop_or_pad_yield_mask(llama_vec, length=512) + llama_vec_n, llama_attention_mask_n = crop_or_pad_yield_mask(llama_vec_n, length=512) + # Processing input image + + H, W, C = input_image.shape + height, width = find_nearest_bucket(H, W, resolution=640) + input_image_np = resize_and_center_crop(input_image, target_width=width, target_height=height) + + input_image_pt = torch.from_numpy(input_image_np).float() / 127.5 - 1 + input_image_pt = input_image_pt.permute(2, 0, 1)[None, :, None] + + # VAE encoding + + if not high_vram: + load_model_as_complete(vae, target_device=gpu) + + start_latent = vae_encode(input_image_pt, vae) + + # CLIP Vision + + if not high_vram: + load_model_as_complete(image_encoder, target_device=gpu) + + image_encoder_output = hf_clip_vision_encode(input_image_np, feature_extractor, image_encoder) + image_encoder_last_hidden_state = image_encoder_output.last_hidden_state + + # Dtype + + llama_vec = llama_vec.to(transformer.dtype) + llama_vec_n = llama_vec_n.to(transformer.dtype) + clip_l_pooler = clip_l_pooler.to(transformer.dtype) + clip_l_pooler_n = clip_l_pooler_n.to(transformer.dtype) + image_encoder_last_hidden_state = image_encoder_last_hidden_state.to(transformer.dtype) + + # Sampling + + rnd = torch.Generator("cpu").manual_seed(seed) + num_frames = latent_window_size * 4 - 3 + + history_latents = torch.zeros(size=(1, 16, 1 + 2 + 16, height // 8, width // 8), dtype=torch.float32).cpu() + history_pixels = None + total_generated_latent_frames = 0 + + latent_paddings = reversed(range(total_latent_sections)) + + if total_latent_sections > 4: + # In theory the latent_paddings should follow the above sequence, but it seems that duplicating some + # items looks better than expanding it when total_latent_sections > 4 + # One can try to remove below trick and just + # use `latent_paddings = list(reversed(range(total_latent_sections)))` to compare + latent_paddings = [3] + [2] * (total_latent_sections - 3) + [1, 0] + + for latent_padding in latent_paddings: + is_last_section = latent_padding == 0 + latent_padding_size = latent_padding * latent_window_size + + print(f"latent_padding_size = {latent_padding_size}, is_last_section = {is_last_section}") + + indices = torch.arange(0, sum([1, latent_padding_size, latent_window_size, 1, 2, 16])).unsqueeze(0) + clean_latent_indices_pre, blank_indices, latent_indices, clean_latent_indices_post, clean_latent_2x_indices, clean_latent_4x_indices = indices.split([1, latent_padding_size, latent_window_size, 1, 2, 16], dim=1) + clean_latent_indices = 
torch.cat([clean_latent_indices_pre, clean_latent_indices_post], dim=1) + + clean_latents_pre = start_latent.to(history_latents) + clean_latents_post, clean_latents_2x, clean_latents_4x = history_latents[:, :, :1 + 2 + 16, :, :].split([1, 2, 16], dim=2) + clean_latents = torch.cat([clean_latents_pre, clean_latents_post], dim=2) + + if not high_vram: + unload_complete_models() + move_model_to_device_with_memory_preservation(transformer, target_device=gpu, preserved_memory_gb=gpu_memory_preservation) + + + if use_teacache: + transformer.initialize_teacache(enable_teacache=True, num_steps=steps) + else: + transformer.initialize_teacache(enable_teacache=False) + + def callback(d): + preview = d["denoised"] + preview = vae_decode_fake(preview) + + preview = (preview * 255.0).detach().cpu().numpy().clip(0, 255).astype(np.uint8) + preview = einops.rearrange(preview, "b c t h w -> (b h) (t w) c") + + current_step = d["i"] + 1 + hint = f"Sampling {current_step}/{steps}" + desc = f"Total generated frames: {int(max(0, total_generated_latent_frames * 4 - 3))}, Video length: {max(0, (total_generated_latent_frames * 4 - 3) / 30) :.2f} seconds (FPS-30). The video is being extended now ..." + print(hint, desc) + return + + generated_latents = sample_hunyuan( + transformer=transformer, + sampler="unipc", + width=width, + height=height, + frames=num_frames, + real_guidance_scale=cfg, + distilled_guidance_scale=gs, + guidance_rescale=rs, + # shift=3.0, + num_inference_steps=steps, + generator=rnd, + prompt_embeds=llama_vec, + prompt_embeds_mask=llama_attention_mask, + prompt_poolers=clip_l_pooler, + negative_prompt_embeds=llama_vec_n, + negative_prompt_embeds_mask=llama_attention_mask_n, + negative_prompt_poolers=clip_l_pooler_n, + device=gpu, + dtype=torch.bfloat16, + image_embeddings=image_encoder_last_hidden_state, + latent_indices=latent_indices, + clean_latents=clean_latents, + clean_latent_indices=clean_latent_indices, + clean_latents_2x=clean_latents_2x, + clean_latent_2x_indices=clean_latent_2x_indices, + clean_latents_4x=clean_latents_4x, + clean_latent_4x_indices=clean_latent_4x_indices, + callback=callback, + ) + if is_last_section: + generated_latents = torch.cat([start_latent.to(generated_latents), generated_latents], dim=2) + + total_generated_latent_frames += int(generated_latents.shape[2]) + history_latents = torch.cat([generated_latents.to(history_latents), history_latents], dim=2) + + if not high_vram: + offload_model_from_device_for_memory_preservation(transformer, target_device=gpu, preserved_memory_gb=8) + load_model_as_complete(vae, target_device=gpu) + + real_history_latents = history_latents[:, :, :total_generated_latent_frames, :, :] + + if history_pixels is None: + history_pixels = vae_decode(real_history_latents, vae).cpu() + else: + section_latent_frames = (latent_window_size * 2 + 1) if is_last_section else (latent_window_size * 2) + overlapped_frames = latent_window_size * 4 - 3 + + current_pixels = vae_decode(real_history_latents[:, :, :section_latent_frames], vae).cpu() + history_pixels = soft_append_bcthw(current_pixels, history_pixels, overlapped_frames) + + if not high_vram: + unload_complete_models() + + print(f"Decoded. 
Current latent shape {real_history_latents.shape}; pixel shape {history_pixels.shape}") + + if is_last_section: + break + return history_pixels + +if __name__ == "__main__": + transformer = HunyuanVideoTransformer3DModelPacked.from_pretrained("lllyasviel/FramePackI2V_HY", torch_dtype=torch.bfloat16).cpu() + if args.quantize: + print(f"Start to quantize {args.model}.") + setattr(transformer, "name_or_path", "lllyasviel/FramePackI2V_HY") + + qconfig = AutoRoundConfig( + scheme=args.scheme, + iters=0, + export_format="fake", + output_dir=args.output_dir, + ) + transformer = prepare(transformer, qconfig) + transformer = convert(transformer, qconfig) + + if args.inference: + text_encoder = LlamaModel.from_pretrained("hunyuanvideo-community/HunyuanVideo", subfolder="text_encoder", torch_dtype=torch.float16).cpu() + text_encoder_2 = CLIPTextModel.from_pretrained("hunyuanvideo-community/HunyuanVideo", subfolder="text_encoder_2", torch_dtype=torch.float16).cpu() + tokenizer = LlamaTokenizerFast.from_pretrained("hunyuanvideo-community/HunyuanVideo", subfolder="tokenizer") + tokenizer_2 = CLIPTokenizer.from_pretrained("hunyuanvideo-community/HunyuanVideo", subfolder="tokenizer_2") + vae = AutoencoderKLHunyuanVideo.from_pretrained("hunyuanvideo-community/HunyuanVideo", subfolder="vae", torch_dtype=torch.float16).cpu() + + feature_extractor = SiglipImageProcessor.from_pretrained("lllyasviel/flux_redux_bfl", subfolder="feature_extractor") + image_encoder = SiglipVisionModel.from_pretrained("lllyasviel/flux_redux_bfl", subfolder="image_encoder", torch_dtype=torch.float16).cpu() + + vae.eval() + text_encoder.eval() + text_encoder_2.eval() + image_encoder.eval() + transformer.eval() + + if not high_vram: + vae.enable_slicing() + vae.enable_tiling() + + transformer.high_quality_fp32_output_for_inference = True + print("transformer.high_quality_fp32_output_for_inference = True") + + transformer.to(dtype=torch.bfloat16) + vae.to(dtype=torch.float16) + image_encoder.to(dtype=torch.float16) + text_encoder.to(dtype=torch.float16) + text_encoder_2.to(dtype=torch.float16) + + vae.requires_grad_(False) + text_encoder.requires_grad_(False) + text_encoder_2.requires_grad_(False) + image_encoder.requires_grad_(False) + transformer.requires_grad_(False) + + if not high_vram: + # DynamicSwapInstaller is same as huggingface"s enable_sequential_offload but 3x faster + DynamicSwapInstaller.install_model(transformer, device=gpu) + DynamicSwapInstaller.install_model(text_encoder, device=gpu) + else: + text_encoder.to(gpu) + text_encoder_2.to(gpu) + image_encoder.to(gpu) + vae.to(gpu) + transformer.to(gpu) + + idx = 0 + for dimension in args.dimension_list: + # prepare inputs + + image_folder = f"{args.dataset_location}/vbench2_beta_i2v/data/crop/{args.ratio}" + info_list = json.load(open(f"{args.dataset_location}/vbench2_beta_i2v/vbench2_i2v_full_info.json", "r")) + inputs = [(os.path.join(image_folder, info["image_name"]), info["prompt_en"]) for info in info_list if dimension in info["dimension"]] + for image_path, prompt in inputs: + if args.limit > 0 and idx >= args.limit: + break + + # only sample 1 video for each prompt to evalute quickly + cur_save_path = f"{args.output_video_path}/{prompt}-0.mp4" + + if os.path.exists(cur_save_path): + continue + idx += 1 + # perform sampling + x = worker(image_path, prompt, args.seed, args.total_second_length, args.latent_window_size, args.steps, args.cfg, args.gs, args.rs, args.gpu_memory_preservation, args.use_teacache, args.mp4_crf) + b, c, t, h, w = x.shape + + per_row = b + for 
p in [6, 5, 4, 3, 2]: + if b % p == 0: + per_row = p + break + + x = torch.clamp(x.float(), -1., 1.) * 127.5 + 127.5 + x = x.detach().cpu().to(torch.uint8) + video = einops.rearrange(x, "(m n) c t h w -> t (m h) (n w) c", n=per_row) + torchvision.io.write_video(cur_save_path, video, fps=30, video_codec="h264", options={"crf": "10"}) diff --git a/examples/pytorch/diffusion_model/diffusers/framepack/requirements.txt b/examples/pytorch/diffusion_model/diffusers/framepack/requirements.txt new file mode 100644 index 00000000000..1d6637869b3 --- /dev/null +++ b/examples/pytorch/diffusion_model/diffusers/framepack/requirements.txt @@ -0,0 +1,6 @@ +diffusers==0.35.1 +pandas==2.2.2 +clip==0.2.0 +image-reward==1.5 +torchmetrics==1.8.2 +transformers==4.55.0 diff --git a/examples/pytorch/diffusion_model/diffusers/framepack/run_benchmark.sh b/examples/pytorch/diffusion_model/diffusers/framepack/run_benchmark.sh new file mode 100644 index 00000000000..4ef41900c1e --- /dev/null +++ b/examples/pytorch/diffusion_model/diffusers/framepack/run_benchmark.sh @@ -0,0 +1,110 @@ +#!/bin/bash +set -x + +function main { + + init_params "$@" + run_benchmark + +} + +# init params +function init_params { + for var in "$@" + do + case $var in + --topology=*) + topology=$(echo $var |cut -f2 -d=) + ;; + --dataset_location=*) + dataset_location=$(echo $var |cut -f2 -d=) + ;; + --ratio=*) + ratio=$(echo $var |cut -f2 -d=) + ;; + --input_model=*) + input_model=$(echo $var |cut -f2 -d=) + ;; + --limit=*) + limit=$(echo $var |cut -f2 -d=) + ;; + --output_video_path=*) + output_video_path=$(echo $var |cut -f2 -d=) + ;; + --dimension_list=*) + dimension_list=$(echo $var |cut -f2 -d=) + ;; + *) + echo "Error: No such parameter: ${var}" + exit 1 + ;; + esac + done + +} + + +# run_benchmark +function run_benchmark { + limit=${limit:=-1} + ratio=${ratio:="16-9"} + output_video_path=${output_video_path:="./tmp_videos"} + + if [ "${topology}" = "FP8" ]; then + extra_cmd="--scheme FP8 --quantize --inference" + elif [ "${topology}" = "MXFP8" ]; then + extra_cmd="--scheme MXFP8 --quantize --inference" + elif [ "${topology}" = "BF16" ]; then + extra_cmd="--inference" + fi + + if [ -n "$CUDA_VISIBLE_DEVICES" ]; then + gpu_list="${CUDA_VISIBLE_DEVICES:-}" + IFS=',' read -ra gpu_ids <<< "$gpu_list" + visible_gpus=${#gpu_ids[@]} + echo "visible_gpus: ${visible_gpus}" + + IFS=' ' read -ra dimensions <<< "$dimension_list" + dimension_num=${#dimensions[@]} + if [ "${visible_gpus}" > "${dimension_num}" ]; then + count=${dimension_num} + else + count=${visible_gpus} + sliced=("${dimensions[@]:count-1:dimension_num-visible_gpus}") + dimensions="${sliced[*]}" + fi + + for ((i=0; i Date: Fri, 21 Nov 2025 06:46:51 +0000 Subject: [PATCH 2/6] fix bug and enhance script Signed-off-by: Mengni Wang --- .../diffusers/framepack/README.md | 29 ++++++----- .../diffusers/framepack/main.py | 37 +++++++------ .../diffusers/framepack/requirements.txt | 41 ++++++++++++--- .../diffusers/framepack/run_benchmark.sh | 52 +++++++++++++------ .../torch/algorithms/weight_only/autoround.py | 1 - 5 files changed, 106 insertions(+), 54 deletions(-) diff --git a/examples/pytorch/diffusion_model/diffusers/framepack/README.md b/examples/pytorch/diffusion_model/diffusers/framepack/README.md index 496f8f3d4f7..042b8e109bc 100644 --- a/examples/pytorch/diffusion_model/diffusers/framepack/README.md +++ b/examples/pytorch/diffusion_model/diffusers/framepack/README.md @@ -7,23 +7,23 @@ This example quantizes and validates the accuracy of Flux. ## 1. 
Environment ```shell +# install zip according to your system +sudo apt update && sudo apt install zip + pip install -r requirements.txt -pip install neural-compressor-pt==3.6 -pip install auto-round==0.8.0 -git clone https://github.com/Vchitect/VBench.git -cd VBench -pip install -r requirements.txt -pip install vbench -cd .. -git clone https://github.com/lllyasviel/FramePack.git -cd FramePack -pip install -r requirements.txt -cd .. +pip install --update neural-compressor-pt +pip install --update auto-round +git clone --depth 1 https://github.com/lllyasviel/FramePack.git +cp -r FramePack/diffusers_helper/ . + +# several models will be downloaded automatically into HF_HOME +export HF_HOME=/path/to/save/model ``` ## 2. Prepare Dataset ```shell +git clone --depth 1 https://github.com/Vchitect/VBench.git cd VBench sh vbench2_beta_i2v/download_data.sh ``` @@ -35,9 +35,11 @@ sh vbench2_beta_i2v/download_data.sh ```bash CUDA_VISIBLE_DEVICES=0,1,2,3 \ bash run_benchmark.sh \ + --topology=BF16 \ --dataset_location=/path/to/VBench \ --output_video_path=bf16_video \ --dimension_list=subject_consistency i2v_background \ + --result_path=bf16_result ``` ## MXFP8 or FP8 @@ -45,10 +47,11 @@ bash run_benchmark.sh \ ```bash CUDA_VISIBLE_DEVICES=0,1,2,3 \ bash run_benchmark.sh \ - --scheme=MXFP8 \ # or FP8 + --topology=MXFP8 \ # or FP8 --dataset_location=/path/to/VBench \ --output_video_path=mxfp8_video \ --dimension_list=subject_consistency i2v_background \ + --result_path=mxfp8_result ``` -- CUDA_VISIBLE_DEVICES: split the evaluation file into the number of GPUs' subset to speed up the evaluation +- CUDA_VISIBLE_DEVICES: distribute the dimension_list to different visible GPUs to speed up the evaluation diff --git a/examples/pytorch/diffusion_model/diffusers/framepack/main.py b/examples/pytorch/diffusion_model/diffusers/framepack/main.py index 3eb70e91abf..686d11b5da6 100644 --- a/examples/pytorch/diffusion_model/diffusers/framepack/main.py +++ b/examples/pytorch/diffusion_model/diffusers/framepack/main.py @@ -53,26 +53,26 @@ parser.add_argument("--quantize", action="store_true") parser.add_argument("--inference", action="store_true") parser.add_argument("--output_dir", "--quantized_model_path", default="./tmp_autoround", type=str, help="the directory to save quantized model") -parser.add_argument("--dataset_location", type=str, help="Path of cloned VBench repository which contains images and prompts for evaluation") +parser.add_argument("--dataset_location", type=str, help="path of cloned VBench repository which contains images and prompts for evaluation") parser.add_argument("--output_video_path", default="./tmp_video", type=str, help="the directory to save generated videos") parser.add_argument("--limit", default=-1, type=int, help="limit the number of prompts for evaluation") parser.add_argument("--seed", default=31337, type=int, help="random seed") parser.add_argument("--total_second_length", default=5, type=int, help="length of generated video") -parser.add_argument("--steps", default=25, type=float) -parser.add_argument("--cfg", default=1.0, type=float) -parser.add_argument("--gs", default=10.0, type=float) -parser.add_argument("--rs", default=0.0, type=float) +parser.add_argument("--latent_window_size", default=9, type=int) +parser.add_argument("--steps", default=25, type=float, help="number of inference step") +parser.add_argument("--cfg", default=1.0, type=float, help="real guidance scale") +parser.add_argument("--gs", default=10.0, type=float, help="distilled guidance scale") 
+parser.add_argument("--rs", default=0.0, type=float, help="guidance rescale") parser.add_argument("--gpu_memory_preservation", default=6, type=int) -parser.add_argument("--use_teacache", action="store_true") -parser.add_argument("--mp4_crf", default=16, type=int) +parser.add_argument("--use_teacache", action="store_true", help="faster speed, but often makes hands and fingers slightly worse") +parser.add_argument("--mp4_crf", default=16, type=int, help="MP4 compression. Lower means better quality. 0 is uncompressed. Change to 16 if you get black outputs.") parser.add_argument( "--dimension_list", nargs="+", choices=["subject_consistency", "background_consistency", "motion_smoothness", "dynamic_degree", "aesthetic_quality", "imaging_quality", "i2v_subject", "i2v_background", "camera_motion"], help="list of evaluation dimensions, usage: --dimension_list ", ) -parser.add_argument("--limit", default=-1, type=int) -parser.add_argument("--ratio", default="16-9", type=str) +parser.add_argument("--ratio", default="16-9", type=str, help="aspect ratio of image") args = parser.parse_args() free_mem_gb = get_cuda_free_memory_gb(gpu) @@ -252,8 +252,11 @@ def callback(d): if __name__ == "__main__": transformer = HunyuanVideoTransformer3DModelPacked.from_pretrained("lllyasviel/FramePackI2V_HY", torch_dtype=torch.bfloat16).cpu() + transformer.to(dtype=torch.bfloat16) + transformer.requires_grad_(False) + transformer.eval() + if args.quantize: - print(f"Start to quantize {args.model}.") setattr(transformer, "name_or_path", "lllyasviel/FramePackI2V_HY") qconfig = AutoRoundConfig( @@ -279,7 +282,6 @@ def callback(d): text_encoder.eval() text_encoder_2.eval() image_encoder.eval() - transformer.eval() if not high_vram: vae.enable_slicing() @@ -288,7 +290,6 @@ def callback(d): transformer.high_quality_fp32_output_for_inference = True print("transformer.high_quality_fp32_output_for_inference = True") - transformer.to(dtype=torch.bfloat16) vae.to(dtype=torch.float16) image_encoder.to(dtype=torch.float16) text_encoder.to(dtype=torch.float16) @@ -298,12 +299,11 @@ def callback(d): text_encoder.requires_grad_(False) text_encoder_2.requires_grad_(False) image_encoder.requires_grad_(False) - transformer.requires_grad_(False) if not high_vram: # DynamicSwapInstaller is same as huggingface"s enable_sequential_offload but 3x faster - DynamicSwapInstaller.install_model(transformer, device=gpu) DynamicSwapInstaller.install_model(text_encoder, device=gpu) + DynamicSwapInstaller.install_model(transformer, device=gpu) else: text_encoder.to(gpu) text_encoder_2.to(gpu) @@ -311,14 +311,18 @@ def callback(d): vae.to(gpu) transformer.to(gpu) + if not os.path.exists(args.output_video_path): + os.makedirs(args.output_video_path) + idx = 0 for dimension in args.dimension_list: # prepare inputs - image_folder = f"{args.dataset_location}/vbench2_beta_i2v/data/crop/{args.ratio}" - info_list = json.load(open(f"{args.dataset_location}/vbench2_beta_i2v/vbench2_i2v_full_info.json", "r")) + image_folder = os.path.join(args.dataset_location, f"vbench2_beta_i2v/data/crop/{args.ratio}") + info_list = json.load(open(os.path.join(args.dataset_location, "vbench2_beta_i2v/vbench2_i2v_full_info.json"), "r")) inputs = [(os.path.join(image_folder, info["image_name"]), info["prompt_en"]) for info in info_list if dimension in info["dimension"]] for image_path, prompt in inputs: + idx += 1 if args.limit > 0 and idx >= args.limit: break @@ -327,7 +331,6 @@ def callback(d): if os.path.exists(cur_save_path): continue - idx += 1 # perform sampling x = 
worker(image_path, prompt, args.seed, args.total_second_length, args.latent_window_size, args.steps, args.cfg, args.gs, args.rs, args.gpu_memory_preservation, args.use_teacache, args.mp4_crf) b, c, t, h, w = x.shape diff --git a/examples/pytorch/diffusion_model/diffusers/framepack/requirements.txt b/examples/pytorch/diffusion_model/diffusers/framepack/requirements.txt index 1d6637869b3..06e80a0af7f 100644 --- a/examples/pytorch/diffusion_model/diffusers/framepack/requirements.txt +++ b/examples/pytorch/diffusion_model/diffusers/framepack/requirements.txt @@ -1,6 +1,35 @@ -diffusers==0.35.1 -pandas==2.2.2 -clip==0.2.0 -image-reward==1.5 -torchmetrics==1.8.2 -transformers==4.55.0 +Pillow +matplotlib +timm>=0.9,<=1.0.12 +wheel +cython +tensorboard +scipy +scikit-learn +scikit-image +openai-clip +decord +requests +pyyaml +easydict +pyiqa +lvis +fairscale>=0.4.4 +fvcore +easydict +urllib3 +boto3 +omegaconf +transformers +pycocoevalcap +detectron2@git+https://github.com/facebookresearch/detectron2.git +accelerate +diffusers +sentencepiece==0.2.0 +av==12.1.0 +torchsde==0.2.6 +einops +safetensors +opencv-python-headless +dreamsim +numpy<2.0.0 diff --git a/examples/pytorch/diffusion_model/diffusers/framepack/run_benchmark.sh b/examples/pytorch/diffusion_model/diffusers/framepack/run_benchmark.sh index 4ef41900c1e..91f0babe92e 100644 --- a/examples/pytorch/diffusion_model/diffusers/framepack/run_benchmark.sh +++ b/examples/pytorch/diffusion_model/diffusers/framepack/run_benchmark.sh @@ -31,6 +31,9 @@ function init_params { --output_video_path=*) output_video_path=$(echo $var |cut -f2 -d=) ;; + --result_path=*) + result_path=$(echo $var |cut -f2 -d=) + ;; --dimension_list=*) dimension_list=$(echo $var |cut -f2 -d=) ;; @@ -49,6 +52,15 @@ function run_benchmark { limit=${limit:=-1} ratio=${ratio:="16-9"} output_video_path=${output_video_path:="./tmp_videos"} + result_path=${result_path:="./eval_result"} + + if [[ ! "${result_path}" = /* ]]; then + result_path=$(realpath -s "$(pwd)/$result_path") + fi + + if [[ ! 
"${output_video_path}" = /* ]]; then + output_video_path=$(realpath -s "$(pwd)/$output_video_path") + fi if [ "${topology}" = "FP8" ]; then extra_cmd="--scheme FP8 --quantize --inference" @@ -60,51 +72,57 @@ function run_benchmark { if [ -n "$CUDA_VISIBLE_DEVICES" ]; then gpu_list="${CUDA_VISIBLE_DEVICES:-}" - IFS=',' read -ra gpu_ids <<< "$gpu_list" - visible_gpus=${#gpu_ids[@]} - echo "visible_gpus: ${visible_gpus}" + IFS=',' read -ra gpu_ids <<< "$gpu_list" + visible_gpus=${#gpu_ids[@]} + echo "visible_gpus: ${visible_gpus}" IFS=' ' read -ra dimensions <<< "$dimension_list" dimension_num=${#dimensions[@]} - if [ "${visible_gpus}" > "${dimension_num}" ]; then + if [ "${visible_gpus}" -gt "${dimension_num}" ]; then count=${dimension_num} else count=${visible_gpus} - sliced=("${dimensions[@]:count-1:dimension_num-visible_gpus}") - dimensions="${sliced[*]}" + left=${dimensions[@]:count-1:dimension_num} + dimensions=("${dimensions[@]:0:count-1}" "$left") fi for ((i=0; i&1) + result_file=$(echo "$output" | grep -i "Evaluation results saved to " | awk '{print $NF}') + + echo "Evaluation results saved to ${result_file}" + zip -r "${result_path}.zip" ${result_path} + python scripts/cal_i2v_final_score.py --zip_file "${result_path}.zip" --model_name "framepack" + } main "$@" + diff --git a/neural_compressor/torch/algorithms/weight_only/autoround.py b/neural_compressor/torch/algorithms/weight_only/autoround.py index 5fa3b253cfa..2342f9f5b84 100644 --- a/neural_compressor/torch/algorithms/weight_only/autoround.py +++ b/neural_compressor/torch/algorithms/weight_only/autoround.py @@ -268,7 +268,6 @@ def convert(self, model: torch.nn.Module, *args, **kwargs): if tokenizer is not None: delattr(model.orig_model, "tokenizer") elif pipe is None: - tokenizer = "Placeholder" self.dataset = CapturedDataloader(model.args_list, model.kwargs_list) model = model.orig_model if pipe is not None: From a0a80cfcf4e000d62bc3632161f3c5f8eaaf20de Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Fri, 21 Nov 2025 06:49:39 +0000 Subject: [PATCH 3/6] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- examples/pytorch/diffusion_model/diffusers/framepack/main.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples/pytorch/diffusion_model/diffusers/framepack/main.py b/examples/pytorch/diffusion_model/diffusers/framepack/main.py index 686d11b5da6..51584f2133e 100644 --- a/examples/pytorch/diffusion_model/diffusers/framepack/main.py +++ b/examples/pytorch/diffusion_model/diffusers/framepack/main.py @@ -326,7 +326,7 @@ def callback(d): if args.limit > 0 and idx >= args.limit: break - # only sample 1 video for each prompt to evalute quickly + # only sample 1 video for each prompt to evaluate quickly cur_save_path = f"{args.output_video_path}/{prompt}-0.mp4" if os.path.exists(cur_save_path): From b37f428ef0a5231067997c4b81d3e46f2450e599 Mon Sep 17 00:00:00 2001 From: Mengni Wang Date: Fri, 21 Nov 2025 08:31:53 +0000 Subject: [PATCH 4/6] update script Signed-off-by: Mengni Wang --- .../diffusers/framepack/run_benchmark.sh | 34 ++++++++++--------- 1 file changed, 18 insertions(+), 16 deletions(-) diff --git a/examples/pytorch/diffusion_model/diffusers/framepack/run_benchmark.sh b/examples/pytorch/diffusion_model/diffusers/framepack/run_benchmark.sh index 91f0babe92e..d22d03be70c 100644 --- a/examples/pytorch/diffusion_model/diffusers/framepack/run_benchmark.sh +++ 
b/examples/pytorch/diffusion_model/diffusers/framepack/run_benchmark.sh @@ -80,20 +80,22 @@ function run_benchmark { dimension_num=${#dimensions[@]} if [ "${visible_gpus}" -gt "${dimension_num}" ]; then count=${dimension_num} + step=1 else count=${visible_gpus} - left=${dimensions[@]:count-1:dimension_num} - dimensions=("${dimensions[@]:0:count-1}" "$left") + step=$((dimension_num/visible_gpus)) + left=${dimensions[@]:step*count-1:dimension_num} + dimensions=("${dimensions[@]:0:step*count-1}" "$left") fi for ((i=0; i&1) - result_file=$(echo "$output" | grep -i "Evaluation results saved to " | awk '{print $NF}') - - echo "Evaluation results saved to ${result_file}" - zip -r "${result_path}.zip" ${result_path} - python scripts/cal_i2v_final_score.py --zip_file "${result_path}.zip" --model_name "framepack" + #echo "Start calculating final score..." + #cd ${dataset_location} + #output=$(python evaluate_i2v.py \ + # --videos_path ${output_video_path} \ + # --dimension ${dimension_list} \ + # --output_path ${result_path} \ + # --ratio ${ratio} 2>&1) + #result_file=$(echo "$output" | grep -i "Evaluation results saved to " | awk '{print $NF}') + + #echo "Evaluation results saved to ${result_file}" + #zip -r "${result_path}.zip" ${result_path} + #python scripts/cal_i2v_final_score.py --zip_file "${result_path}.zip" --model_name "framepack" } From c657c0b86d69692780b48f736a1c608d5a48f353 Mon Sep 17 00:00:00 2001 From: "Wang, Mengni" Date: Fri, 21 Nov 2025 16:34:05 +0800 Subject: [PATCH 5/6] Update run_benchmark.sh --- .../diffusion_model/diffusers/framepack/run_benchmark.sh | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/examples/pytorch/diffusion_model/diffusers/framepack/run_benchmark.sh b/examples/pytorch/diffusion_model/diffusers/framepack/run_benchmark.sh index d22d03be70c..8b090297ac8 100644 --- a/examples/pytorch/diffusion_model/diffusers/framepack/run_benchmark.sh +++ b/examples/pytorch/diffusion_model/diffusers/framepack/run_benchmark.sh @@ -80,10 +80,10 @@ function run_benchmark { dimension_num=${#dimensions[@]} if [ "${visible_gpus}" -gt "${dimension_num}" ]; then count=${dimension_num} - step=1 + step=1 else count=${visible_gpus} - step=$((dimension_num/visible_gpus)) + step=$((dimension_num/visible_gpus)) left=${dimensions[@]:step*count-1:dimension_num} dimensions=("${dimensions[@]:0:step*count-1}" "$left") fi From a231251ba74d3ddf55ae9b6027f42da4998140ab Mon Sep 17 00:00:00 2001 From: "Wang, Mengni" Date: Fri, 21 Nov 2025 16:35:01 +0800 Subject: [PATCH 6/6] Update run_benchmark.sh --- .../diffusers/framepack/run_benchmark.sh | 24 +++++++++---------- 1 file changed, 12 insertions(+), 12 deletions(-) diff --git a/examples/pytorch/diffusion_model/diffusers/framepack/run_benchmark.sh b/examples/pytorch/diffusion_model/diffusers/framepack/run_benchmark.sh index 8b090297ac8..71dc61b38d9 100644 --- a/examples/pytorch/diffusion_model/diffusers/framepack/run_benchmark.sh +++ b/examples/pytorch/diffusion_model/diffusers/framepack/run_benchmark.sh @@ -111,18 +111,18 @@ function run_benchmark { ${extra_cmd} fi - #echo "Start calculating final score..." 
- #cd ${dataset_location} - #output=$(python evaluate_i2v.py \ - # --videos_path ${output_video_path} \ - # --dimension ${dimension_list} \ - # --output_path ${result_path} \ - # --ratio ${ratio} 2>&1) - #result_file=$(echo "$output" | grep -i "Evaluation results saved to " | awk '{print $NF}') - - #echo "Evaluation results saved to ${result_file}" - #zip -r "${result_path}.zip" ${result_path} - #python scripts/cal_i2v_final_score.py --zip_file "${result_path}.zip" --model_name "framepack" + echo "Start calculating final score..." + cd ${dataset_location} + output=$(python evaluate_i2v.py \ + --videos_path ${output_video_path} \ + --dimension ${dimension_list} \ + --output_path ${result_path} \ + --ratio ${ratio} 2>&1) + result_file=$(echo "$output" | grep -i "Evaluation results saved to " | awk '{print $NF}') + + echo "Evaluation results saved to ${result_file}" + zip -r "${result_path}.zip" ${result_path} + python scripts/cal_i2v_final_score.py --zip_file "${result_path}.zip" --model_name "framepack" }
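
For readers who want the quantization step without the full video-generation pipeline, the sketch below condenses what `main.py` does when `run_benchmark.sh` selects the FP8 or MXFP8 topology (`--scheme ... --quantize --inference`). It is a minimal sketch, not part of the patch: it assumes `neural-compressor-pt` and `auto-round` are installed and that FramePack's `diffusers_helper` package has been copied into the working directory as the README describes, and the argument values simply mirror the defaults used in `main.py`.

```python
# Condensed quantization path from main.py: load the FramePack transformer and
# apply AutoRound fake quantization via neural-compressor's prepare/convert.
import torch
from diffusers_helper.models.hunyuan_video_packed import HunyuanVideoTransformer3DModelPacked
from neural_compressor.torch.quantization import AutoRoundConfig, convert, prepare

transformer = HunyuanVideoTransformer3DModelPacked.from_pretrained(
    "lllyasviel/FramePackI2V_HY", torch_dtype=torch.bfloat16
).cpu()
transformer.requires_grad_(False)
transformer.eval()
setattr(transformer, "name_or_path", "lllyasviel/FramePackI2V_HY")  # set explicitly, as in the patch

qconfig = AutoRoundConfig(
    scheme="MXFP8",        # or "FP8"
    iters=0,               # round-to-nearest, no tuning iterations
    export_format="fake",  # keep fake-quantized weights so the model can be sampled from directly
    output_dir="./tmp_autoround",
)
transformer = prepare(transformer, qconfig)
transformer = convert(transformer, qconfig)
# worker() then uses this transformer for sampling, exactly as the BF16 path does.
```

Everything else in the series (model offloading, VBench prompt iteration, and the evaluation/zip scoring step re-enabled in the final patch) is orthogonal to this call sequence.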