From 54ae4a05356088b777cfa5e059735c58dd9a44ae Mon Sep 17 00:00:00 2001
From: Mengni Wang
Date: Sun, 28 Sep 2025 23:14:12 -0400
Subject: [PATCH 01/10] add initial files

Signed-off-by: Mengni Wang
---
 .../auto_round/llama4/requirements.txt        |  3 +
 .../auto_round/llama4/run_benchmark.sh        | 57 +++++++++++++++++++
 .../auto_round/llama4/run_quant.sh            | 57 +++++++++++++++++++
 3 files changed, 117 insertions(+)
 create mode 100644 examples/3.x_api/pytorch/multimodal-modeling/quantization/auto_round/llama4/requirements.txt
 create mode 100644 examples/3.x_api/pytorch/multimodal-modeling/quantization/auto_round/llama4/run_benchmark.sh
 create mode 100644 examples/3.x_api/pytorch/multimodal-modeling/quantization/auto_round/llama4/run_quant.sh

diff --git a/examples/3.x_api/pytorch/multimodal-modeling/quantization/auto_round/llama4/requirements.txt b/examples/3.x_api/pytorch/multimodal-modeling/quantization/auto_round/llama4/requirements.txt
new file mode 100644
index 00000000000..22c98c95389
--- /dev/null
+++ b/examples/3.x_api/pytorch/multimodal-modeling/quantization/auto_round/llama4/requirements.txt
@@ -0,0 +1,3 @@
+auto-tound
+compressed-tensors
+lm-eval
\ No newline at end of file
diff --git a/examples/3.x_api/pytorch/multimodal-modeling/quantization/auto_round/llama4/run_benchmark.sh b/examples/3.x_api/pytorch/multimodal-modeling/quantization/auto_round/llama4/run_benchmark.sh
new file mode 100644
index 00000000000..085cda09b2b
--- /dev/null
+++ b/examples/3.x_api/pytorch/multimodal-modeling/quantization/auto_round/llama4/run_benchmark.sh
@@ -0,0 +1,57 @@
+#!/bin/bash
+set -x
+
+function main {
+  init_params "$@"
+  run_benchmark
+
+}
+
+# init params
+function init_params {
+  for var in "$@"
+  do
+    case $var in
+      --input_model=*)
+          input_model=$(echo $var |cut -f2 -d=)
+      ;;
+      --topology=*)
+          topology=$(echo $var |cut -f2 -d=)
+      ;;
+      --tasks=*)
+          tasks=$(echo $var |cut -f2 -d=)
+      ;;
+      --tp_size=*)
+          tp_size=$(echo $var |cut -f2 -d=)
+      ;;
+      --batch_size=*)
+          batch_size=$(echo $var |cut -f2 -d=)
+      ;;
+    esac
+  done
+
+}
+
+# run_benchmark
+function run_benchmark {
+
+    extra_model_args=""
+
+    if [ "${topology}" = "llama4_mxfp4" ]; then
+        extra_model_args="max_model_len=8192,max_num_seqs=1024,max_gen_toks=2048,kv_cache_dtype=auto"
+    fi
+
+    if [ "${tasks}" == *"chartqa"* || "${tasks}" == *"mmmu_val"* ]; then
+        model="vllm-vlm"
+    else
+        model="vllm"
+    fi
+
+    VLLM_USE_STANDALONE_COMPILE=0 VLLM_WORKER_MULTIPROC_METHOD=spawn \
+    lm_eval --model ${model} \
+        --model_args pretrained=${input_model},tensor_parallel_size=${tp_size},${extra_model_args},enable_expert_parallel=True \
+        --tasks ${tasks} \
+        --batch_size ${batch_size}
+}
+
+main "$@"
diff --git a/examples/3.x_api/pytorch/multimodal-modeling/quantization/auto_round/llama4/run_quant.sh b/examples/3.x_api/pytorch/multimodal-modeling/quantization/auto_round/llama4/run_quant.sh
new file mode 100644
index 00000000000..5b484230db6
--- /dev/null
+++ b/examples/3.x_api/pytorch/multimodal-modeling/quantization/auto_round/llama4/run_quant.sh
@@ -0,0 +1,57 @@
+#!/bin/bash
+set -x
+
+function main {
+
+  init_params "$@"
+  run_tuning
+
+}
+
+# init params
+function init_params {
+  for var in "$@"
+  do
+    case $var in
+      --topology=*)
+          topology=$(echo $var |cut -f2 -d=)
+      ;;
+      --iters=*)
+          iters=$(echo $var |cut -f2 -d=)
+      ;;
+      --dataset_location=*)
+          dataset_location=$(echo $var |cut -f2 -d=)
+      ;;
+      --input_model=*)
+          input_model=$(echo $var |cut -f2 -d=)
+      ;;
+      --output_model=*)
+          tuned_checkpoint=$(echo $var |cut -f2 -d=)
+      ;;
+      *)
+          echo "Error: No such parameter: ${var}"
+          exit 1
+      ;;
+    esac
+  done
+
+}
+
+# run_tuning
+function run_tuning {
+    extra_cmd=""
+    tuned_checkpoint=${tuned_checkpoint:="saved_results"}
+
+    if [ "${topology}" = "llama4_mxfp4" ]; then
+        extra_cmd="--fp_layers lm-head,self_attn,router,vision_model,multi_modal_projector,shared_expert --scheme MXFP4"
+    fi
+
+    python3 -m auto_round \
+        --model ${input_model} \
+        --iters ${iters} \
+        --format "llm_compressor" \
+        --output_dir ${tuned_checkpoint}
+        ${extra_cmd}
+}
+
+main "$@"

From dae6bc8bc23dce4e93c023dfc71daa70f57acf1d Mon Sep 17 00:00:00 2001
From: Mengni Wang
Date: Mon, 29 Sep 2025 03:06:40 -0400
Subject: [PATCH 02/10] update example

Signed-off-by: Mengni Wang
---
 .../quantization/auto_round/llama4/README.md  | 36 +++++++++++++++++++
 .../auto_round/llama4/requirements.txt        |  7 ++--
 .../auto_round/llama4/run_benchmark.sh        | 10 ++++--
 .../auto_round/llama4/run_quant.sh            |  3 +-
 .../quantization/auto_round/llama4/setup.sh   |  8 +++++
 5 files changed, 58 insertions(+), 6 deletions(-)
 create mode 100644 examples/3.x_api/pytorch/multimodal-modeling/quantization/auto_round/llama4/README.md
 create mode 100644 examples/3.x_api/pytorch/multimodal-modeling/quantization/auto_round/llama4/setup.sh

diff --git a/examples/3.x_api/pytorch/multimodal-modeling/quantization/auto_round/llama4/README.md b/examples/3.x_api/pytorch/multimodal-modeling/quantization/auto_round/llama4/README.md
new file mode 100644
index 00000000000..b7a1f5272b9
--- /dev/null
+++ b/examples/3.x_api/pytorch/multimodal-modeling/quantization/auto_round/llama4/README.md
@@ -0,0 +1,36 @@
+# Step-by-Step
+
+This example quantizes Llama 4 with AutoRound and validates the accuracy of the quantized model.
+
+# Prerequisite
+
+## 1. Environment
+
+```shell
+docker run -d --gpus all -v ... --shm-size=100g --name llama4 -it nvcr.io/nvidia/pytorch:25.05-py3 /bin/bash
+docker exec -it llama4 bash
+git clone https://github.com/intel/neural-compressor.git
+cd neural-compressor/examples/3.x_api/pytorch/multimodal-modeling/quantization/auto_round/llama4
+bash setup.sh
+```
+
+## 2. Prepare Model
+
+```shell
+hf download meta-llama/Llama-4-Scout-17B-16E-Instruct --local-dir Llama-4-Scout-17B-16E-Instruct
+```
+
+# Run
+
+## 1. Quantization
+
+```bash
+CUDA_VISIBLE_DEVICES=0 bash run_quant.sh --topology=llama4_mxfp4 --input_model=/huggingface/Llama-4-Scout-17B-16E-Instruct/ --iters=0
+```
+
+
+## 2. Benchmark
+
+```bash
+NCCL_NVLS_ENABLE=0 CUDA_VISIBLE_DEVICES=0,1,2,3 bash run_benchmark.sh --topology=llama4_mxfp4 --input_model=saved_results/Llama-4-Scout-17B-16E-Instruct-w4g32/ --tasks=piqa --batch_size=1 --tp_size=4
+```
diff --git a/examples/3.x_api/pytorch/multimodal-modeling/quantization/auto_round/llama4/requirements.txt b/examples/3.x_api/pytorch/multimodal-modeling/quantization/auto_round/llama4/requirements.txt
index 22c98c95389..99ed1242520 100644
--- a/examples/3.x_api/pytorch/multimodal-modeling/quantization/auto_round/llama4/requirements.txt
+++ b/examples/3.x_api/pytorch/multimodal-modeling/quantization/auto_round/llama4/requirements.txt
@@ -1,3 +1,6 @@
-auto-tound
+auto-tound==0.8.0
 compressed-tensors
-lm-eval
\ No newline at end of file
+lm-eval
+setuptools_scm
+torchao==0.12.0
+triton==3.3.1
\ No newline at end of file
diff --git a/examples/3.x_api/pytorch/multimodal-modeling/quantization/auto_round/llama4/run_benchmark.sh b/examples/3.x_api/pytorch/multimodal-modeling/quantization/auto_round/llama4/run_benchmark.sh
index 085cda09b2b..b4ca1890a73 100644
--- a/examples/3.x_api/pytorch/multimodal-modeling/quantization/auto_round/llama4/run_benchmark.sh
+++ b/examples/3.x_api/pytorch/multimodal-modeling/quantization/auto_round/llama4/run_benchmark.sh
@@ -36,13 +36,16 @@ function run_benchmark {
 
     extra_model_args=""
+    extra_cmd=""
 
     if [ "${topology}" = "llama4_mxfp4" ]; then
-        extra_model_args="max_model_len=8192,max_num_seqs=1024,max_gen_toks=2048,kv_cache_dtype=auto"
+        extra_model_args="max_model_len=8192,max_num_seqs=1024,max_gen_toks=2048,kv_cache_dtype=auto,gpu_memory_utilization=0.7"
+        extra_cmd="--gen_kwargs max_gen_toks=2048"
     fi
 
-    if [ "${tasks}" == *"chartqa"* || "${tasks}" == *"mmmu_val"* ]; then
+    if [[ "${tasks}" == *"chartqa"* || "${tasks}" == *"mmmu_val"* ]]; then
         model="vllm-vlm"
+        extra_cmd=${extra_cmd}" --apply_chat_template"
     else
         model="vllm"
     fi
 
@@ -51,7 +54,8 @@ function run_benchmark {
     VLLM_USE_STANDALONE_COMPILE=0 VLLM_WORKER_MULTIPROC_METHOD=spawn \
     lm_eval --model ${model} \
         --model_args pretrained=${input_model},tensor_parallel_size=${tp_size},${extra_model_args},enable_expert_parallel=True \
         --tasks ${tasks} \
-        --batch_size ${batch_size}
+        --batch_size ${batch_size} \
+        ${extra_cmd}
 }
 
 main "$@"
diff --git a/examples/3.x_api/pytorch/multimodal-modeling/quantization/auto_round/llama4/run_quant.sh b/examples/3.x_api/pytorch/multimodal-modeling/quantization/auto_round/llama4/run_quant.sh
index 5b484230db6..68ab161d8a5 100644
--- a/examples/3.x_api/pytorch/multimodal-modeling/quantization/auto_round/llama4/run_quant.sh
+++ b/examples/3.x_api/pytorch/multimodal-modeling/quantization/auto_round/llama4/run_quant.sh
@@ -41,6 +41,7 @@ function init_params {
 function run_tuning {
     extra_cmd=""
     tuned_checkpoint=${tuned_checkpoint:="saved_results"}
+    iters=${iters:=0}
 
     if [ "${topology}" = "llama4_mxfp4" ]; then
         extra_cmd="--fp_layers lm-head,self_attn,router,vision_model,multi_modal_projector,shared_expert --scheme MXFP4"
@@ -50,7 +51,7 @@ function run_tuning {
     python3 -m auto_round \
         --model ${input_model} \
         --iters ${iters} \
         --format "llm_compressor" \
-        --output_dir ${tuned_checkpoint}
+        --output_dir ${tuned_checkpoint} \
         ${extra_cmd}
 }
diff --git a/examples/3.x_api/pytorch/multimodal-modeling/quantization/auto_round/llama4/setup.sh b/examples/3.x_api/pytorch/multimodal-modeling/quantization/auto_round/llama4/setup.sh
new file mode 100644
index 00000000000..f703df8e9d8
--- /dev/null
+++ b/examples/3.x_api/pytorch/multimodal-modeling/quantization/auto_round/llama4/setup.sh
@@ -0,0 +1,8 @@
+pip install -r requirements.txt
+pip install setuptools --upgrade
+pip install packaging -- upgrade
+pip install -U "huggingface_hub[cli]"
+git clone -b mxfp4 https://github.com/mengniwang95/vllm-fork.git
+cd vllm-fork
+VLLM_USE_PRECOMPILED=1 pip install . -vvv --no-build-isolation
+cd ..
\ No newline at end of file

From 7ec80a3be996ed08e9a3b1eee96ce920240e40d2 Mon Sep 17 00:00:00 2001
From: "Wang, Mengni"
Date: Mon, 29 Sep 2025 15:07:42 +0800
Subject: [PATCH 03/10] Update README.md

---
 .../quantization/auto_round/llama4/README.md  | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/examples/3.x_api/pytorch/multimodal-modeling/quantization/auto_round/llama4/README.md b/examples/3.x_api/pytorch/multimodal-modeling/quantization/auto_round/llama4/README.md
index b7a1f5272b9..455fb8a95b6 100644
--- a/examples/3.x_api/pytorch/multimodal-modeling/quantization/auto_round/llama4/README.md
+++ b/examples/3.x_api/pytorch/multimodal-modeling/quantization/auto_round/llama4/README.md
@@ -25,7 +25,7 @@ hf download meta-llama/Llama-4-Scout-17B-16E-Instruct --local-dir Llama-4-Scout-
 ## 1. Quantization
 
 ```bash
-CUDA_VISIBLE_DEVICES=0 bash run_quant.sh --topology=llama4_mxfp4 --input_model=/huggingface/Llama-4-Scout-17B-16E-Instruct/ --iters=0
+CUDA_VISIBLE_DEVICES=0 bash run_quant.sh --topology=llama4_mxfp4 --input_model=Llama-4-Scout-17B-16E-Instruct/ --iters=0
 ```
 
 

From 06dff96602509038ecead65af0ae4be8b53f8052 Mon Sep 17 00:00:00 2001
From: "Wang, Mengni"
Date: Mon, 29 Sep 2025 15:09:24 +0800
Subject: [PATCH 04/10] Update requirements.txt

---
 .../quantization/auto_round/llama4/requirements.txt | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/examples/3.x_api/pytorch/multimodal-modeling/quantization/auto_round/llama4/requirements.txt b/examples/3.x_api/pytorch/multimodal-modeling/quantization/auto_round/llama4/requirements.txt
index 99ed1242520..e250fd9177b 100644
--- a/examples/3.x_api/pytorch/multimodal-modeling/quantization/auto_round/llama4/requirements.txt
+++ b/examples/3.x_api/pytorch/multimodal-modeling/quantization/auto_round/llama4/requirements.txt
@@ -1,6 +1,5 @@
 auto-tound==0.8.0
-compressed-tensors
 lm-eval
 setuptools_scm
 torchao==0.12.0
-triton==3.3.1
\ No newline at end of file
+triton==3.3.1

From 89d6a93df6a3c1d8e78a9d5540394a69dab25704 Mon Sep 17 00:00:00 2001
From: "Wang, Mengni"
Date: Mon, 29 Sep 2025 15:10:27 +0800
Subject: [PATCH 05/10] Update run_benchmark.sh

---
 .../quantization/auto_round/llama4/run_benchmark.sh | 1 +
 1 file changed, 1 insertion(+)

diff --git a/examples/3.x_api/pytorch/multimodal-modeling/quantization/auto_round/llama4/run_benchmark.sh b/examples/3.x_api/pytorch/multimodal-modeling/quantization/auto_round/llama4/run_benchmark.sh
index b4ca1890a73..7ce4a0d972d 100644
--- a/examples/3.x_api/pytorch/multimodal-modeling/quantization/auto_round/llama4/run_benchmark.sh
+++ b/examples/3.x_api/pytorch/multimodal-modeling/quantization/auto_round/llama4/run_benchmark.sh
@@ -37,6 +37,7 @@ function run_benchmark {
 
     extra_model_args=""
     extra_cmd=""
+    batch_size=${batch_size:=1}
 
     if [ "${topology}" = "llama4_mxfp4" ]; then
         extra_model_args="max_model_len=8192,max_num_seqs=1024,max_gen_toks=2048,kv_cache_dtype=auto,gpu_memory_utilization=0.7"

From 65863322a975128e6a34cad04b2d0490e6b57e8f Mon Sep 17 00:00:00 2001
From: "Wang, Mengni"
Date: Mon, 29 Sep 2025 15:17:44 +0800
Subject: [PATCH 06/10] Update setup.sh

---
 .../quantization/auto_round/llama4/setup.sh | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/examples/3.x_api/pytorch/multimodal-modeling/quantization/auto_round/llama4/setup.sh b/examples/3.x_api/pytorch/multimodal-modeling/quantization/auto_round/llama4/setup.sh
index f703df8e9d8..629d056eba3 100644
--- a/examples/3.x_api/pytorch/multimodal-modeling/quantization/auto_round/llama4/setup.sh
+++ b/examples/3.x_api/pytorch/multimodal-modeling/quantization/auto_round/llama4/setup.sh
@@ -1,8 +1,8 @@
 pip install -r requirements.txt
 pip install setuptools --upgrade
-pip install packaging -- upgrade
+pip install packaging --upgrade
 pip install -U "huggingface_hub[cli]"
 git clone -b mxfp4 https://github.com/mengniwang95/vllm-fork.git
 cd vllm-fork
 VLLM_USE_PRECOMPILED=1 pip install . -vvv --no-build-isolation
-cd ..
\ No newline at end of file
+cd ..

From cd5e8791a087a19b11f342e1f6f2fa702d517355 Mon Sep 17 00:00:00 2001
From: "Wang, Mengni"
Date: Mon, 29 Sep 2025 15:21:39 +0800
Subject: [PATCH 07/10] Update README.md

---
 .../quantization/auto_round/llama4/README.md  | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/examples/3.x_api/pytorch/multimodal-modeling/quantization/auto_round/llama4/README.md b/examples/3.x_api/pytorch/multimodal-modeling/quantization/auto_round/llama4/README.md
index 455fb8a95b6..1db2c876fc4 100644
--- a/examples/3.x_api/pytorch/multimodal-modeling/quantization/auto_round/llama4/README.md
+++ b/examples/3.x_api/pytorch/multimodal-modeling/quantization/auto_round/llama4/README.md
@@ -25,7 +25,7 @@ hf download meta-llama/Llama-4-Scout-17B-16E-Instruct --local-dir Llama-4-Scout-
 ## 1. Quantization
 
 ```bash
-CUDA_VISIBLE_DEVICES=0 bash run_quant.sh --topology=llama4_mxfp4 --input_model=Llama-4-Scout-17B-16E-Instruct/ --iters=0
+CUDA_VISIBLE_DEVICES=0 bash run_quant.sh --topology=llama4_mxfp4 --input_model=Llama-4-Scout-17B-16E-Instruct/
 ```
 
 

From ef309a4adc816840bb1b325e4d8ce30ceafcc7b3 Mon Sep 17 00:00:00 2001
From: "Wang, Mengni"
Date: Mon, 29 Sep 2025 16:45:45 +0800
Subject: [PATCH 08/10] Update requirements.txt

---
 .../quantization/auto_round/llama4/requirements.txt | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/examples/3.x_api/pytorch/multimodal-modeling/quantization/auto_round/llama4/requirements.txt b/examples/3.x_api/pytorch/multimodal-modeling/quantization/auto_round/llama4/requirements.txt
index e250fd9177b..1890807a54f 100644
--- a/examples/3.x_api/pytorch/multimodal-modeling/quantization/auto_round/llama4/requirements.txt
+++ b/examples/3.x_api/pytorch/multimodal-modeling/quantization/auto_round/llama4/requirements.txt
@@ -1,5 +1,5 @@
-auto-tound==0.8.0
-lm-eval
+auto-round @ git+https://github.com/intel/auto-round@v0.8.0rc
+lm-eval==0.4.9.1
 setuptools_scm
 torchao==0.12.0
 triton==3.3.1

From 68bd5e1d6792faaaa079a54d971c530c4e463627 Mon Sep 17 00:00:00 2001
From: "Wang, Mengni"
Date: Mon, 29 Sep 2025 16:55:38 +0800
Subject: [PATCH 09/10] Update run_benchmark.sh

---
 .../quantization/auto_round/llama4/run_benchmark.sh | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/examples/3.x_api/pytorch/multimodal-modeling/quantization/auto_round/llama4/run_benchmark.sh b/examples/3.x_api/pytorch/multimodal-modeling/quantization/auto_round/llama4/run_benchmark.sh
index 7ce4a0d972d..0019f164bd7 100644
--- a/examples/3.x_api/pytorch/multimodal-modeling/quantization/auto_round/llama4/run_benchmark.sh
+++ b/examples/3.x_api/pytorch/multimodal-modeling/quantization/auto_round/llama4/run_benchmark.sh
@@ -51,7 +51,7 @@ function run_benchmark {
         model="vllm"
     fi
 
-    VLLM_USE_STANDALONE_COMPILE=0 VLLM_WORKER_MULTIPROC_METHOD=spawn \
+    NCCL_NVLS_ENABLE=0 VLLM_USE_STANDALONE_COMPILE=0 VLLM_WORKER_MULTIPROC_METHOD=spawn \
     lm_eval --model ${model} \
         --model_args pretrained=${input_model},tensor_parallel_size=${tp_size},${extra_model_args},enable_expert_parallel=True \
         --tasks ${tasks} \

From 96b0837cd8195406d6844544cf8012eeceedf257 Mon Sep 17 00:00:00 2001
From: "Wang, Mengni"
Date: Mon, 29 Sep 2025 16:55:59 +0800
Subject: [PATCH 10/10] Update README.md

---
 .../quantization/auto_round/llama4/README.md  | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/examples/3.x_api/pytorch/multimodal-modeling/quantization/auto_round/llama4/README.md b/examples/3.x_api/pytorch/multimodal-modeling/quantization/auto_round/llama4/README.md
index 1db2c876fc4..a01933f0500 100644
--- a/examples/3.x_api/pytorch/multimodal-modeling/quantization/auto_round/llama4/README.md
+++ b/examples/3.x_api/pytorch/multimodal-modeling/quantization/auto_round/llama4/README.md
@@ -32,5 +32,5 @@ CUDA_VISIBLE_DEVICES=0 bash run_quant.sh --topology=llama4_mxfp4 --input_model=L
 ## 2. Benchmark
 
 ```bash
-NCCL_NVLS_ENABLE=0 CUDA_VISIBLE_DEVICES=0,1,2,3 bash run_benchmark.sh --topology=llama4_mxfp4 --input_model=saved_results/Llama-4-Scout-17B-16E-Instruct-w4g32/ --tasks=piqa --batch_size=1 --tp_size=4
+CUDA_VISIBLE_DEVICES=0,1,2,3 bash run_benchmark.sh --topology=llama4_mxfp4 --input_model=saved_results/Llama-4-Scout-17B-16E-Instruct-w4g32/ --tasks=piqa --batch_size=1 --tp_size=4
 ```
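
For reference, below is a minimal end-to-end sketch of the example in its final state (after all ten patches). It is a hypothetical driver, not part of the series; it assumes the defaults baked into the scripts above: `setup.sh` has already been run inside the container, the model was downloaded into the working directory, `run_quant.sh` defaults `--iters` to 0 and writes to `saved_results`, and the quantized checkpoint lands under the `-w4g32` suffix shown in the README.

```bash
#!/bin/bash
# Hypothetical end-to-end driver; all flags and paths are taken from the
# README and shell scripts in the patches above.
MODEL_DIR=Llama-4-Scout-17B-16E-Instruct

# Quantize to MXFP4 on a single GPU (run_quant.sh defaults iters to 0,
# i.e. no tuning iterations).
CUDA_VISIBLE_DEVICES=0 bash run_quant.sh \
    --topology=llama4_mxfp4 \
    --input_model=${MODEL_DIR}/

# Evaluate the quantized checkpoint with lm-eval via vLLM across 4 GPUs.
CUDA_VISIBLE_DEVICES=0,1,2,3 bash run_benchmark.sh \
    --topology=llama4_mxfp4 \
    --input_model=saved_results/${MODEL_DIR}-w4g32/ \
    --tasks=piqa \
    --batch_size=1 \
    --tp_size=4
```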