Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
@@ -0,0 +1,36 @@
# Step-by-Step

This example quantizes and validates the accuracy of Llama4.

# Prerequisites

## 1. Environment

```shell
docker run -d --gpus all -v ... --shm-size=100g --name llama4 -it nvcr.io/nvidia/pytorch:25.05-py3 /bin/bash
docker exec -it llama4 bash
git clone https://github.com/intel/neural-compressor.git
cd neural-compressor/examples/3.x_api/pytorch/multimodal-modeling/quantization/auto_round/llama4
bash setup.sh
```

## 2. Prepare Model

```shell
hf download meta-llama/Llama-4-Scout-17B-16E-Instruct --local-dir Llama-4-Scout-17B-16E-Instruct
```

# Run

## 1. Quantization

```bash
CUDA_VISIBLE_DEVICES=0 bash run_quant.sh --topology=llama4_mxfp4 --input_model=Llama-4-Scout-17B-16E-Instruct/
```


## 2. Benchmark

```bash
CUDA_VISIBLE_DEVICES=0,1,2,3 bash run_benchmark.sh --topology=llama4_mxfp4 --input_model=saved_results/Llama-4-Scout-17B-16E-Instruct-w4g32/ --tasks=piqa --batch_size=1 --tp_size=4
```
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
auto-round @ git+https://github.com/intel/auto-round@v0.8.0rc
lm-eval==0.4.9.1
setuptools_scm
torchao==0.12.0
triton==3.3.1
Original file line number Diff line number Diff line change
@@ -0,0 +1,62 @@
#!/bin/bash
set -x

# Entry point: parse the command-line flags into globals, then run the
# lm_eval benchmark with them.
main() {
  init_params "$@"
  run_benchmark
}

# init params
# init params
# Parses --key=value command-line flags into the global variables read by
# run_benchmark: input_model, topology, tasks, tp_size, batch_size.
# Unrecognized flags are silently ignored.
function init_params {
  for var in "$@"
  do
    case $var in
      --input_model=*)
          # ${var#*=} strips everything up to the first '=', so values that
          # themselves contain '=' survive intact (the old `cut -f2 -d=`
          # truncated them) and no subshell is forked per flag.
          input_model=${var#*=}
      ;;
      --topology=*)
          topology=${var#*=}
      ;;
      --tasks=*)
          tasks=${var#*=}
      ;;
      --tp_size=*)
          tp_size=${var#*=}
      ;;
      --batch_size=*)
          batch_size=${var#*=}
      ;;
    esac
  done

}

# run_benchmark
# run_benchmark
# Evaluates the model with lm_eval on a vLLM backend.
# Globals read (set by init_params): input_model, topology, tasks, tp_size,
# batch_size (defaults to 1 when unset/empty).
function run_benchmark {

    extra_model_args=""
    extra_cmd=""
    batch_size=${batch_size:=1}

    if [[ "${topology}" == "llama4_mxfp4" ]]; then
        extra_model_args="max_model_len=8192,max_num_seqs=1024,max_gen_toks=2048,kv_cache_dtype=auto,gpu_memory_utilization=0.7"
        extra_cmd="--gen_kwargs max_gen_toks=2048"
    fi

    # Multimodal tasks need the vision-language backend and a chat template.
    if [[ "${tasks}" == *"chartqa"* || "${tasks}" == *"mmmu_val"* ]]; then
        model="vllm-vlm"
        extra_cmd="${extra_cmd} --apply_chat_template"
    else
        model="vllm"
    fi

    # Assemble --model_args piecewise so an empty extra_model_args does not
    # leave a stray ',,' in the string (the previous inline interpolation
    # produced '...,tensor_parallel_size=N,,enable_expert_parallel=True'
    # whenever topology was not llama4_mxfp4).
    model_args="pretrained=${input_model},tensor_parallel_size=${tp_size}"
    if [[ -n "${extra_model_args}" ]]; then
        model_args="${model_args},${extra_model_args}"
    fi
    model_args="${model_args},enable_expert_parallel=True"

    # shellcheck disable=SC2086 -- extra_cmd intentionally word-splits into flags
    NCCL_NVLS_ENABLE=0 VLLM_USE_STANDALONE_COMPILE=0 VLLM_WORKER_MULTIPROC_METHOD=spawn \
    lm_eval --model ${model} \
            --model_args "${model_args}" \
            --tasks ${tasks} \
            --batch_size ${batch_size} \
            ${extra_cmd}
}

main "$@"
Original file line number Diff line number Diff line change
@@ -0,0 +1,58 @@
#!/bin/bash
set -x

# Entry point: parse the command-line flags into globals, then launch the
# auto-round quantization run.
main() {
  init_params "$@"
  run_tuning
}

# init params
# init params
# Parses --key=value command-line flags into the global variables read by
# run_tuning: topology, iters, dataset_location, input_model, and
# tuned_checkpoint (from --output_model). Exits 1 on any unknown flag.
function init_params {
  for var in "$@"
  do
    case $var in
      --topology=*)
          # ${var#*=} strips everything up to the first '=', so values that
          # themselves contain '=' survive intact (the old `cut -f2 -d=`
          # truncated them) and no subshell is forked per flag.
          topology=${var#*=}
      ;;
      --iters=*)
          iters=${var#*=}
      ;;
      --dataset_location=*)
          dataset_location=${var#*=}
      ;;
      --input_model=*)
          input_model=${var#*=}
      ;;
      --output_model=*)
          tuned_checkpoint=${var#*=}
      ;;
      *)
          echo "Error: No such parameter: ${var}"
          exit 1
      ;;
    esac
  done

}

# run_tuning
# run_tuning
# Quantizes the model with auto-round and saves it in llm_compressor format.
# Globals read (set by init_params): topology, input_model, iters,
# tuned_checkpoint (defaults to "saved_results"; iters defaults to 0).
function run_tuning {
    extra_cmd=""
    tuned_checkpoint=${tuned_checkpoint:="saved_results"}
    # iters=0 skips the iterative tuning loop (round-to-nearest style run).
    iters=${iters:=0}

    if [[ "${topology}" == "llama4_mxfp4" ]]; then
        # Keep precision-sensitive layers (head, attention, router, vision
        # tower, projector, shared experts) unquantized; everything else MXFP4.
        extra_cmd="--fp_layers lm-head,self_attn,router,vision_model,multi_modal_projector,shared_expert --scheme MXFP4"
    fi

    # Quote path-valued arguments so models/checkpoints in directories with
    # spaces work; extra_cmd stays unquoted on purpose so it splits into flags.
    # shellcheck disable=SC2086
    python3 -m auto_round \
        --model "${input_model}" \
        --iters "${iters}" \
        --format "llm_compressor" \
        --output_dir "${tuned_checkpoint}" \
        ${extra_cmd}
}

main "$@"
Original file line number Diff line number Diff line change
@@ -0,0 +1,8 @@
#!/bin/bash
# Environment setup for the Llama4 quantization example: install Python
# dependencies, then build a vLLM fork that carries MXFP4 support.
# Abort on the first failing step so a failed clone/cd cannot let the later
# `pip install .` run against the wrong directory.
set -e

pip install -r requirements.txt
pip install setuptools --upgrade
pip install packaging --upgrade
pip install -U "huggingface_hub[cli]"

# MXFP4-enabled vLLM fork; VLLM_USE_PRECOMPILED reuses prebuilt kernels
# instead of compiling CUDA sources from scratch.
git clone -b mxfp4 https://github.com/mengniwang95/vllm-fork.git
cd vllm-fork || exit 1
VLLM_USE_PRECOMPILED=1 pip install . -vvv --no-build-isolation
cd ..