From 54ae4a05356088b777cfa5e059735c58dd9a44ae Mon Sep 17 00:00:00 2001
From: Mengni Wang
Date: Sun, 28 Sep 2025 23:14:12 -0400
Subject: [PATCH 01/10] add initial files

Signed-off-by: Mengni Wang
---
 .../auto_round/llama4/requirements.txt        |  3 +
 .../auto_round/llama4/run_benchmark.sh        | 57 +++++++++++++++++++
 .../auto_round/llama4/run_quant.sh            | 57 +++++++++++++++++++
 3 files changed, 117 insertions(+)
 create mode 100644 examples/3.x_api/pytorch/multimodal-modeling/quantization/auto_round/llama4/requirements.txt
 create mode 100644 examples/3.x_api/pytorch/multimodal-modeling/quantization/auto_round/llama4/run_benchmark.sh
 create mode 100644 examples/3.x_api/pytorch/multimodal-modeling/quantization/auto_round/llama4/run_quant.sh

diff --git a/examples/3.x_api/pytorch/multimodal-modeling/quantization/auto_round/llama4/requirements.txt b/examples/3.x_api/pytorch/multimodal-modeling/quantization/auto_round/llama4/requirements.txt
new file mode 100644
index 00000000000..22c98c95389
--- /dev/null
+++ b/examples/3.x_api/pytorch/multimodal-modeling/quantization/auto_round/llama4/requirements.txt
@@ -0,0 +1,3 @@
+auto-tound
+compressed-tensors
+lm-eval
\ No newline at end of file
diff --git a/examples/3.x_api/pytorch/multimodal-modeling/quantization/auto_round/llama4/run_benchmark.sh b/examples/3.x_api/pytorch/multimodal-modeling/quantization/auto_round/llama4/run_benchmark.sh
new file mode 100644
index 00000000000..085cda09b2b
--- /dev/null
+++ b/examples/3.x_api/pytorch/multimodal-modeling/quantization/auto_round/llama4/run_benchmark.sh
@@ -0,0 +1,57 @@
+#!/bin/bash
+set -x
+
+function main {
+  init_params "$@"
+  run_benchmark
+
+}
+
+# init params
+function init_params {
+  for var in "$@"
+  do
+    case $var in
+      --input_model=*)
+          input_model=$(echo $var |cut -f2 -d=)
+      ;;
+      --topology=*)
+          topology=$(echo $var |cut -f2 -d=)
+      ;;
+      --tasks=*)
+          tasks=$(echo $var |cut -f2 -d=)
+      ;;
+      --tp_size=*)
+          tp_size=$(echo $var |cut -f2 -d=)
+      ;;
+      --batch_size=*)
+          batch_size=$(echo $var |cut -f2 -d=)
+      ;;
+    esac
+  done
+
+}
+
+# run_benchmark
+function run_benchmark {
+
+    extra_model_args=""
+
+    if [ "${topology}" = "llama4_mxfp4" ]; then
+        extra_model_args="max_model_len=8192,max_num_seqs=1024,max_gen_toks=2048,kv_cache_dtype=auto"
+    fi
+
+    if [ "${tasks}" == *"chartqa"* || "${tasks}" == *"mmmu_val"* ]; then
+        model="vllm-vlm"
+    else
+        model="vllm"
+    fi
+
+    VLLM_USE_STANDALONE_COMPILE=0 VLLM_WORKER_MULTIPROC_METHOD=spawn \
+    lm_eval --model ${model} \
+        --model_args pretrained=${input_model},tensor_parallel_size=${tp_size},${extra_model_args},enable_expert_parallel=True \
+        --tasks ${tasks} \
+        --batch_size ${batch_size}
+}
+
+main "$@"
diff --git a/examples/3.x_api/pytorch/multimodal-modeling/quantization/auto_round/llama4/run_quant.sh b/examples/3.x_api/pytorch/multimodal-modeling/quantization/auto_round/llama4/run_quant.sh
new file mode 100644
index 00000000000..5b484230db6
--- /dev/null
+++ b/examples/3.x_api/pytorch/multimodal-modeling/quantization/auto_round/llama4/run_quant.sh
@@ -0,0 +1,57 @@
+#!/bin/bash
+set -x
+
+function main {
+
+  init_params "$@"
+  run_tuning
+
+}
+
+# init params
+function init_params {
+  for var in "$@"
+  do
+    case $var in
+      --topology=*)
+          topology=$(echo $var |cut -f2 -d=)
+      ;;
+      --iters=*)
+          iters=$(echo $var |cut -f2 -d=)
+      ;;
+      --dataset_location=*)
+          dataset_location=$(echo $var |cut -f2 -d=)
+      ;;
+      --input_model=*)
+          input_model=$(echo $var |cut -f2 -d=)
+      ;;
+      --output_model=*)
+          tuned_checkpoint=$(echo $var |cut -f2 -d=)
+      ;;
+      *)
+          echo "Error: No such parameter: ${var}"
+          exit 1
+      ;;
+    esac
+  done
+
+}
+
+# run_tuning
+function run_tuning {
+    extra_cmd=""
+    tuned_checkpoint=${tuned_checkpoint:="saved_results"}
+
+    if [ "${topology}" = "llama4_mxfp4" ]; then
+        extra_cmd="--fp_layers lm-head,self_attn,router,vision_model,multi_modal_projector,shared_expert --scheme MXFP4"
+    fi
+
+    python3 -m auto_round \
+        --model ${input_model} \
+        --iters ${iters} \
+        --format "llm_compressor" \
+        --output_dir ${tuned_checkpoint}
+        ${extra_cmd}
+}
+
+main "$@"

From dae6bc8bc23dce4e93c023dfc71daa70f57acf1d Mon Sep 17 00:00:00 2001
From: Mengni Wang
Date: Mon, 29 Sep 2025 03:06:40 -0400
Subject: [PATCH 02/10] update example

Signed-off-by: Mengni Wang
---
 .../quantization/auto_round/llama4/README.md  | 36 +++++++++++++++++++
 .../auto_round/llama4/requirements.txt        |  7 ++--
 .../auto_round/llama4/run_benchmark.sh        | 10 ++++--
 .../auto_round/llama4/run_quant.sh            |  3 +-
 .../quantization/auto_round/llama4/setup.sh   |  8 +++++
 5 files changed, 58 insertions(+), 6 deletions(-)
 create mode 100644 examples/3.x_api/pytorch/multimodal-modeling/quantization/auto_round/llama4/README.md
 create mode 100644 examples/3.x_api/pytorch/multimodal-modeling/quantization/auto_round/llama4/setup.sh

diff --git a/examples/3.x_api/pytorch/multimodal-modeling/quantization/auto_round/llama4/README.md b/examples/3.x_api/pytorch/multimodal-modeling/quantization/auto_round/llama4/README.md
new file mode 100644
index 00000000000..b7a1f5272b9
--- /dev/null
+++ b/examples/3.x_api/pytorch/multimodal-modeling/quantization/auto_round/llama4/README.md
@@ -0,0 +1,36 @@
+# Step-by-Step
+
+This example quantizes Llama 4 with AutoRound and validates the accuracy of the quantized model.
+
+# Prerequisite
+
+## 1. Environment
+
+```shell
+docker run -d --gpus all -v ... --shm-size=100g --name llama4 -it nvcr.io/nvidia/pytorch:25.05-py3 /bin/bash
+docker exec -it llama4 bash
+git clone https://github.com/intel/neural-compressor.git
+cd neural-compressor/examples/3.x_api/pytorch/multimodal-modeling/quantization/auto_round/llama4
+bash setup.sh
+```
+
+## 2. Prepare Model
+
+```shell
+hf download meta-llama/Llama-4-Scout-17B-16E-Instruct --local-dir Llama-4-Scout-17B-16E-Instruct
+```
+
+# Run
+
+## 1. Quantization
+
+```bash
+CUDA_VISIBLE_DEVICES=0 bash run_quant.sh --topology=llama4_mxfp4 --input_model=/huggingface/Llama-4-Scout-17B-16E-Instruct/ --iters=0
+```
+
+
+## 2. Benchmark
+
+```bash
+NCCL_NVLS_ENABLE=0 CUDA_VISIBLE_DEVICES=0,1,2,3 bash run_benchmark.sh --topology=llama4_mxfp4 --input_model=saved_results/Llama-4-Scout-17B-16E-Instruct-w4g32/ --tasks=piqa --batch_size=1 --tp_size=4
+```
diff --git a/examples/3.x_api/pytorch/multimodal-modeling/quantization/auto_round/llama4/requirements.txt b/examples/3.x_api/pytorch/multimodal-modeling/quantization/auto_round/llama4/requirements.txt
index 22c98c95389..99ed1242520 100644
--- a/examples/3.x_api/pytorch/multimodal-modeling/quantization/auto_round/llama4/requirements.txt
+++ b/examples/3.x_api/pytorch/multimodal-modeling/quantization/auto_round/llama4/requirements.txt
@@ -1,3 +1,6 @@
-auto-tound
+auto-tound==0.8.0
 compressed-tensors
-lm-eval
\ No newline at end of file
+lm-eval
+setuptools_scm
+torchao==0.12.0
+triton==3.3.1
\ No newline at end of file
diff --git a/examples/3.x_api/pytorch/multimodal-modeling/quantization/auto_round/llama4/run_benchmark.sh b/examples/3.x_api/pytorch/multimodal-modeling/quantization/auto_round/llama4/run_benchmark.sh
index 085cda09b2b..b4ca1890a73 100644
--- a/examples/3.x_api/pytorch/multimodal-modeling/quantization/auto_round/llama4/run_benchmark.sh
+++ b/examples/3.x_api/pytorch/multimodal-modeling/quantization/auto_round/llama4/run_benchmark.sh
@@ -36,13 +36,16 @@ function run_benchmark {
 
     extra_model_args=""
+    extra_cmd=""
 
     if [ "${topology}" = "llama4_mxfp4" ]; then
-        extra_model_args="max_model_len=8192,max_num_seqs=1024,max_gen_toks=2048,kv_cache_dtype=auto"
+        extra_model_args="max_model_len=8192,max_num_seqs=1024,max_gen_toks=2048,kv_cache_dtype=auto,gpu_memory_utilization=0.7"
+        extra_cmd="--gen_kwargs max_gen_toks=2048"
     fi
 
-    if [ "${tasks}" == *"chartqa"* || "${tasks}" == *"mmmu_val"* ]; then
+    if [[ "${tasks}" == *"chartqa"* || "${tasks}" == *"mmmu_val"* ]]; then
         model="vllm-vlm"
+        extra_cmd=${extra_cmd}" --apply_chat_template"
     else
         model="vllm"
     fi
 
@@ -51,7 +54,8 @@ function run_benchmark {
     VLLM_USE_STANDALONE_COMPILE=0 VLLM_WORKER_MULTIPROC_METHOD=spawn \
     lm_eval --model ${model} \
         --model_args pretrained=${input_model},tensor_parallel_size=${tp_size},${extra_model_args},enable_expert_parallel=True \
         --tasks ${tasks} \
-        --batch_size ${batch_size}
+        --batch_size ${batch_size} \
+        ${extra_cmd}
 }
 
 main "$@"
diff --git a/examples/3.x_api/pytorch/multimodal-modeling/quantization/auto_round/llama4/run_quant.sh b/examples/3.x_api/pytorch/multimodal-modeling/quantization/auto_round/llama4/run_quant.sh
index 5b484230db6..68ab161d8a5 100644
--- a/examples/3.x_api/pytorch/multimodal-modeling/quantization/auto_round/llama4/run_quant.sh
+++ b/examples/3.x_api/pytorch/multimodal-modeling/quantization/auto_round/llama4/run_quant.sh
@@ -41,6 +41,7 @@ function init_params {
 function run_tuning {
     extra_cmd=""
     tuned_checkpoint=${tuned_checkpoint:="saved_results"}
+    iters=${iters:=0}
 
     if [ "${topology}" = "llama4_mxfp4" ]; then
         extra_cmd="--fp_layers lm-head,self_attn,router,vision_model,multi_modal_projector,shared_expert --scheme MXFP4"
@@ -50,7 +51,7 @@ function run_tuning {
     python3 -m auto_round \
         --model ${input_model} \
         --iters ${iters} \
         --format "llm_compressor" \
-        --output_dir ${tuned_checkpoint}
+        --output_dir ${tuned_checkpoint} \
         ${extra_cmd}
 }
diff --git a/examples/3.x_api/pytorch/multimodal-modeling/quantization/auto_round/llama4/setup.sh b/examples/3.x_api/pytorch/multimodal-modeling/quantization/auto_round/llama4/setup.sh
new file mode 100644
index 00000000000..f703df8e9d8
--- /dev/null
+++ b/examples/3.x_api/pytorch/multimodal-modeling/quantization/auto_round/llama4/setup.sh
@@ -0,0 +1,8 @@
+pip install -r requirements.txt
+pip install setuptools --upgrade
+pip install packaging -- upgrade
+pip install -U "huggingface_hub[cli]"
+git clone -b mxfp4 https://github.com/mengniwang95/vllm-fork.git
+cd vllm-fork
+VLLM_USE_PRECOMPILED=1 pip install . -vvv --no-build-isolation
+cd ..
\ No newline at end of file

From 7ec80a3be996ed08e9a3b1eee96ce920240e40d2 Mon Sep 17 00:00:00 2001
From: "Wang, Mengni"
Date: Mon, 29 Sep 2025 15:07:42 +0800
Subject: [PATCH 03/10] Update README.md

---
 .../quantization/auto_round/llama4/README.md  | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/examples/3.x_api/pytorch/multimodal-modeling/quantization/auto_round/llama4/README.md b/examples/3.x_api/pytorch/multimodal-modeling/quantization/auto_round/llama4/README.md
index b7a1f5272b9..455fb8a95b6 100644
--- a/examples/3.x_api/pytorch/multimodal-modeling/quantization/auto_round/llama4/README.md
+++ b/examples/3.x_api/pytorch/multimodal-modeling/quantization/auto_round/llama4/README.md
@@ -25,7 +25,7 @@ hf download meta-llama/Llama-4-Scout-17B-16E-Instruct --local-dir Llama-4-Scout-
 ## 1. Quantization
 
 ```bash
-CUDA_VISIBLE_DEVICES=0 bash run_quant.sh --topology=llama4_mxfp4 --input_model=/huggingface/Llama-4-Scout-17B-16E-Instruct/ --iters=0
+CUDA_VISIBLE_DEVICES=0 bash run_quant.sh --topology=llama4_mxfp4 --input_model=Llama-4-Scout-17B-16E-Instruct/ --iters=0
 ```
 
 

From 06dff96602509038ecead65af0ae4be8b53f8052 Mon Sep 17 00:00:00 2001
From: "Wang, Mengni"
Date: Mon, 29 Sep 2025 15:09:24 +0800
Subject: [PATCH 04/10] Update requirements.txt

---
 .../quantization/auto_round/llama4/requirements.txt | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/examples/3.x_api/pytorch/multimodal-modeling/quantization/auto_round/llama4/requirements.txt b/examples/3.x_api/pytorch/multimodal-modeling/quantization/auto_round/llama4/requirements.txt
index 99ed1242520..e250fd9177b 100644
--- a/examples/3.x_api/pytorch/multimodal-modeling/quantization/auto_round/llama4/requirements.txt
+++ b/examples/3.x_api/pytorch/multimodal-modeling/quantization/auto_round/llama4/requirements.txt
@@ -1,6 +1,5 @@
 auto-tound==0.8.0
-compressed-tensors
 lm-eval
 setuptools_scm
 torchao==0.12.0
-triton==3.3.1
\ No newline at end of file
+triton==3.3.1

From 89d6a93df6a3c1d8e78a9d5540394a69dab25704 Mon Sep 17 00:00:00 2001
From: "Wang, Mengni"
Date: Mon, 29 Sep 2025 15:10:27 +0800
Subject: [PATCH 05/10] Update run_benchmark.sh

---
 .../quantization/auto_round/llama4/run_benchmark.sh | 1 +
 1 file changed, 1 insertion(+)

diff --git a/examples/3.x_api/pytorch/multimodal-modeling/quantization/auto_round/llama4/run_benchmark.sh b/examples/3.x_api/pytorch/multimodal-modeling/quantization/auto_round/llama4/run_benchmark.sh
index b4ca1890a73..7ce4a0d972d 100644
--- a/examples/3.x_api/pytorch/multimodal-modeling/quantization/auto_round/llama4/run_benchmark.sh
+++ b/examples/3.x_api/pytorch/multimodal-modeling/quantization/auto_round/llama4/run_benchmark.sh
@@ -37,6 +37,7 @@ function run_benchmark {
 
     extra_model_args=""
     extra_cmd=""
+    batch_size=${batch_size:=1}
 
     if [ "${topology}" = "llama4_mxfp4" ]; then
         extra_model_args="max_model_len=8192,max_num_seqs=1024,max_gen_toks=2048,kv_cache_dtype=auto,gpu_memory_utilization=0.7"

From 65863322a975128e6a34cad04b2d0490e6b57e8f Mon Sep 17 00:00:00 2001
From: "Wang, Mengni"
Date: Mon, 29 Sep 2025 15:17:44 +0800
Subject: [PATCH 06/10] Update setup.sh

---
 .../quantization/auto_round/llama4/setup.sh | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/examples/3.x_api/pytorch/multimodal-modeling/quantization/auto_round/llama4/setup.sh b/examples/3.x_api/pytorch/multimodal-modeling/quantization/auto_round/llama4/setup.sh
index f703df8e9d8..629d056eba3 100644
--- a/examples/3.x_api/pytorch/multimodal-modeling/quantization/auto_round/llama4/setup.sh
+++ b/examples/3.x_api/pytorch/multimodal-modeling/quantization/auto_round/llama4/setup.sh
@@ -1,8 +1,8 @@
 pip install -r requirements.txt
 pip install setuptools --upgrade
-pip install packaging -- upgrade
+pip install packaging --upgrade
 pip install -U "huggingface_hub[cli]"
 git clone -b mxfp4 https://github.com/mengniwang95/vllm-fork.git
 cd vllm-fork
 VLLM_USE_PRECOMPILED=1 pip install . -vvv --no-build-isolation
-cd ..
\ No newline at end of file
+cd ..

From cd5e8791a087a19b11f342e1f6f2fa702d517355 Mon Sep 17 00:00:00 2001
From: "Wang, Mengni"
Date: Mon, 29 Sep 2025 15:21:39 +0800
Subject: [PATCH 07/10] Update README.md

---
 .../quantization/auto_round/llama4/README.md  | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/examples/3.x_api/pytorch/multimodal-modeling/quantization/auto_round/llama4/README.md b/examples/3.x_api/pytorch/multimodal-modeling/quantization/auto_round/llama4/README.md
index 455fb8a95b6..1db2c876fc4 100644
--- a/examples/3.x_api/pytorch/multimodal-modeling/quantization/auto_round/llama4/README.md
+++ b/examples/3.x_api/pytorch/multimodal-modeling/quantization/auto_round/llama4/README.md
@@ -25,7 +25,7 @@ hf download meta-llama/Llama-4-Scout-17B-16E-Instruct --local-dir Llama-4-Scout-
 ## 1. Quantization
 
 ```bash
-CUDA_VISIBLE_DEVICES=0 bash run_quant.sh --topology=llama4_mxfp4 --input_model=Llama-4-Scout-17B-16E-Instruct/ --iters=0
+CUDA_VISIBLE_DEVICES=0 bash run_quant.sh --topology=llama4_mxfp4 --input_model=Llama-4-Scout-17B-16E-Instruct/
 ```
 
 

From ef309a4adc816840bb1b325e4d8ce30ceafcc7b3 Mon Sep 17 00:00:00 2001
From: "Wang, Mengni"
Date: Mon, 29 Sep 2025 16:45:45 +0800
Subject: [PATCH 08/10] Update requirements.txt

---
 .../quantization/auto_round/llama4/requirements.txt | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/examples/3.x_api/pytorch/multimodal-modeling/quantization/auto_round/llama4/requirements.txt b/examples/3.x_api/pytorch/multimodal-modeling/quantization/auto_round/llama4/requirements.txt
index e250fd9177b..1890807a54f 100644
--- a/examples/3.x_api/pytorch/multimodal-modeling/quantization/auto_round/llama4/requirements.txt
+++ b/examples/3.x_api/pytorch/multimodal-modeling/quantization/auto_round/llama4/requirements.txt
@@ -1,5 +1,5 @@
-auto-tound==0.8.0
-lm-eval
+auto-round @ git+https://github.com/intel/auto-round@v0.8.0rc
+lm-eval==0.4.9.1
 setuptools_scm
 torchao==0.12.0
 triton==3.3.1

From 68bd5e1d6792faaaa079a54d971c530c4e463627 Mon Sep 17 00:00:00 2001
From: "Wang, Mengni"
Date: Mon, 29 Sep 2025 16:55:38 +0800
Subject: [PATCH 09/10] Update run_benchmark.sh

---
 .../quantization/auto_round/llama4/run_benchmark.sh | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/examples/3.x_api/pytorch/multimodal-modeling/quantization/auto_round/llama4/run_benchmark.sh b/examples/3.x_api/pytorch/multimodal-modeling/quantization/auto_round/llama4/run_benchmark.sh
index 7ce4a0d972d..0019f164bd7 100644
--- a/examples/3.x_api/pytorch/multimodal-modeling/quantization/auto_round/llama4/run_benchmark.sh
+++ b/examples/3.x_api/pytorch/multimodal-modeling/quantization/auto_round/llama4/run_benchmark.sh
@@ -51,7 +51,7 @@ function run_benchmark {
         model="vllm"
     fi
 
-    VLLM_USE_STANDALONE_COMPILE=0 VLLM_WORKER_MULTIPROC_METHOD=spawn \
+    NCCL_NVLS_ENABLE=0 VLLM_USE_STANDALONE_COMPILE=0 VLLM_WORKER_MULTIPROC_METHOD=spawn \
     lm_eval --model ${model} \
         --model_args pretrained=${input_model},tensor_parallel_size=${tp_size},${extra_model_args},enable_expert_parallel=True \
         --tasks ${tasks} \

From 96b0837cd8195406d6844544cf8012eeceedf257 Mon Sep 17 00:00:00 2001
From: "Wang, Mengni"
Date: Mon, 29 Sep 2025 16:55:59 +0800
Subject: [PATCH 10/10] Update README.md

---
 .../quantization/auto_round/llama4/README.md  | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/examples/3.x_api/pytorch/multimodal-modeling/quantization/auto_round/llama4/README.md b/examples/3.x_api/pytorch/multimodal-modeling/quantization/auto_round/llama4/README.md
index 1db2c876fc4..a01933f0500 100644
--- a/examples/3.x_api/pytorch/multimodal-modeling/quantization/auto_round/llama4/README.md
+++ b/examples/3.x_api/pytorch/multimodal-modeling/quantization/auto_round/llama4/README.md
@@ -32,5 +32,5 @@ CUDA_VISIBLE_DEVICES=0 bash run_quant.sh --topology=llama4_mxfp4 --input_model=L
 ## 2. Benchmark
 
 ```bash
-NCCL_NVLS_ENABLE=0 CUDA_VISIBLE_DEVICES=0,1,2,3 bash run_benchmark.sh --topology=llama4_mxfp4 --input_model=saved_results/Llama-4-Scout-17B-16E-Instruct-w4g32/ --tasks=piqa --batch_size=1 --tp_size=4
+CUDA_VISIBLE_DEVICES=0,1,2,3 bash run_benchmark.sh --topology=llama4_mxfp4 --input_model=saved_results/Llama-4-Scout-17B-16E-Instruct-w4g32/ --tasks=piqa --batch_size=1 --tp_size=4
 ```
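
For reference, below is a minimal end-to-end sketch of the example in its final state (after all ten patches). It is a hypothetical driver, not part of the series; it assumes the defaults baked into the scripts above: `setup.sh` has already been run inside the container, the model was downloaded into the working directory, `run_quant.sh` defaults `--iters` to 0 and writes to `saved_results`, and the quantized checkpoint lands under the `-w4g32` suffix shown in the README.

```bash
#!/bin/bash
# Hypothetical end-to-end driver; all flags and paths are taken from the
# README and shell scripts in the patches above.
MODEL_DIR=Llama-4-Scout-17B-16E-Instruct

# Quantize to MXFP4 on a single GPU (run_quant.sh defaults iters to 0,
# i.e. no tuning iterations).
CUDA_VISIBLE_DEVICES=0 bash run_quant.sh \
    --topology=llama4_mxfp4 \
    --input_model=${MODEL_DIR}/

# Evaluate the quantized checkpoint with lm-eval via vLLM across 4 GPUs.
CUDA_VISIBLE_DEVICES=0,1,2,3 bash run_benchmark.sh \
    --topology=llama4_mxfp4 \
    --input_model=saved_results/${MODEL_DIR}-w4g32/ \
    --tasks=piqa \
    --batch_size=1 \
    --tp_size=4
```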