Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
@@ -0,0 +1,36 @@
# Step-by-Step

This example quantizes and validates the accuracy of Llama4.

# Prerequisites

## 1. Environment

```shell
docker run -d --gpus all -v ... --shm-size=100g --name llama4 -it nvcr.io/nvidia/pytorch:25.05-py3 /bin/bash
docker exec -it llama4 bash
git clone https://github.com/intel/neural-compressor.git
cd neural-compressor/examples/3.x_api/pytorch/multimodal-modeling/quantization/auto_round/llama4
bash setup.sh
```

## 2. Prepare Model

```shell
hf download meta-llama/Llama-4-Scout-17B-16E-Instruct --local-dir Llama-4-Scout-17B-16E-Instruct
```

# Run

## 1. Quantization

```bash
CUDA_VISIBLE_DEVICES=0 bash run_quant.sh --topology=llama4_mxfp4 --input_model=Llama-4-Scout-17B-16E-Instruct/
```


## 2. Benchmark

```bash
CUDA_VISIBLE_DEVICES=0,1,2,3 bash run_benchmark.sh --topology=llama4_mxfp4 --input_model=saved_results/Llama-4-Scout-17B-16E-Instruct-w4g32/ --tasks=piqa --batch_size=1 --tp_size=4
```
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
auto-round @ git+https://github.com/intel/auto-round@v0.8.0rc
lm-eval==0.4.9.1
setuptools_scm
torchao==0.12.0
triton==3.3.1
Original file line number Diff line number Diff line change
@@ -0,0 +1,62 @@
#!/bin/bash
set -x

# Entry point: parse the command-line flags into globals, then run the
# lm_eval benchmark with them.
main() {
  init_params "$@"
  run_benchmark
}

# init params
# init params
# Parses --key=value command-line flags into the global variables read by
# run_benchmark: input_model, topology, tasks, tp_size, batch_size.
# Unrecognized flags are silently ignored.
function init_params {
  for var in "$@"
  do
    case $var in
      --input_model=*)
          # ${var#*=} strips everything up to the first '=', so values that
          # themselves contain '=' survive intact (the old `cut -f2 -d=`
          # truncated them) and no subshell is forked per flag.
          input_model=${var#*=}
      ;;
      --topology=*)
          topology=${var#*=}
      ;;
      --tasks=*)
          tasks=${var#*=}
      ;;
      --tp_size=*)
          tp_size=${var#*=}
      ;;
      --batch_size=*)
          batch_size=${var#*=}
      ;;
    esac
  done

}

# run_benchmark
# run_benchmark
# Evaluates the model with lm_eval on a vLLM backend.
# Globals read (set by init_params): input_model, topology, tasks, tp_size,
# batch_size (defaults to 1 when unset/empty).
function run_benchmark {

    extra_model_args=""
    extra_cmd=""
    batch_size=${batch_size:=1}

    if [[ "${topology}" == "llama4_mxfp4" ]]; then
        extra_model_args="max_model_len=8192,max_num_seqs=1024,max_gen_toks=2048,kv_cache_dtype=auto,gpu_memory_utilization=0.7"
        extra_cmd="--gen_kwargs max_gen_toks=2048"
    fi

    # Multimodal tasks need the vision-language backend and a chat template.
    if [[ "${tasks}" == *"chartqa"* || "${tasks}" == *"mmmu_val"* ]]; then
        model="vllm-vlm"
        extra_cmd="${extra_cmd} --apply_chat_template"
    else
        model="vllm"
    fi

    # Assemble --model_args piecewise so an empty extra_model_args does not
    # leave a stray ',,' in the string (the previous inline interpolation
    # produced '...,tensor_parallel_size=N,,enable_expert_parallel=True'
    # whenever topology was not llama4_mxfp4).
    model_args="pretrained=${input_model},tensor_parallel_size=${tp_size}"
    if [[ -n "${extra_model_args}" ]]; then
        model_args="${model_args},${extra_model_args}"
    fi
    model_args="${model_args},enable_expert_parallel=True"

    # shellcheck disable=SC2086 -- extra_cmd intentionally word-splits into flags
    NCCL_NVLS_ENABLE=0 VLLM_USE_STANDALONE_COMPILE=0 VLLM_WORKER_MULTIPROC_METHOD=spawn \
    lm_eval --model ${model} \
            --model_args "${model_args}" \
            --tasks ${tasks} \
            --batch_size ${batch_size} \
            ${extra_cmd}
}

main "$@"
Original file line number Diff line number Diff line change
@@ -0,0 +1,58 @@
#!/bin/bash
set -x

# Entry point: parse the command-line flags into globals, then launch the
# auto-round quantization run.
main() {
  init_params "$@"
  run_tuning
}

# init params
# init params
# Parses --key=value command-line flags into the global variables read by
# run_tuning: topology, iters, dataset_location, input_model, and
# tuned_checkpoint (from --output_model). Exits 1 on any unknown flag.
function init_params {
  for var in "$@"
  do
    case $var in
      --topology=*)
          # ${var#*=} strips everything up to the first '=', so values that
          # themselves contain '=' survive intact (the old `cut -f2 -d=`
          # truncated them) and no subshell is forked per flag.
          topology=${var#*=}
      ;;
      --iters=*)
          iters=${var#*=}
      ;;
      --dataset_location=*)
          dataset_location=${var#*=}
      ;;
      --input_model=*)
          input_model=${var#*=}
      ;;
      --output_model=*)
          tuned_checkpoint=${var#*=}
      ;;
      *)
          echo "Error: No such parameter: ${var}"
          exit 1
      ;;
    esac
  done

}

# run_tuning
# run_tuning
# Quantizes the model with auto-round and saves it in llm_compressor format.
# Globals read (set by init_params): topology, input_model, iters,
# tuned_checkpoint (defaults to "saved_results"; iters defaults to 0).
function run_tuning {
    extra_cmd=""
    tuned_checkpoint=${tuned_checkpoint:="saved_results"}
    # iters=0 skips the iterative tuning loop (round-to-nearest style run).
    iters=${iters:=0}

    if [[ "${topology}" == "llama4_mxfp4" ]]; then
        # Keep precision-sensitive layers (head, attention, router, vision
        # tower, projector, shared experts) unquantized; everything else MXFP4.
        extra_cmd="--fp_layers lm-head,self_attn,router,vision_model,multi_modal_projector,shared_expert --scheme MXFP4"
    fi

    # Quote path-valued arguments so models/checkpoints in directories with
    # spaces work; extra_cmd stays unquoted on purpose so it splits into flags.
    # shellcheck disable=SC2086
    python3 -m auto_round \
        --model "${input_model}" \
        --iters "${iters}" \
        --format "llm_compressor" \
        --output_dir "${tuned_checkpoint}" \
        ${extra_cmd}
}

main "$@"
Original file line number Diff line number Diff line change
@@ -0,0 +1,8 @@
#!/bin/bash
# Environment setup for the Llama4 quantization example: install Python
# dependencies, then build a vLLM fork that carries MXFP4 support.
# Abort on the first failing step so a failed clone/cd cannot let the later
# `pip install .` run against the wrong directory.
set -e

pip install -r requirements.txt
pip install setuptools --upgrade
pip install packaging --upgrade
pip install -U "huggingface_hub[cli]"

# MXFP4-enabled vLLM fork; VLLM_USE_PRECOMPILED reuses prebuilt kernels
# instead of compiling CUDA sources from scratch.
git clone -b mxfp4 https://github.com/mengniwang95/vllm-fork.git
cd vllm-fork || exit 1
VLLM_USE_PRECOMPILED=1 pip install . -vvv --no-build-isolation
cd ..