From 2e50295b78c171ed4cb23c1b895af9f19a5d7daf Mon Sep 17 00:00:00 2001
From: "He, Xin3"
Date: Tue, 25 Nov 2025 03:32:40 -0500
Subject: [PATCH 01/10] autotune target_bits example for llama recipe

Signed-off-by: He, Xin3
---
 .../quantization/auto_round/llama3/README.md  |  217 +
 .../auto_round/llama3/quantize.py             |  260 +
 .../llama3}/requirements.txt                  |    0
 .../quantization/mix-precision/README.md      |  125 -
 .../quantization/mix-precision/quantize.py    |  261 -
 .../Meta-Llama-3.1-8B-Instruct_7bits.json     | 2242 -------
 .../Meta-Llama-3.3-70B-Instruct_5bits.json    | 5602 -----------------
 .../quantization/mix-precision/run_hf_inf.py  |   29 -
 neural_compressor/common/base_config.py       |    3 +-
 .../torch/algorithms/weight_only/autoround.py |   17 +-
 .../torch/quantization/config.py              |  178 +-
 .../torch/utils/auto_accelerator.py           |    3 +-
 12 files changed, 508 insertions(+), 8429 deletions(-)
 create mode 100644 examples/pytorch/nlp/huggingface_models/language-modeling/quantization/auto_round/llama3/README.md
 create mode 100644 examples/pytorch/nlp/huggingface_models/language-modeling/quantization/auto_round/llama3/quantize.py
 rename examples/pytorch/nlp/huggingface_models/language-modeling/quantization/{mix-precision => auto_round/llama3}/requirements.txt (100%)
 delete mode 100644 examples/pytorch/nlp/huggingface_models/language-modeling/quantization/mix-precision/README.md
 delete mode 100644 examples/pytorch/nlp/huggingface_models/language-modeling/quantization/mix-precision/quantize.py
 delete mode 100644 examples/pytorch/nlp/huggingface_models/language-modeling/quantization/mix-precision/recipes/Meta-Llama-3.1-8B-Instruct_7bits.json
 delete mode 100644 examples/pytorch/nlp/huggingface_models/language-modeling/quantization/mix-precision/recipes/Meta-Llama-3.3-70B-Instruct_5bits.json
 delete mode 100644 examples/pytorch/nlp/huggingface_models/language-modeling/quantization/mix-precision/run_hf_inf.py

diff --git a/examples/pytorch/nlp/huggingface_models/language-modeling/quantization/auto_round/llama3/README.md b/examples/pytorch/nlp/huggingface_models/language-modeling/quantization/auto_round/llama3/README.md
new file mode 100644
index 00000000000..d33d8090ed3
--- /dev/null
+++ b/examples/pytorch/nlp/huggingface_models/language-modeling/quantization/auto_round/llama3/README.md
@@ -0,0 +1,217 @@
+# Step-by-step
+
+In this example, you can verify accuracy on HPU/CUDA devices with emulation of MXFP4, MXFP8, NVFP4, and uNVFP4.
+
+## Requirements
+
+```bash
+# neural-compressor-pt
+pip install "neural-compressor-pt>=3.6"
+# auto-round
+pip install "auto-round>=0.8.0"
+# other requirements
+pip install -r requirements.txt
+```
+
+## Quantization
+
+### Demo (`MXFP4`, `MXFP8`, `NVFP4`, `uNVFP4`)
+
+```bash
+CUDA_VISIBLE_DEVICES=0 python quantize.py \
+    --model_name_or_path facebook/opt-125m \
+    --quantize \
+    --dtype MXFP8 \
+    --enable_torch_compile \
+    --low_gpu_mem_usage \
+    --export_format auto_round \
+    --export_path OPT-125M-MXFP8 \
+    --accuracy \
+    --tasks lambada_openai \
+    --eval_batch_size 8
+```
+
+Notes:
+- Use `--export_format auto_round` for the `MXFP4` and `MXFP8` data types, then run inference as shown [below](#mxfp4--mxfp8).
+- Use `--export_format llm_compressor` for the `NVFP4` data type, since public vLLM supports it.
+- Use `--export_format fake` for the `uNVFP4` data type, since it is not yet fully supported.
+- Setting `--quant_lm_head` applies `--dtype` to the lm_head layer.
+- Setting `--iters 0` skips AutoRound tuning and uses the RTN method.
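+
+Under the hood, `quantize.py` builds an `AutoRoundConfig` and runs `prepare`/`convert` from `neural_compressor.torch.quantization` (see the script added in this patch). A rough sketch of that flow, mirroring the demo command above with illustrative argument values:
+
+```python
+import transformers
+from neural_compressor.torch.quantization import AutoRoundConfig, convert, prepare
+
+model_name = "facebook/opt-125m"  # demo model from the command above
+model = transformers.AutoModelForCausalLM.from_pretrained(model_name)
+tokenizer = transformers.AutoTokenizer.from_pretrained(model_name)
+
+config = AutoRoundConfig(
+    tokenizer=tokenizer,
+    scheme="MXFP8",               # corresponds to --dtype
+    iters=0,                      # 0 = RTN instead of AutoRound tuning
+    export_format="auto_round",   # corresponds to --export_format
+    output_dir="OPT-125M-MXFP8",  # corresponds to --export_path
+)
+model = prepare(model, config)
+model = convert(model)  # quantizes the model and exports it to output_dir
+```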
+
+
+#### Target_bits
+
+To achieve optimal compression ratios in mixed-precision quantization, we provide the `--target_bits` argument for automated precision configuration.
+
+- If you pass a single float number, it automatically generates an optimal quantization recipe that achieves the target average bit-width.
+- If you pass multiple float numbers, it generates one recipe per target bit-width, allowing you to compare the trade-offs between model size and accuracy.
+
+Example usage:
+
+```bash
+CUDA_VISIBLE_DEVICES=0 python quantize.py \
+    --model_name_or_path facebook/opt-125m \
+    --quantize \
+    --dtype MXFP4 \
+    --target_bits 6.5 7 7.3 \
+    --tune_limit 100 \
+    --enable_torch_compile \
+    --low_gpu_mem_usage \
+    --export_format auto_round \
+    --export_path OPT-125m-MXFP4-MXFP8 \
+    --accuracy \
+    --tasks lambada_openai \
+    --eval_batch_size 8
+```
+
+
+### Llama3 Quantization Recipes
+
+#### Llama 3.1 8B MXFP8
+
+AutoRound helps improve accuracy, so `iters` and `nsamples` are set higher than their defaults.
+```bash
+# Quantize and export AutoRound format
+CUDA_VISIBLE_DEVICES=0 python quantize.py \
+    --model_name_or_path /models/Meta-Llama-3.1-8B-Instruct \
+    --quantize \
+    --dtype MXFP8 \
+    --iters 1000 \
+    --nsamples 512 \
+    --enable_torch_compile \
+    --low_gpu_mem_usage \
+    --export_format auto_round \
+    --export_path Llama-3.1-8B-MXFP8
+```
+
+
+#### Llama 3.1 8B MXFP4 (Mixed with MXFP8)
+
+```bash
+CUDA_VISIBLE_DEVICES=0 python quantize.py \
+    --model_name_or_path /models/Meta-Llama-3.1-8B-Instruct \
+    --quantize \
+    --target_bits 7.8 \
+    --options "MXFP4" "MXFP8" \
+    --shared_layer "k_proj" "v_proj" "q_proj" \
+    --shared_layer "gate_proj" "up_proj" \
+    --enable_torch_compile \
+    --low_gpu_mem_usage \
+    --export_format auto_round \
+    --export_path Llama-3.1-8B-MXFP4-MXFP8
+```
+
+#### Llama 3.3 70B MXFP8
+
+```bash
+CUDA_VISIBLE_DEVICES=0 python quantize.py \
+    --model_name_or_path /models/Llama-3.3-70B-Instruct/ \
+    --quantize \
+    --dtype MXFP8 \
+    --quant_lm_head \
+    --iters 0 \
+    --enable_torch_compile \
+    --low_gpu_mem_usage \
+    --export_format auto_round \
+    --export_path Llama-3.3-70B-MXFP8
+```
+
+#### Llama 3.3 70B MXFP4 (Mixed with MXFP8)
+```bash
+CUDA_VISIBLE_DEVICES=0 python quantize.py \
+    --model_name_or_path /models/Llama-3.3-70B-Instruct/ \
+    --quantize \
+    --target_bits 5.8 \
+    --options "MXFP4" "MXFP8" \
+    --shared_layer "k_proj" "v_proj" "q_proj" \
+    --shared_layer "gate_proj" "up_proj" \
+    --enable_torch_compile \
+    --low_gpu_mem_usage \
+    --export_format auto_round \
+    --export_path Llama-3.3-70B-MXFP4-MXFP8
+```
+
+#### Llama 3.1 70B uNVFP4
+
+```bash
+CUDA_VISIBLE_DEVICES=0,1,2,3 python quantize.py \
+    --model_name_or_path /models/Llama-3.1-70B-Instruct/ \
+    --quantize \
+    --dtype uNVFP4 \
+    --quant_lm_head \
+    --iters 0 \
+    --enable_torch_compile \
+    --low_gpu_mem_usage \
+    --export_format fake \
+    --export_path Llama-3.1-70B-uNVFP4 \
+    --accuracy
+```
+
+Note: If you hit an OOM issue, either expose more devices via `CUDA_VISIBLE_DEVICES` or reduce `eval_batch_size`.
+
+## Inference
+
+### MXFP4 / MXFP8
+
+- Both pure MXFP4/MXFP8 models and mixed-precision models generated by target bits are supported.
+
+#### Prerequisite
+
+```bash
+# Install the forked vLLM
+git clone https://github.com/yiliu30/vllm-fork.git
+cd vllm-fork
+git checkout fused-moe-ar
+VLLM_USE_PRECOMPILED=1 pip install -e .
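+# Optional sanity check; assumes the fork keeps the standard vLLM packaging:
+python -c "import vllm; print(vllm.__version__)"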
+```
+
+#### Accuracy Evaluation
+```bash
+# add_bos_token=True helps accuracy for general tasks
+VLLM_ENABLE_AR_EXT=1 \
+TORCH_COMPILE_DISABLE=1 \
+CUDA_VISIBLE_DEVICES=0 \
+lm_eval --model vllm \
+    --model_args pretrained=Llama-3.1-8B-MXFP4-MXFP8,add_bos_token=True,tensor_parallel_size=1,data_parallel_size=1 \
+    --tasks piqa,hellaswag,mmlu \
+    --batch_size 8 &
+wait
+# add_bos_token=False helps accuracy for GSM8K
+VLLM_ENABLE_AR_EXT=1 \
+TORCH_COMPILE_DISABLE=1 \
+CUDA_VISIBLE_DEVICES=0 \
+lm_eval --model vllm \
+    --model_args pretrained=Llama-3.1-8B-MXFP4-MXFP8,add_bos_token=False,tensor_parallel_size=1,data_parallel_size=1 \
+    --tasks gsm8k \
+    --batch_size 8
+```
+
+Note: If you hit an OOM issue, either expose more devices via `CUDA_VISIBLE_DEVICES` together with a larger `tensor_parallel_size`, or reduce `batch_size`.
+
+### NVFP4
+NVFP4 is already supported by public vLLM; set `--export_format llm_compressor` when exporting during quantization.
+
+```bash
+CUDA_VISIBLE_DEVICES=0 lm_eval --model vllm \
+    --model_args pretrained={nvfp4_model_path},tensor_parallel_size=1,data_parallel_size=1 \
+    --tasks lambada_openai \
+    --batch_size 4
+```
+
+### uNVFP4
+uNVFP4 is saved in fake format, and reloading is not currently supported. To verify accuracy right after quantization, pass `--accuracy --tasks lambada_openai` on the command line.
+
+```bash
+CUDA_VISIBLE_DEVICES=0 python quantize.py \
+    --model_name_or_path facebook/opt-125m \
+    --quantize \
+    --dtype uNVFP4 \
+    --enable_torch_compile \
+    --low_gpu_mem_usage \
+    --export_format fake \
+    --export_path OPT-125M-uNVFP4 \
+    --accuracy \
+    --tasks lambada_openai \
+    --eval_batch_size 8 \
+    --device_map 0
+```
diff --git a/examples/pytorch/nlp/huggingface_models/language-modeling/quantization/auto_round/llama3/quantize.py b/examples/pytorch/nlp/huggingface_models/language-modeling/quantization/auto_round/llama3/quantize.py
new file mode 100644
index 00000000000..39d51edfc8f
--- /dev/null
+++ b/examples/pytorch/nlp/huggingface_models/language-modeling/quantization/auto_round/llama3/quantize.py
@@ -0,0 +1,260 @@
+# Copyright (c) 2025 Intel Corporation
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import argparse
+import os
+
+import torch
+import transformers
+
+# For reproducibility
+torch.manual_seed(42)
+torch.use_deterministic_algorithms(True, warn_only=True)
+######################## HPU Memory Optimization ###########################
+# Ensure that unnecessary memory is released during quantization.
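+# os.environ.setdefault only applies each knob when it is not already set in
+# the environment, so values exported by the user take precedence.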
+os.environ.setdefault("PT_HPU_LAZY_MODE", "1")
+os.environ.setdefault("PT_HPU_WEIGHT_SHARING", "0")
+if int(os.getenv("WORLD_SIZE", "0")) > 0:
+    os.environ.setdefault("PT_HPU_LAZY_ACC_PAR_MODE", "0")
+    os.environ.setdefault("PT_HPU_ENABLE_LAZY_COLLECTIVES", "true")
+from neural_compressor.torch.utils import is_hpex_available, world_size
+from neural_compressor.torch.quantization import autotune, prepare, convert, AutoRoundConfig, TuningConfig
+
+if is_hpex_available():
+    import habana_frameworks.torch.core as htcore
+    from habana_frameworks.torch.hpu import wrap_in_hpu_graph
+
+    htcore.hpu_set_env()
+############################################################################
+
+
+def initialize_model_and_tokenizer(model_name_or_path):
+    tokenizer = transformers.AutoTokenizer.from_pretrained(model_name_or_path)
+    config = transformers.AutoConfig.from_pretrained(model_name_or_path)
+    # use memory mapping with torch_dtype=config.torch_dtype
+    model = transformers.AutoModelForCausalLM.from_pretrained(model_name_or_path, torch_dtype=config.torch_dtype)
+    model.eval()
+    return model, tokenizer
+
+
+def dispatch_model_on_devices(model):
+    from accelerate.big_modeling import dispatch_model, infer_auto_device_map
+    from accelerate.utils import get_balanced_memory
+
+    no_split_modules = getattr(model, "_no_split_modules", [])
+    balanced_memory = get_balanced_memory(model)  # balanced max_memory mapping across available devices
+    auto_device_map = infer_auto_device_map(
+        model,
+        max_memory=balanced_memory,
+        no_split_module_classes=no_split_modules
+    )
+    print(auto_device_map)
+    model = dispatch_model(model, auto_device_map)
+    return model
+
+
+@torch.no_grad()
+def get_accuracy(model_name_or_path, tokenizer=None, limit=None):
+    os.environ.setdefault("TOKENIZERS_PARALLELISM", "false")
+    all_accuracy = {}
+    test_gsm8k = False
+    test_normal = False
+    if "gsm8k" in args.tasks:
+        test_gsm8k = True
+        args.tasks.remove("gsm8k")
+    if args.tasks:
+        test_normal = True
+    import lm_eval
+    from lm_eval.models.huggingface import HFLM
+
+    ########################## gsm8k (ahead of normal tasks) #########################
+    if test_gsm8k:
+        lm = HFLM(
+            pretrained=model_name_or_path,
+            tokenizer=tokenizer,
+            add_bos_token=False,
+            batch_size=args.eval_batch_size,
+        )
+        results_gsm8k = lm_eval.simple_evaluate(
+            lm,
+            tasks=["gsm8k"],
+            limit=args.limit if limit is None else limit,
+        )
+        for task_name, task_results in results_gsm8k["results"].items():
+            accu = task_results["exact_match,strict-match"]
+            all_accuracy[task_name] = accu
+    ########################## gsm8k end #########################
+    if test_normal:
+        lm = HFLM(
+            pretrained=model_name_or_path,
+            tokenizer=tokenizer,
+            add_bos_token=True,
+            batch_size=args.eval_batch_size,
+        )
+        results = lm_eval.simple_evaluate(
+            lm,
+            tasks=args.tasks,
+            limit=args.limit if limit is None else limit,
+        )
+        for task_name, task_results in results["results"].items():
+            if "acc,none" in task_results:
+                accu = task_results["acc,none"]
+                all_accuracy[task_name] = accu
+    for task_name, accu in all_accuracy.items():
+        print(f"Accuracy for {task_name}: {accu:.4f}")
+    avg_accu = sum(all_accuracy.values())/len(all_accuracy)
+    print(f"Overall accuracy: {avg_accu:.4f}")
+    return avg_accu
+
+
+if __name__ == "__main__":
+    parser = argparse.ArgumentParser(
+        description="AutoRound quantization example for MXFP4/MXFP8/NVFP4/uNVFP4.",
+        formatter_class=argparse.ArgumentDefaultsHelpFormatter,
+    )
+    parser.add_argument(
+        "--model_name_or_path", type=str, default="meta-llama/Meta-Llama-3.1-8B-Instruct", help="model name or path"
+    )
+    parser.add_argument("--dtype", type=str, default="MXFP4", choices=["MXFP4", "MXFP8", "NVFP4", "NVFP4+", "uNVFP4"], help="data type")
+    parser.add_argument("--quantize", action="store_true", help="whether to quantize model")
+    parser.add_argument("--device_map", type=str, default="auto", help="device map for model")
+    parser.add_argument(
+        "--target_bits",
+        type=float,
+        nargs="+",
+        default=None,
+        help="target bits for mixed precision"
+    )
+    parser.add_argument("--tolerable_loss", type=float, default=0.01,
+                        help="tolerable loss for accuracy autotune, relative value to the fp32 baseline")
+    parser.add_argument(
+        "--options",
+        type=str,
+        nargs="+",
+        default=[
+            "MXFP4",
+            "MXFP8",
+        ],
+        help="candidate data types for mixed precision"
+    )
+    parser.add_argument(
+        "--shared_layer",
+        type=str,
+        nargs="+",
+        action="append",
+        default=[],
+        help="[mixed-precision] ensure that the listed layers use the same data type for quantization"
+    )
+    parser.add_argument("--use_recipe", action="store_true", help="whether to use recipe to quantize model")
+    parser.add_argument("--recipe_file", type=str, default="recipes/Meta-Llama-3.1-8B-Instruct_6bits.json", help="path of recipe file")
+    parser.add_argument("--iters", default=200, type=int, help="iters for autoround.")
+    parser.add_argument("--seqlen", default=2048, type=int, help="sequence length for autoround.")
+    parser.add_argument("--nsamples", default=128, type=int, help="number of samples for autoround.")
+    parser.add_argument("--save", action="store_true", help="whether to save the quantized model")
+    parser.add_argument("--export_path", type=str, default="saved_results", help="path to save the quantized model")
+    parser.add_argument("--export_format", type=str, default="auto_round", help="format to save the quantized model")
+    parser.add_argument("--enable_torch_compile", action="store_true", help="whether to enable torch.compile")
+    parser.add_argument("--low_gpu_mem_usage", action="store_true", help="whether to enable low_gpu_mem_usage")
+    parser.add_argument("--quant_lm_head", action="store_true", help="whether to quantize lm_head")
+    parser.add_argument("--accuracy", action="store_true", help="accuracy measurement")
+    parser.add_argument("--local_rank", type=int, default=0, metavar="N", help="Local process rank.")
+    parser.add_argument("--eval_batch_size", default=16, type=int, help="batch size for accuracy evaluation.")
+    parser.add_argument(
+        "--tasks",
+        type=str,
+        nargs="+",
+        default=[
+            "piqa",
+            "hellaswag",
+            "mmlu",
+            "gsm8k",
+        ],
+        help="tasks for accuracy validation, text-generation and code-generation tasks are different.",
+    )
+    parser.add_argument("--limit", type=int, default=None, help="number of samples for accuracy evaluation")
+    parser.add_argument("--tune_limit", type=int, default=100, help="number of samples for accuracy autotune")
+    args = parser.parse_args()

+    if args.target_bits is None:
+        print("Target data type:", args.dtype)
+    else:
+        print("Target data type for mixed precision:", args.options)
+        print("Layers sharing the same data type:", args.shared_layer)
+    model, tokenizer = initialize_model_and_tokenizer(args.model_name_or_path)
+
+    if args.quantize:
+        if args.dtype in ["uNVFP4", "NVFP4+"]:
+            from auto_round.schemes import QuantizationScheme
+
+            uNVFP4 = QuantizationScheme.from_dict(
+                {
+                    "bits": 4,
+                    "group_size": 16,
+                    "data_type": "fp4_v2",
+                    "act_bits": 4,
+                    "act_data_type": "fp4_v2",
+                    "act_group_size": 16,
+                    "act_sym": True,
+                }
+            )
+            args.dtype = uNVFP4
+
+        layer_config = {}
+        if args.use_recipe:
+            ############ load recipe results (MXFP4 + MXFP8) ############
+            def load_recipe_results(file_path):
+                import json
+                with open(file_path, "r") as f:
+                    return json.load(f)
+
+            layer_config = load_recipe_results(args.recipe_file)
+            if args.quant_lm_head:
+                # ensure lm_head is quantized with the same scheme as --dtype
+                layer_config.update({"lm_head": args.dtype})
+
+        # preprocess
+        if isinstance(args.target_bits, list) and len(args.target_bits) == 1:
+            args.target_bits = args.target_bits[0]
+        config = AutoRoundConfig(
+            tokenizer=tokenizer,
+            iters=args.iters,
+            seqlen=args.seqlen,
+            nsamples=args.nsamples,
+            scheme=args.dtype,
+            target_bits=args.target_bits,
+            options=args.options,
+            shared_layers=args.shared_layer,
+            enable_torch_compile=args.enable_torch_compile,
+            low_gpu_mem_usage=args.low_gpu_mem_usage,
+            export_format=args.export_format,
+            output_dir=args.export_path,
+            device_map=args.device_map,
+            layer_config=layer_config if (args.use_recipe or args.quant_lm_head) else None,
+        )
+        if isinstance(args.target_bits, list) and len(args.target_bits) > 1:
+            def eval_fn(model):
+                model = model.eval()
+                model = dispatch_model_on_devices(model)
+                accu = get_accuracy(model, tokenizer, args.tune_limit)
+                model = model.to("cpu")
+                return accu
+            tuning_config = TuningConfig(config_set=[config], tolerable_loss=args.tolerable_loss)
+            model = autotune(model, tuning_config, eval_fn=eval_fn)
+        else:
+            model = prepare(model, config)
+            model = convert(model)
+        print(f"Quantized model in {args.export_format} format is saved to {args.export_path}")
+
+    if args.accuracy:
+        model = dispatch_model_on_devices(model)
+        get_accuracy(model, tokenizer)
diff --git a/examples/pytorch/nlp/huggingface_models/language-modeling/quantization/mix-precision/requirements.txt b/examples/pytorch/nlp/huggingface_models/language-modeling/quantization/auto_round/llama3/requirements.txt
similarity index 100%
rename from examples/pytorch/nlp/huggingface_models/language-modeling/quantization/mix-precision/requirements.txt
rename to examples/pytorch/nlp/huggingface_models/language-modeling/quantization/auto_round/llama3/requirements.txt
diff --git a/examples/pytorch/nlp/huggingface_models/language-modeling/quantization/mix-precision/README.md b/examples/pytorch/nlp/huggingface_models/language-modeling/quantization/mix-precision/README.md
deleted file mode 100644
index 52d043474d1..00000000000
--- a/examples/pytorch/nlp/huggingface_models/language-modeling/quantization/mix-precision/README.md
+++ /dev/null
@@ -1,125 +0,0 @@
-# Run
-
-In this example, you can verify the accuracy on HPU/CUDA device with emulation of MXFP4, MXFP8, NVFP4 and uNVFP4. 
- -## Requirement - -```bash -# neural-compressor-pt -pip install neural-compressor-pt==3.6 -# auto-round -pip install auto-round==0.8.0 -# other requirements -pip install -r requirements.txt -``` -**Before neural-compressor v3.6 and auto-round v0.8.0 release, please install from source for the latest updates:** -```bash -# neural-compressor-pt -INC_PT_ONLY=1 pip install git+https://github.com/intel/neural-compressor.git@v3.6rc -# auto-round -pip install git+https://github.com/intel/auto-round.git@v0.8.0rc -# other requirements -pip install -r requirements.txt -``` - -## Quantization - -### Demo (`MXFP4`, `MXFP8`, `NVFP4`, `uNVFP4`) - -```bash -python quantize.py --model_name_or_path facebook/opt-125m --quantize --dtype MXFP4 --batch_size 8 --accuracy --enable_torch_compile -``` - -### Mix-precision Quantization (`MXFP4 + MXFP8`) - -```bash -# Llama 3.1 8B -python quantize.py \ - --model_name_or_path meta-llama/Llama-3.1-8B-Instruct \ - --quantize \ - --dtype MXFP4 \ - --use_recipe \ - --recipe_file recipes/Meta-Llama-3.1-8B-Instruct_7bits.json \ - --accuracy \ - --batch_size 32 \ - --enable_torch_compile - -# Llama 3.3 70B -deepspeed --include="localhost:0,1,2,3" --master_port=29500 quantize.py \ - --model_name_or_path meta-llama/Llama-3.3-70B-Instruct/ \ - --quantize \ - --dtype MXFP4 \ - --use_recipe \ - --recipe_file recipes/Meta-Llama-3.3-70B-Instruct_5bits.json \ - --accuracy \ - --batch_size 32 -``` - -> Note: -> 1. Quantization applies `--dtype` for all blocks in the model by removing `--use_recipe`. -> 2. Setting `--quant_lm_head` applies `--dtype` for the lm_head layer. -> 3. Setting `--iters 0` skips AutoRound tuning and uses RTN method. -> 4. The `deepspeed` usage provides quick accuracy verification. - -## Inference usage - -### NVFP4 -NVFP4 is supported by vLLM already, the saved model in this example follows the `llm_compressor` format, please refer to the usage in the public vLLM document. - -```bash -# Command to save model: -python quantize.py --model_name_or_path facebook/opt-125m --quantize --dtype NVFP4 --batch_size 8 --save --save_path opt-125m-nvfp4 --save_format llm_compressor -``` - -### MXFP4 / MXFP8 -MXFP4 and MXFP8 is enabled in a forked vLLM repo, usages as below: -```bash -# Install the forked vLLM -git clone -b cuda-mxfp8-moe --single-branch --quiet https://github.com/yiliu30/vllm-fork.git && cd vllm-fork -USE_CPP=0 VLLM_USE_PRECOMPILED=1 pip install -e . -vvv && cd - - -# Command to save model: -python quantize.py \ - --model_name_or_path meta-llama/Llama-3.3-70B-Instruct/ \ - --quantize \ - --iters 0 \ - --dtype MXFP4 \ - --save_path Llama-3.3-70B-Instruct-MXFP4 \ - --save \ - --save_format llm_compressor - -# Command to inference with vLLM: -CUDA_VISIBLE_DEVICES=0,1 VLLM_USE_V1=0 VLLM_USE_MXFP4_CT_EMULATIONS=1 VLLM_LOGGING_LEVEL=DEBUG \ -vllm serve Llama-3.3-70B-Instruct-MXFP4 --tensor-parallel-size=2 --port 7777 --host localhost --trust-remote-code --dtype bfloat16 --enforce-eager -export no_proxy="localhost, 127.0.0.1, ::1" -curl -X POST http://localhost:7777/v1/completions \ - -H "Content-Type: application/json" \ - -d '{ - "model": "/model_path/Llama-3.3-70B-Instruct-MXFP4", - "prompt": "Solve the following math problem step by step: What is 25 + 37? 
Please answer directly with the result.", - "max_tokens": 100, - "temperature": 0.7, - "top_p": 1.0 - }' -``` -> Note: To inference with transformers, please save model with `--save_format auto_round` and try `python run_hf_inf.py ${model_name_or_path}` - -### MXFP4 + MXFP8 -Model with mixed precision is not supported in vLLM, but supported in transformers in `auto-round` format. - -```bash -# Command to save model: -python quantize.py \ - --model_name_or_path meta-llama/Llama-3.1-8B-Instruct \ - --quantize \ - --dtype MXFP4 \ - --use_recipe \ - --recipe_file recipes/Meta-Llama-3.1-8B-Instruct_7bits.json \ - --save \ - --save_format auto_round \ - --save_path Llama-3.1-8B-Instruct-MXFP4-MXFP8-AR \ - --enable_torch_compile - -# Command to inference with transformer: -python run_hf_inf.py Llama-3.1-8B-Instruct-MXFP4-MXFP8-AR -``` diff --git a/examples/pytorch/nlp/huggingface_models/language-modeling/quantization/mix-precision/quantize.py b/examples/pytorch/nlp/huggingface_models/language-modeling/quantization/mix-precision/quantize.py deleted file mode 100644 index 7cd4bc9996a..00000000000 --- a/examples/pytorch/nlp/huggingface_models/language-modeling/quantization/mix-precision/quantize.py +++ /dev/null @@ -1,261 +0,0 @@ -# Copyright (c) 2025 Intel Corporation -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import argparse -import os - -import torch -import transformers - -# For reproducibility -torch.manual_seed(42) -torch.use_deterministic_algorithms(True, warn_only=True) -######################## HPU Memory Optimization ########################### -# ensure that unnecessary memory is released during quantization. 
-os.environ.setdefault("PT_HPU_LAZY_MODE", "1") -os.environ.setdefault("PT_HPU_WEIGHT_SHARING", "0") -if int(os.getenv("WORLD_SIZE", "0")) > 0: - os.environ.setdefault("PT_HPU_LAZY_ACC_PAR_MODE", "0") - os.environ.setdefault("PT_HPU_ENABLE_LAZY_COLLECTIVES", "true") -from neural_compressor.torch.utils import is_hpex_available, world_size -from auto_round import AutoRound - -if is_hpex_available(): - import habana_frameworks.torch.core as htcore - from habana_frameworks.torch.hpu import wrap_in_hpu_graph - - htcore.hpu_set_env() -############################################################################ - - -def initialize_model_and_tokenizer(model_name_or_path): - tokenizer = transformers.AutoTokenizer.from_pretrained(model_name_or_path) - config = transformers.AutoConfig.from_pretrained(model_name_or_path) - # using memory mapping with torch_dtype=config.torch_dtype - model = transformers.AutoModelForCausalLM.from_pretrained(model_name_or_path, torch_dtype=config.torch_dtype) - # shard model for multi-cards and enable hpu graph - - if world_size > 1: - ds_inference_kwargs = { - "dtype": config.torch_dtype, - "tensor_parallel": {"tp_size": world_size}, - } - import deepspeed - - ds_model = deepspeed.init_inference(model, **ds_inference_kwargs) - model = ds_model.module - model.eval() - return model, tokenizer - - -if __name__ == "__main__": - parser = argparse.ArgumentParser( - description="Habana FP8 quantization.", formatter_class=argparse.ArgumentDefaultsHelpFormatter - ) - parser.add_argument( - "--model_name_or_path", type=str, default="meta-llama/Meta-Llama-3.1-8B-Instruct", help="model name or path" - ) - parser.add_argument("--dtype", type=str, default="MXFP4", choices=["MXFP4", "MXFP8", "NVFP4", "NVFP4+", "uNVFP4"], help="data type") - parser.add_argument("--quantize", action="store_true", help="whether to quantize model") - parser.add_argument("--device_map", type=str, default=None, help="device map for model") - parser.add_argument("--use_recipe", action="store_true", help="whether to use recipe to quantize model") - parser.add_argument("--recipe_file", type=str, default="recipes/Meta-Llama-3.1-8B-Instruct_6bits.json", help="path of recipe file") - parser.add_argument("--mem_per_param_scale", default=13, type=int, help="memory per param scale factor") - parser.add_argument("--iters", default=200, type=int, help="iters for autoround.") - parser.add_argument("--seqlen", default=2048, type=int, help="sequence length for autoround.") - parser.add_argument("--nsamples", default=128, type=int, help="number of samples for autoround.") - parser.add_argument("--save", action="store_true", help="whether to save the quantized model") - parser.add_argument("--save_path", type=str, default="saved_results", help="path to save the quantized model") - parser.add_argument("--save_format", type=str, default="auto_round", help="format to save the quantized model") - parser.add_argument("--enable_torch_compile", action="store_true", help="whether to enable torch.compile") - parser.add_argument("--quant_lm_head", action="store_true", help="whether to quantize lm_head") - parser.add_argument("--accuracy", action="store_true", help="accuracy measurement") - parser.add_argument("--local_rank", type=int, default=0, metavar="N", help="Local process rank.") - parser.add_argument("--batch_size", default=32, type=int, help="batch size for accuracy evaluation.") - parser.add_argument( - "--tasks", - type=str, - nargs="+", - default=[ - "piqa", - "hellaswag", - "mmlu", - "winogrande", - "lambada_openai", - 
], - help="tasks for accuracy validation, text-generation and code-generation tasks are different.", - ) - parser.add_argument("--limit", type=int, default=None, help="number of samples for accuracy evaluation") - args = parser.parse_args() - - print("Target data type:", args.dtype) - - model, tokenizer = initialize_model_and_tokenizer(args.model_name_or_path) - device="hpu" if is_hpex_available() else "cuda" - # in case that model is set to cuda:0 by default - if args.device_map.isdigit() and device=="cuda": - device = f"{device}:{args.device_map}" - - if args.quantize: - if args.dtype in ["uNVFP4", "NVFP4+"]: - from auto_round.schemes import QuantizationScheme - - uNVFP4 = QuantizationScheme.from_dict( - { - "bits": 4, - "group_size": 16, - "data_type": "fp4_v2", - "act_bits": 4, - "act_data_type": "fp4_v2", - "act_group_size": 16, - "act_sym": True, - } - ) - args.dtype = uNVFP4 - - if args.quant_lm_head: - layer_config = {"lm_head": args.dtype} - - autoround = AutoRound( - model, - tokenizer, - device=device, - device_map="tp" if world_size > 1 else args.device_map, - iters=args.iters, - seqlen=args.seqlen, - nsamples=args.nsamples, - low_gpu_mem_usage=True, - scheme=args.dtype, - layer_config=layer_config if args.quant_lm_head else None, - enable_torch_compile=args.enable_torch_compile, - mem_per_param_scale=args.mem_per_param_scale, - ) - - if args.use_recipe: - ############ load recipe results (MXFP4 + MXFP8) ############ - def load_recipe_results(file_path): - import json - with open(file_path, "r") as f: - return json.load(f) - - layer_config = load_recipe_results(args.recipe_file) - if args.quant_lm_head: - # ensure lm_head is quantized with mxfp8_config - layer_config.update({"lm_head": "MXFP8"}) - print("In recipe mode, lm_head is quantized with MXFP8.") - autoround.layer_config = layer_config - - # A placeholder, to pass assertion in AutoRound - autoround.formats = "auto_round" - autoround.quantize() - model = autoround.model - - if args.accuracy: - # set dtype to BF16 for HPU inference performance - model = model.to(torch.bfloat16) - model = model.eval().to(device) - if is_hpex_available(): - # HPU needs padding to buckets for better performance - # Generation tasks, such as gsm8k and mmlu-pro, may get OOM. - model = wrap_in_hpu_graph(model) - htcore.hpu_inference_initialize(model, mark_only_scales_as_const=True) - from neural_compressor.evaluation.lm_eval import LMEvalParser, evaluate - - tasks = ",".join(args.tasks) - eval_args = LMEvalParser( - model="hf", - user_model=model, - tokenizer=tokenizer, - batch_size=args.batch_size, - tasks=tasks, - device="hpu", - pad_to_buckets=True, - limit=args.limit, - add_bos_token=True, - ) - results = evaluate(eval_args) - torch.hpu.synchronize() - all_accuracy = {} - for task_name, task_results in results["results"].items(): - if task_name in ["hellaswag", "lambada_openai", "piqa", "winogrande", "mmlu"]: - accu = task_results["acc,none"] - all_accuracy[task_name] = accu - print(f"Accuracy for {task_name}: {accu:.4f}") - print(f"Overall accuracy: {sum(all_accuracy.values())/len(all_accuracy):.4f}") - else: - # CUDA evaluation support all tasks. - # gsm8k requires add_bos_token=False for better accuracy for llama model. 
- args.tasks = ["piqa", "hellaswag", "mmlu", "gsm8k"] - all_accuracy = {} - test_gsm8k = False - test_normal = False - if "gsm8k" in args.tasks: - test_gsm8k = True - args.tasks.remove("gsm8k") - if args.tasks: - test_normal = True - import lm_eval - from lm_eval.models.huggingface import HFLM - - ########################## gms8k (ahead of normal tasks) ######################### - if test_gsm8k: - lm = HFLM( - pretrained=model, - tokenizer=tokenizer, - add_bos_token=False, - batch_size=args.batch_size, - ) - results_gsm8k = lm_eval.simple_evaluate( - lm, - tasks=["gsm8k"], - limit=args.limit, - ) - for task_name, task_results in results_gsm8k["results"].items(): - accu = task_results["exact_match,strict-match"] - all_accuracy[task_name] = accu - ########################## gms8k end ######################### - if test_normal: - lm = HFLM( - pretrained=model, - tokenizer=tokenizer, - add_bos_token=True, - batch_size=args.batch_size, - ) - results = lm_eval.simple_evaluate( - lm, - tasks=args.tasks, - limit=args.limit, - ) - for task_name, task_results in results["results"].items(): - if task_name in ["hellaswag", "lambada_openai", "piqa", "winogrande", "mmlu"]: - accu = task_results["acc,none"] - all_accuracy[task_name] = accu - for task_name, accu in all_accuracy.items(): - print(f"Accuracy for {task_name}: {accu:.4f}") - print(f"Overall accuracy: {sum(all_accuracy.values())/len(all_accuracy):.4f}") - - if args.save: - if args.dtype == "NVFP4": - # using llm_compressor format to save nv_fp4 model - autoround.save_quantized(args.save_path, format=args.save_format) - else: - # using auto_round format to save mx_fp4 and mx_fp8 model - if world_size > 1: - print(f"Suggest to save model without sharding for better reload experience.") - print(f"Setting`--device_map 0,1,2,3` provides pipeline parallel instead of deepspeed tensor parallel.") - output_dir = args.save_path + "/" + args.local_rank + "_" + args.world_size - autoround.save_quantized(output_dir, format=args.save_format) - else: - autoround.save_quantized(args.save_path, format=args.save_format) - print(f"Quantized model in {args.save_format} format is saved to {args.save_path}") diff --git a/examples/pytorch/nlp/huggingface_models/language-modeling/quantization/mix-precision/recipes/Meta-Llama-3.1-8B-Instruct_7bits.json b/examples/pytorch/nlp/huggingface_models/language-modeling/quantization/mix-precision/recipes/Meta-Llama-3.1-8B-Instruct_7bits.json deleted file mode 100644 index 49b4e3a56d6..00000000000 --- a/examples/pytorch/nlp/huggingface_models/language-modeling/quantization/mix-precision/recipes/Meta-Llama-3.1-8B-Instruct_7bits.json +++ /dev/null @@ -1,2242 +0,0 @@ -{ - "model.layers.0.self_attn.q_proj": { - "data_type": "mx_fp", - "act_data_type": "mx_fp_rceil", - "bits": 8, - "group_size": 32, - "sym": true, - "act_bits": 8, - "act_group_size": 32, - "act_sym": true - }, - "model.layers.0.self_attn.k_proj": { - "data_type": "mx_fp", - "act_data_type": "mx_fp_rceil", - "bits": 8, - "group_size": 32, - "sym": true, - "act_bits": 8, - "act_group_size": 32, - "act_sym": true - }, - "model.layers.0.self_attn.v_proj": { - "data_type": "mx_fp", - "act_data_type": "mx_fp_rceil", - "bits": 8, - "group_size": 32, - "sym": true, - "act_bits": 8, - "act_group_size": 32, - "act_sym": true - }, - "model.layers.0.self_attn.o_proj": { - "data_type": "mx_fp", - "act_data_type": "mx_fp_rceil", - "bits": 8, - "group_size": 32, - "sym": true, - "act_bits": 8, - "act_group_size": 32, - "act_sym": true - }, - "model.layers.0.mlp.gate_proj": { - 
"data_type": "mx_fp", - "act_data_type": "mx_fp_rceil", - "bits": 8, - "group_size": 32, - "sym": true, - "act_bits": 8, - "act_group_size": 32, - "act_sym": true - }, - "model.layers.0.mlp.up_proj": { - "data_type": "mx_fp", - "act_data_type": "mx_fp_rceil", - "bits": 8, - "group_size": 32, - "sym": true, - "act_bits": 8, - "act_group_size": 32, - "act_sym": true - }, - "model.layers.0.mlp.down_proj": { - "data_type": "mx_fp", - "act_data_type": "mx_fp_rceil", - "bits": 8, - "group_size": 32, - "sym": true, - "act_bits": 8, - "act_group_size": 32, - "act_sym": true - }, - "model.layers.1.self_attn.q_proj": { - "data_type": "mx_fp", - "act_data_type": "mx_fp_rceil", - "bits": 8, - "group_size": 32, - "sym": true, - "act_bits": 8, - "act_group_size": 32, - "act_sym": true - }, - "model.layers.1.self_attn.k_proj": { - "data_type": "mx_fp", - "act_data_type": "mx_fp_rceil", - "bits": 8, - "group_size": 32, - "sym": true, - "act_bits": 8, - "act_group_size": 32, - "act_sym": true - }, - "model.layers.1.self_attn.v_proj": { - "data_type": "mx_fp", - "act_data_type": "mx_fp_rceil", - "bits": 8, - "group_size": 32, - "sym": true, - "act_bits": 8, - "act_group_size": 32, - "act_sym": true - }, - "model.layers.1.self_attn.o_proj": { - "data_type": "mx_fp", - "act_data_type": "mx_fp_rceil", - "bits": 8, - "group_size": 32, - "sym": true, - "act_bits": 8, - "act_group_size": 32, - "act_sym": true - }, - "model.layers.1.mlp.gate_proj": { - "data_type": "mx_fp", - "act_data_type": "mx_fp_rceil", - "bits": 8, - "group_size": 32, - "sym": true, - "act_bits": 8, - "act_group_size": 32, - "act_sym": true - }, - "model.layers.1.mlp.up_proj": { - "data_type": "mx_fp", - "act_data_type": "mx_fp_rceil", - "bits": 8, - "group_size": 32, - "sym": true, - "act_bits": 8, - "act_group_size": 32, - "act_sym": true - }, - "model.layers.1.mlp.down_proj": { - "data_type": "mx_fp", - "act_data_type": "mx_fp_rceil", - "bits": 8, - "group_size": 32, - "sym": true, - "act_bits": 8, - "act_group_size": 32, - "act_sym": true - }, - "model.layers.2.self_attn.q_proj": { - "data_type": "mx_fp", - "act_data_type": "mx_fp_rceil", - "bits": 8, - "group_size": 32, - "sym": true, - "act_bits": 8, - "act_group_size": 32, - "act_sym": true - }, - "model.layers.2.self_attn.k_proj": { - "data_type": "mx_fp", - "act_data_type": "mx_fp_rceil", - "bits": 8, - "group_size": 32, - "sym": true, - "act_bits": 8, - "act_group_size": 32, - "act_sym": true - }, - "model.layers.2.self_attn.v_proj": { - "data_type": "mx_fp", - "act_data_type": "mx_fp_rceil", - "bits": 8, - "group_size": 32, - "sym": true, - "act_bits": 8, - "act_group_size": 32, - "act_sym": true - }, - "model.layers.2.self_attn.o_proj": { - "data_type": "mx_fp", - "act_data_type": "mx_fp_rceil", - "bits": 8, - "group_size": 32, - "sym": true, - "act_bits": 8, - "act_group_size": 32, - "act_sym": true - }, - "model.layers.2.mlp.gate_proj": { - "data_type": "mx_fp", - "act_data_type": "mx_fp_rceil", - "bits": 8, - "group_size": 32, - "sym": true, - "act_bits": 8, - "act_group_size": 32, - "act_sym": true - }, - "model.layers.2.mlp.up_proj": { - "data_type": "mx_fp", - "act_data_type": "mx_fp_rceil", - "bits": 8, - "group_size": 32, - "sym": true, - "act_bits": 8, - "act_group_size": 32, - "act_sym": true - }, - "model.layers.2.mlp.down_proj": { - "data_type": "mx_fp", - "act_data_type": "mx_fp_rceil", - "bits": 8, - "group_size": 32, - "sym": true, - "act_bits": 8, - "act_group_size": 32, - "act_sym": true - }, - "model.layers.3.self_attn.q_proj": { - "data_type": "mx_fp", - 
"act_data_type": "mx_fp_rceil", - "bits": 8, - "group_size": 32, - "sym": true, - "act_bits": 8, - "act_group_size": 32, - "act_sym": true - }, - "model.layers.3.self_attn.k_proj": { - "data_type": "mx_fp", - "act_data_type": "mx_fp_rceil", - "bits": 8, - "group_size": 32, - "sym": true, - "act_bits": 8, - "act_group_size": 32, - "act_sym": true - }, - "model.layers.3.self_attn.v_proj": { - "data_type": "mx_fp", - "act_data_type": "mx_fp_rceil", - "bits": 8, - "group_size": 32, - "sym": true, - "act_bits": 8, - "act_group_size": 32, - "act_sym": true - }, - "model.layers.3.self_attn.o_proj": { - "data_type": "mx_fp", - "act_data_type": "mx_fp_rceil", - "bits": 8, - "group_size": 32, - "sym": true, - "act_bits": 8, - "act_group_size": 32, - "act_sym": true - }, - "model.layers.3.mlp.gate_proj": { - "data_type": "mx_fp", - "act_data_type": "mx_fp_rceil", - "bits": 8, - "group_size": 32, - "sym": true, - "act_bits": 8, - "act_group_size": 32, - "act_sym": true - }, - "model.layers.3.mlp.up_proj": { - "data_type": "mx_fp", - "act_data_type": "mx_fp_rceil", - "bits": 8, - "group_size": 32, - "sym": true, - "act_bits": 8, - "act_group_size": 32, - "act_sym": true - }, - "model.layers.3.mlp.down_proj": { - "data_type": "mx_fp", - "act_data_type": "mx_fp_rceil", - "bits": 8, - "group_size": 32, - "sym": true, - "act_bits": 8, - "act_group_size": 32, - "act_sym": true - }, - "model.layers.4.self_attn.q_proj": { - "data_type": "mx_fp", - "act_data_type": "mx_fp_rceil", - "bits": 8, - "group_size": 32, - "sym": true, - "act_bits": 8, - "act_group_size": 32, - "act_sym": true - }, - "model.layers.4.self_attn.k_proj": { - "data_type": "mx_fp", - "act_data_type": "mx_fp_rceil", - "bits": 8, - "group_size": 32, - "sym": true, - "act_bits": 8, - "act_group_size": 32, - "act_sym": true - }, - "model.layers.4.self_attn.v_proj": { - "data_type": "mx_fp", - "act_data_type": "mx_fp_rceil", - "bits": 8, - "group_size": 32, - "sym": true, - "act_bits": 8, - "act_group_size": 32, - "act_sym": true - }, - "model.layers.4.self_attn.o_proj": { - "data_type": "mx_fp", - "act_data_type": "mx_fp_rceil", - "bits": 8, - "group_size": 32, - "sym": true, - "act_bits": 8, - "act_group_size": 32, - "act_sym": true - }, - "model.layers.4.mlp.gate_proj": { - "data_type": "mx_fp", - "act_data_type": "mx_fp_rceil", - "bits": 8, - "group_size": 32, - "sym": true, - "act_bits": 8, - "act_group_size": 32, - "act_sym": true - }, - "model.layers.4.mlp.up_proj": { - "data_type": "mx_fp", - "act_data_type": "mx_fp_rceil", - "bits": 8, - "group_size": 32, - "sym": true, - "act_bits": 8, - "act_group_size": 32, - "act_sym": true - }, - "model.layers.4.mlp.down_proj": { - "data_type": "mx_fp", - "act_data_type": "mx_fp_rceil", - "bits": 8, - "group_size": 32, - "sym": true, - "act_bits": 8, - "act_group_size": 32, - "act_sym": true - }, - "model.layers.5.self_attn.q_proj": { - "data_type": "mx_fp", - "act_data_type": "mx_fp_rceil", - "bits": 8, - "group_size": 32, - "sym": true, - "act_bits": 8, - "act_group_size": 32, - "act_sym": true - }, - "model.layers.5.self_attn.k_proj": { - "data_type": "mx_fp", - "act_data_type": "mx_fp_rceil", - "bits": 8, - "group_size": 32, - "sym": true, - "act_bits": 8, - "act_group_size": 32, - "act_sym": true - }, - "model.layers.5.self_attn.v_proj": { - "data_type": "mx_fp", - "act_data_type": "mx_fp_rceil", - "bits": 8, - "group_size": 32, - "sym": true, - "act_bits": 8, - "act_group_size": 32, - "act_sym": true - }, - "model.layers.5.self_attn.o_proj": { - "data_type": "mx_fp", - "act_data_type": 
"mx_fp_rceil", - "bits": 8, - "group_size": 32, - "sym": true, - "act_bits": 8, - "act_group_size": 32, - "act_sym": true - }, - "model.layers.5.mlp.gate_proj": { - "data_type": "mx_fp", - "act_data_type": "mx_fp_rceil", - "bits": 8, - "group_size": 32, - "sym": true, - "act_bits": 8, - "act_group_size": 32, - "act_sym": true - }, - "model.layers.5.mlp.up_proj": { - "data_type": "mx_fp", - "act_data_type": "mx_fp_rceil", - "bits": 8, - "group_size": 32, - "sym": true, - "act_bits": 8, - "act_group_size": 32, - "act_sym": true - }, - "model.layers.5.mlp.down_proj": { - "data_type": "mx_fp", - "act_data_type": "mx_fp_rceil", - "bits": 8, - "group_size": 32, - "sym": true, - "act_bits": 8, - "act_group_size": 32, - "act_sym": true - }, - "model.layers.6.self_attn.q_proj": { - "data_type": "mx_fp", - "act_data_type": "mx_fp_rceil", - "bits": 8, - "group_size": 32, - "sym": true, - "act_bits": 8, - "act_group_size": 32, - "act_sym": true - }, - "model.layers.6.self_attn.k_proj": { - "data_type": "mx_fp", - "act_data_type": "mx_fp_rceil", - "bits": 8, - "group_size": 32, - "sym": true, - "act_bits": 8, - "act_group_size": 32, - "act_sym": true - }, - "model.layers.6.self_attn.v_proj": { - "data_type": "mx_fp", - "act_data_type": "mx_fp_rceil", - "bits": 8, - "group_size": 32, - "sym": true, - "act_bits": 8, - "act_group_size": 32, - "act_sym": true - }, - "model.layers.6.self_attn.o_proj": { - "data_type": "mx_fp", - "act_data_type": "mx_fp_rceil", - "bits": 8, - "group_size": 32, - "sym": true, - "act_bits": 8, - "act_group_size": 32, - "act_sym": true - }, - "model.layers.6.mlp.gate_proj": { - "data_type": "mx_fp", - "act_data_type": "mx_fp_rceil", - "bits": 8, - "group_size": 32, - "sym": true, - "act_bits": 8, - "act_group_size": 32, - "act_sym": true - }, - "model.layers.6.mlp.up_proj": { - "data_type": "mx_fp", - "act_data_type": "mx_fp_rceil", - "bits": 8, - "group_size": 32, - "sym": true, - "act_bits": 8, - "act_group_size": 32, - "act_sym": true - }, - "model.layers.6.mlp.down_proj": { - "data_type": "mx_fp", - "act_data_type": "mx_fp_rceil", - "bits": 8, - "group_size": 32, - "sym": true, - "act_bits": 8, - "act_group_size": 32, - "act_sym": true - }, - "model.layers.7.self_attn.q_proj": { - "data_type": "mx_fp", - "act_data_type": "mx_fp_rceil", - "bits": 8, - "group_size": 32, - "sym": true, - "act_bits": 8, - "act_group_size": 32, - "act_sym": true - }, - "model.layers.7.self_attn.k_proj": { - "data_type": "mx_fp", - "act_data_type": "mx_fp_rceil", - "bits": 8, - "group_size": 32, - "sym": true, - "act_bits": 8, - "act_group_size": 32, - "act_sym": true - }, - "model.layers.7.self_attn.v_proj": { - "data_type": "mx_fp", - "act_data_type": "mx_fp_rceil", - "bits": 8, - "group_size": 32, - "sym": true, - "act_bits": 8, - "act_group_size": 32, - "act_sym": true - }, - "model.layers.7.self_attn.o_proj": { - "data_type": "mx_fp", - "act_data_type": "mx_fp_rceil", - "bits": 8, - "group_size": 32, - "sym": true, - "act_bits": 8, - "act_group_size": 32, - "act_sym": true - }, - "model.layers.7.mlp.gate_proj": { - "data_type": "mx_fp", - "act_data_type": "mx_fp_rceil", - "bits": 8, - "group_size": 32, - "sym": true, - "act_bits": 8, - "act_group_size": 32, - "act_sym": true - }, - "model.layers.7.mlp.up_proj": { - "data_type": "mx_fp", - "act_data_type": "mx_fp_rceil", - "bits": 8, - "group_size": 32, - "sym": true, - "act_bits": 8, - "act_group_size": 32, - "act_sym": true - }, - "model.layers.7.mlp.down_proj": { - "data_type": "mx_fp", - "act_data_type": "mx_fp_rceil", - "bits": 8, - 
"group_size": 32, - "sym": true, - "act_bits": 8, - "act_group_size": 32, - "act_sym": true - }, - "model.layers.8.self_attn.q_proj": { - "data_type": "mx_fp", - "act_data_type": "mx_fp_rceil", - "bits": 8, - "group_size": 32, - "sym": true, - "act_bits": 8, - "act_group_size": 32, - "act_sym": true - }, - "model.layers.8.self_attn.k_proj": { - "data_type": "mx_fp", - "act_data_type": "mx_fp_rceil", - "bits": 8, - "group_size": 32, - "sym": true, - "act_bits": 8, - "act_group_size": 32, - "act_sym": true - }, - "model.layers.8.self_attn.v_proj": { - "data_type": "mx_fp", - "act_data_type": "mx_fp_rceil", - "bits": 8, - "group_size": 32, - "sym": true, - "act_bits": 8, - "act_group_size": 32, - "act_sym": true - }, - "model.layers.8.self_attn.o_proj": { - "data_type": "mx_fp", - "act_data_type": "mx_fp_rceil", - "bits": 8, - "group_size": 32, - "sym": true, - "act_bits": 8, - "act_group_size": 32, - "act_sym": true - }, - "model.layers.8.mlp.gate_proj": { - "data_type": "mx_fp", - "act_data_type": "mx_fp_rceil", - "bits": 8, - "group_size": 32, - "sym": true, - "act_bits": 8, - "act_group_size": 32, - "act_sym": true - }, - "model.layers.8.mlp.up_proj": { - "data_type": "mx_fp", - "act_data_type": "mx_fp_rceil", - "bits": 8, - "group_size": 32, - "sym": true, - "act_bits": 8, - "act_group_size": 32, - "act_sym": true - }, - "model.layers.8.mlp.down_proj": { - "data_type": "mx_fp", - "act_data_type": "mx_fp_rceil", - "bits": 8, - "group_size": 32, - "sym": true, - "act_bits": 8, - "act_group_size": 32, - "act_sym": true - }, - "model.layers.9.self_attn.q_proj": { - "data_type": "mx_fp", - "act_data_type": "mx_fp_rceil", - "bits": 8, - "group_size": 32, - "sym": true, - "act_bits": 8, - "act_group_size": 32, - "act_sym": true - }, - "model.layers.9.self_attn.k_proj": { - "data_type": "mx_fp", - "act_data_type": "mx_fp_rceil", - "bits": 8, - "group_size": 32, - "sym": true, - "act_bits": 8, - "act_group_size": 32, - "act_sym": true - }, - "model.layers.9.self_attn.v_proj": { - "data_type": "mx_fp", - "act_data_type": "mx_fp_rceil", - "bits": 8, - "group_size": 32, - "sym": true, - "act_bits": 8, - "act_group_size": 32, - "act_sym": true - }, - "model.layers.9.self_attn.o_proj": { - "data_type": "mx_fp", - "act_data_type": "mx_fp_rceil", - "bits": 8, - "group_size": 32, - "sym": true, - "act_bits": 8, - "act_group_size": 32, - "act_sym": true - }, - "model.layers.9.mlp.gate_proj": { - "data_type": "mx_fp", - "act_data_type": "mx_fp_rceil", - "bits": 8, - "group_size": 32, - "sym": true, - "act_bits": 8, - "act_group_size": 32, - "act_sym": true - }, - "model.layers.9.mlp.up_proj": { - "data_type": "mx_fp", - "act_data_type": "mx_fp_rceil", - "bits": 8, - "group_size": 32, - "sym": true, - "act_bits": 8, - "act_group_size": 32, - "act_sym": true - }, - "model.layers.9.mlp.down_proj": { - "data_type": "mx_fp", - "act_data_type": "mx_fp_rceil", - "bits": 8, - "group_size": 32, - "sym": true, - "act_bits": 8, - "act_group_size": 32, - "act_sym": true - }, - "model.layers.10.self_attn.q_proj": { - "data_type": "mx_fp", - "act_data_type": "mx_fp_rceil", - "bits": 8, - "group_size": 32, - "sym": true, - "act_bits": 8, - "act_group_size": 32, - "act_sym": true - }, - "model.layers.10.self_attn.k_proj": { - "data_type": "mx_fp", - "act_data_type": "mx_fp_rceil", - "bits": 8, - "group_size": 32, - "sym": true, - "act_bits": 8, - "act_group_size": 32, - "act_sym": true - }, - "model.layers.10.self_attn.v_proj": { - "data_type": "mx_fp", - "act_data_type": "mx_fp_rceil", - "bits": 8, - "group_size": 32, - 
"sym": true, - "act_bits": 8, - "act_group_size": 32, - "act_sym": true - }, - "model.layers.10.self_attn.o_proj": { - "data_type": "mx_fp", - "act_data_type": "mx_fp_rceil", - "bits": 8, - "group_size": 32, - "sym": true, - "act_bits": 8, - "act_group_size": 32, - "act_sym": true - }, - "model.layers.10.mlp.gate_proj": { - "data_type": "mx_fp", - "act_data_type": "mx_fp_rceil", - "bits": 8, - "group_size": 32, - "sym": true, - "act_bits": 8, - "act_group_size": 32, - "act_sym": true - }, - "model.layers.10.mlp.up_proj": { - "data_type": "mx_fp", - "act_data_type": "mx_fp_rceil", - "bits": 8, - "group_size": 32, - "sym": true, - "act_bits": 8, - "act_group_size": 32, - "act_sym": true - }, - "model.layers.10.mlp.down_proj": { - "data_type": "mx_fp", - "act_data_type": "mx_fp_rceil", - "bits": 8, - "group_size": 32, - "sym": true, - "act_bits": 8, - "act_group_size": 32, - "act_sym": true - }, - "model.layers.11.self_attn.q_proj": { - "data_type": "mx_fp", - "act_data_type": "mx_fp_rceil", - "bits": 8, - "group_size": 32, - "sym": true, - "act_bits": 8, - "act_group_size": 32, - "act_sym": true - }, - "model.layers.11.self_attn.k_proj": { - "data_type": "mx_fp", - "act_data_type": "mx_fp_rceil", - "bits": 8, - "group_size": 32, - "sym": true, - "act_bits": 8, - "act_group_size": 32, - "act_sym": true - }, - "model.layers.11.self_attn.v_proj": { - "data_type": "mx_fp", - "act_data_type": "mx_fp_rceil", - "bits": 8, - "group_size": 32, - "sym": true, - "act_bits": 8, - "act_group_size": 32, - "act_sym": true - }, - "model.layers.11.self_attn.o_proj": { - "data_type": "mx_fp", - "act_data_type": "mx_fp_rceil", - "bits": 8, - "group_size": 32, - "sym": true, - "act_bits": 8, - "act_group_size": 32, - "act_sym": true - }, - "model.layers.11.mlp.gate_proj": { - "data_type": "mx_fp", - "act_data_type": "mx_fp_rceil", - "bits": 8, - "group_size": 32, - "sym": true, - "act_bits": 8, - "act_group_size": 32, - "act_sym": true - }, - "model.layers.11.mlp.up_proj": { - "data_type": "mx_fp", - "act_data_type": "mx_fp_rceil", - "bits": 8, - "group_size": 32, - "sym": true, - "act_bits": 8, - "act_group_size": 32, - "act_sym": true - }, - "model.layers.11.mlp.down_proj": { - "data_type": "mx_fp", - "act_data_type": "mx_fp_rceil", - "bits": 8, - "group_size": 32, - "sym": true, - "act_bits": 8, - "act_group_size": 32, - "act_sym": true - }, - "model.layers.12.self_attn.q_proj": { - "data_type": "mx_fp", - "act_data_type": "mx_fp_rceil", - "bits": 8, - "group_size": 32, - "sym": true, - "act_bits": 8, - "act_group_size": 32, - "act_sym": true - }, - "model.layers.12.self_attn.k_proj": { - "data_type": "mx_fp", - "act_data_type": "mx_fp_rceil", - "bits": 8, - "group_size": 32, - "sym": true, - "act_bits": 8, - "act_group_size": 32, - "act_sym": true - }, - "model.layers.12.self_attn.v_proj": { - "data_type": "mx_fp", - "act_data_type": "mx_fp_rceil", - "bits": 8, - "group_size": 32, - "sym": true, - "act_bits": 8, - "act_group_size": 32, - "act_sym": true - }, - "model.layers.12.self_attn.o_proj": { - "data_type": "mx_fp", - "act_data_type": "mx_fp_rceil", - "bits": 8, - "group_size": 32, - "sym": true, - "act_bits": 8, - "act_group_size": 32, - "act_sym": true - }, - "model.layers.12.mlp.gate_proj": { - "data_type": "mx_fp", - "act_data_type": "mx_fp_rceil", - "bits": 8, - "group_size": 32, - "sym": true, - "act_bits": 8, - "act_group_size": 32, - "act_sym": true - }, - "model.layers.12.mlp.up_proj": { - "data_type": "mx_fp", - "act_data_type": "mx_fp_rceil", - "bits": 8, - "group_size": 32, - "sym": true, - 
"act_bits": 8, - "act_group_size": 32, - "act_sym": true - }, - "model.layers.12.mlp.down_proj": { - "data_type": "mx_fp", - "act_data_type": "mx_fp_rceil", - "bits": 8, - "group_size": 32, - "sym": true, - "act_bits": 8, - "act_group_size": 32, - "act_sym": true - }, - "model.layers.13.self_attn.q_proj": { - "data_type": "mx_fp", - "act_data_type": "mx_fp_rceil", - "bits": 8, - "group_size": 32, - "sym": true, - "act_bits": 8, - "act_group_size": 32, - "act_sym": true - }, - "model.layers.13.self_attn.k_proj": { - "data_type": "mx_fp", - "act_data_type": "mx_fp_rceil", - "bits": 8, - "group_size": 32, - "sym": true, - "act_bits": 8, - "act_group_size": 32, - "act_sym": true - }, - "model.layers.13.self_attn.v_proj": { - "data_type": "mx_fp", - "act_data_type": "mx_fp_rceil", - "bits": 8, - "group_size": 32, - "sym": true, - "act_bits": 8, - "act_group_size": 32, - "act_sym": true - }, - "model.layers.13.self_attn.o_proj": { - "data_type": "mx_fp", - "act_data_type": "mx_fp_rceil", - "bits": 8, - "group_size": 32, - "sym": true, - "act_bits": 8, - "act_group_size": 32, - "act_sym": true - }, - "model.layers.13.mlp.gate_proj": { - "data_type": "mx_fp", - "act_data_type": "mx_fp_rceil", - "bits": 8, - "group_size": 32, - "sym": true, - "act_bits": 8, - "act_group_size": 32, - "act_sym": true - }, - "model.layers.13.mlp.up_proj": { - "data_type": "mx_fp", - "act_data_type": "mx_fp_rceil", - "bits": 8, - "group_size": 32, - "sym": true, - "act_bits": 8, - "act_group_size": 32, - "act_sym": true - }, - "model.layers.13.mlp.down_proj": { - "data_type": "mx_fp", - "act_data_type": "mx_fp_rceil", - "bits": 8, - "group_size": 32, - "sym": true, - "act_bits": 8, - "act_group_size": 32, - "act_sym": true - }, - "model.layers.14.self_attn.q_proj": { - "data_type": "mx_fp", - "act_data_type": "mx_fp_rceil", - "bits": 8, - "group_size": 32, - "sym": true, - "act_bits": 8, - "act_group_size": 32, - "act_sym": true - }, - "model.layers.14.self_attn.k_proj": { - "data_type": "mx_fp", - "act_data_type": "mx_fp_rceil", - "bits": 8, - "group_size": 32, - "sym": true, - "act_bits": 8, - "act_group_size": 32, - "act_sym": true - }, - "model.layers.14.self_attn.v_proj": { - "data_type": "mx_fp", - "act_data_type": "mx_fp_rceil", - "bits": 8, - "group_size": 32, - "sym": true, - "act_bits": 8, - "act_group_size": 32, - "act_sym": true - }, - "model.layers.14.self_attn.o_proj": { - "data_type": "mx_fp", - "act_data_type": "mx_fp_rceil", - "bits": 8, - "group_size": 32, - "sym": true, - "act_bits": 8, - "act_group_size": 32, - "act_sym": true - }, - "model.layers.14.mlp.gate_proj": { - "data_type": "mx_fp", - "act_data_type": "mx_fp_rceil", - "bits": 8, - "group_size": 32, - "sym": true, - "act_bits": 8, - "act_group_size": 32, - "act_sym": true - }, - "model.layers.14.mlp.up_proj": { - "data_type": "mx_fp", - "act_data_type": "mx_fp_rceil", - "bits": 8, - "group_size": 32, - "sym": true, - "act_bits": 8, - "act_group_size": 32, - "act_sym": true - }, - "model.layers.14.mlp.down_proj": { - "data_type": "mx_fp", - "act_data_type": "mx_fp_rceil", - "bits": 8, - "group_size": 32, - "sym": true, - "act_bits": 8, - "act_group_size": 32, - "act_sym": true - }, - "model.layers.15.self_attn.q_proj": { - "data_type": "mx_fp", - "act_data_type": "mx_fp_rceil", - "bits": 8, - "group_size": 32, - "sym": true, - "act_bits": 8, - "act_group_size": 32, - "act_sym": true - }, - "model.layers.15.self_attn.k_proj": { - "data_type": "mx_fp", - "act_data_type": "mx_fp_rceil", - "bits": 8, - "group_size": 32, - "sym": true, - 
"act_bits": 8, - "act_group_size": 32, - "act_sym": true - }, - "model.layers.15.self_attn.v_proj": { - "data_type": "mx_fp", - "act_data_type": "mx_fp_rceil", - "bits": 8, - "group_size": 32, - "sym": true, - "act_bits": 8, - "act_group_size": 32, - "act_sym": true - }, - "model.layers.15.self_attn.o_proj": { - "data_type": "mx_fp", - "act_data_type": "mx_fp_rceil", - "bits": 8, - "group_size": 32, - "sym": true, - "act_bits": 8, - "act_group_size": 32, - "act_sym": true - }, - "model.layers.15.mlp.gate_proj": { - "data_type": "mx_fp", - "act_data_type": "mx_fp_rceil", - "bits": 8, - "group_size": 32, - "sym": true, - "act_bits": 8, - "act_group_size": 32, - "act_sym": true - }, - "model.layers.15.mlp.up_proj": { - "data_type": "mx_fp", - "act_data_type": "mx_fp_rceil", - "bits": 8, - "group_size": 32, - "sym": true, - "act_bits": 8, - "act_group_size": 32, - "act_sym": true - }, - "model.layers.15.mlp.down_proj": { - "data_type": "mx_fp", - "act_data_type": "mx_fp_rceil", - "bits": 8, - "group_size": 32, - "sym": true, - "act_bits": 8, - "act_group_size": 32, - "act_sym": true - }, - "model.layers.16.self_attn.q_proj": { - "data_type": "mx_fp", - "act_data_type": "mx_fp_rceil", - "bits": 8, - "group_size": 32, - "sym": true, - "act_bits": 8, - "act_group_size": 32, - "act_sym": true - }, - "model.layers.16.self_attn.k_proj": { - "data_type": "mx_fp", - "act_data_type": "mx_fp_rceil", - "bits": 8, - "group_size": 32, - "sym": true, - "act_bits": 8, - "act_group_size": 32, - "act_sym": true - }, - "model.layers.16.self_attn.v_proj": { - "data_type": "mx_fp", - "act_data_type": "mx_fp_rceil", - "bits": 8, - "group_size": 32, - "sym": true, - "act_bits": 8, - "act_group_size": 32, - "act_sym": true - }, - "model.layers.16.self_attn.o_proj": { - "data_type": "mx_fp", - "act_data_type": "mx_fp_rceil", - "bits": 8, - "group_size": 32, - "sym": true, - "act_bits": 8, - "act_group_size": 32, - "act_sym": true - }, - "model.layers.16.mlp.gate_proj": { - "data_type": "mx_fp", - "act_data_type": "mx_fp_rceil", - "bits": 8, - "group_size": 32, - "sym": true, - "act_bits": 8, - "act_group_size": 32, - "act_sym": true - }, - "model.layers.16.mlp.up_proj": { - "data_type": "mx_fp", - "act_data_type": "mx_fp_rceil", - "bits": 8, - "group_size": 32, - "sym": true, - "act_bits": 8, - "act_group_size": 32, - "act_sym": true - }, - "model.layers.16.mlp.down_proj": { - "data_type": "mx_fp", - "act_data_type": "mx_fp_rceil", - "bits": 8, - "group_size": 32, - "sym": true, - "act_bits": 8, - "act_group_size": 32, - "act_sym": true - }, - "model.layers.17.self_attn.q_proj": { - "data_type": "mx_fp", - "act_data_type": "mx_fp_rceil", - "bits": 8, - "group_size": 32, - "sym": true, - "act_bits": 8, - "act_group_size": 32, - "act_sym": true - }, - "model.layers.17.self_attn.k_proj": { - "data_type": "mx_fp", - "act_data_type": "mx_fp_rceil", - "bits": 8, - "group_size": 32, - "sym": true, - "act_bits": 8, - "act_group_size": 32, - "act_sym": true - }, - "model.layers.17.self_attn.v_proj": { - "data_type": "mx_fp", - "act_data_type": "mx_fp_rceil", - "bits": 8, - "group_size": 32, - "sym": true, - "act_bits": 8, - "act_group_size": 32, - "act_sym": true - }, - "model.layers.17.self_attn.o_proj": { - "data_type": "mx_fp", - "act_data_type": "mx_fp_rceil", - "bits": 8, - "group_size": 32, - "sym": true, - "act_bits": 8, - "act_group_size": 32, - "act_sym": true - }, - "model.layers.17.mlp.gate_proj": { - "data_type": "mx_fp", - "act_data_type": "mx_fp_rceil", - "bits": 8, - "group_size": 32, - "sym": true, - 
"act_bits": 8, - "act_group_size": 32, - "act_sym": true - }, - "model.layers.17.mlp.up_proj": { - "data_type": "mx_fp", - "act_data_type": "mx_fp_rceil", - "bits": 8, - "group_size": 32, - "sym": true, - "act_bits": 8, - "act_group_size": 32, - "act_sym": true - }, - "model.layers.17.mlp.down_proj": { - "data_type": "mx_fp", - "act_data_type": "mx_fp_rceil", - "bits": 8, - "group_size": 32, - "sym": true, - "act_bits": 8, - "act_group_size": 32, - "act_sym": true - }, - "model.layers.18.self_attn.q_proj": { - "data_type": "mx_fp", - "act_data_type": "mx_fp_rceil", - "bits": 8, - "group_size": 32, - "sym": true, - "act_bits": 8, - "act_group_size": 32, - "act_sym": true - }, - "model.layers.18.self_attn.k_proj": { - "data_type": "mx_fp", - "act_data_type": "mx_fp_rceil", - "bits": 8, - "group_size": 32, - "sym": true, - "act_bits": 8, - "act_group_size": 32, - "act_sym": true - }, - "model.layers.18.self_attn.v_proj": { - "data_type": "mx_fp", - "act_data_type": "mx_fp_rceil", - "bits": 8, - "group_size": 32, - "sym": true, - "act_bits": 8, - "act_group_size": 32, - "act_sym": true - }, - "model.layers.18.self_attn.o_proj": { - "data_type": "mx_fp", - "act_data_type": "mx_fp_rceil", - "bits": 8, - "group_size": 32, - "sym": true, - "act_bits": 8, - "act_group_size": 32, - "act_sym": true - }, - "model.layers.18.mlp.gate_proj": { - "data_type": "mx_fp", - "act_data_type": "mx_fp_rceil", - "bits": 8, - "group_size": 32, - "sym": true, - "act_bits": 8, - "act_group_size": 32, - "act_sym": true - }, - "model.layers.18.mlp.up_proj": { - "data_type": "mx_fp", - "act_data_type": "mx_fp_rceil", - "bits": 8, - "group_size": 32, - "sym": true, - "act_bits": 8, - "act_group_size": 32, - "act_sym": true - }, - "model.layers.18.mlp.down_proj": { - "data_type": "mx_fp", - "act_data_type": "mx_fp_rceil", - "bits": 8, - "group_size": 32, - "sym": true, - "act_bits": 8, - "act_group_size": 32, - "act_sym": true - }, - "model.layers.19.self_attn.q_proj": { - "data_type": "mx_fp", - "act_data_type": "mx_fp_rceil", - "bits": 8, - "group_size": 32, - "sym": true, - "act_bits": 8, - "act_group_size": 32, - "act_sym": true - }, - "model.layers.19.self_attn.k_proj": { - "data_type": "mx_fp", - "act_data_type": "mx_fp_rceil", - "bits": 8, - "group_size": 32, - "sym": true, - "act_bits": 8, - "act_group_size": 32, - "act_sym": true - }, - "model.layers.19.self_attn.v_proj": { - "data_type": "mx_fp", - "act_data_type": "mx_fp_rceil", - "bits": 8, - "group_size": 32, - "sym": true, - "act_bits": 8, - "act_group_size": 32, - "act_sym": true - }, - "model.layers.19.self_attn.o_proj": { - "data_type": "mx_fp", - "act_data_type": "mx_fp_rceil", - "bits": 8, - "group_size": 32, - "sym": true, - "act_bits": 8, - "act_group_size": 32, - "act_sym": true - }, - "model.layers.19.mlp.gate_proj": { - "data_type": "mx_fp", - "act_data_type": "mx_fp_rceil", - "bits": 8, - "group_size": 32, - "sym": true, - "act_bits": 8, - "act_group_size": 32, - "act_sym": true - }, - "model.layers.19.mlp.up_proj": { - "data_type": "mx_fp", - "act_data_type": "mx_fp_rceil", - "bits": 8, - "group_size": 32, - "sym": true, - "act_bits": 8, - "act_group_size": 32, - "act_sym": true - }, - "model.layers.19.mlp.down_proj": { - "data_type": "mx_fp", - "act_data_type": "mx_fp_rceil", - "bits": 8, - "group_size": 32, - "sym": true, - "act_bits": 8, - "act_group_size": 32, - "act_sym": true - }, - "model.layers.20.self_attn.q_proj": { - "data_type": "mx_fp", - "act_data_type": "mx_fp_rceil", - "bits": 8, - "group_size": 32, - "sym": true, - "act_bits": 8, 
- "act_group_size": 32, - "act_sym": true - }, - "model.layers.20.self_attn.k_proj": { - "data_type": "mx_fp", - "act_data_type": "mx_fp_rceil", - "bits": 8, - "group_size": 32, - "sym": true, - "act_bits": 8, - "act_group_size": 32, - "act_sym": true - }, - "model.layers.20.self_attn.v_proj": { - "data_type": "mx_fp", - "act_data_type": "mx_fp_rceil", - "bits": 8, - "group_size": 32, - "sym": true, - "act_bits": 8, - "act_group_size": 32, - "act_sym": true - }, - "model.layers.20.self_attn.o_proj": { - "data_type": "mx_fp", - "act_data_type": "mx_fp_rceil", - "bits": 8, - "group_size": 32, - "sym": true, - "act_bits": 8, - "act_group_size": 32, - "act_sym": true - }, - "model.layers.20.mlp.gate_proj": { - "data_type": "mx_fp", - "act_data_type": "mx_fp_rceil", - "bits": 8, - "group_size": 32, - "sym": true, - "act_bits": 8, - "act_group_size": 32, - "act_sym": true - }, - "model.layers.20.mlp.up_proj": { - "data_type": "mx_fp", - "act_data_type": "mx_fp_rceil", - "bits": 8, - "group_size": 32, - "sym": true, - "act_bits": 8, - "act_group_size": 32, - "act_sym": true - }, - "model.layers.20.mlp.down_proj": { - "data_type": "mx_fp", - "act_data_type": "mx_fp_rceil", - "bits": 8, - "group_size": 32, - "sym": true, - "act_bits": 8, - "act_group_size": 32, - "act_sym": true - }, - "model.layers.21.self_attn.q_proj": { - "data_type": "mx_fp", - "act_data_type": "mx_fp_rceil", - "bits": 8, - "group_size": 32, - "sym": true, - "act_bits": 8, - "act_group_size": 32, - "act_sym": true - }, - "model.layers.21.self_attn.k_proj": { - "data_type": "mx_fp", - "act_data_type": "mx_fp_rceil", - "bits": 8, - "group_size": 32, - "sym": true, - "act_bits": 8, - "act_group_size": 32, - "act_sym": true - }, - "model.layers.21.self_attn.v_proj": { - "data_type": "mx_fp", - "act_data_type": "mx_fp_rceil", - "bits": 8, - "group_size": 32, - "sym": true, - "act_bits": 8, - "act_group_size": 32, - "act_sym": true - }, - "model.layers.21.self_attn.o_proj": { - "data_type": "mx_fp", - "act_data_type": "mx_fp_rceil", - "bits": 8, - "group_size": 32, - "sym": true, - "act_bits": 8, - "act_group_size": 32, - "act_sym": true - }, - "model.layers.21.mlp.gate_proj": { - "data_type": "mx_fp", - "act_data_type": "mx_fp_rceil", - "bits": 8, - "group_size": 32, - "sym": true, - "act_bits": 8, - "act_group_size": 32, - "act_sym": true - }, - "model.layers.21.mlp.up_proj": { - "data_type": "mx_fp", - "act_data_type": "mx_fp_rceil", - "bits": 8, - "group_size": 32, - "sym": true, - "act_bits": 8, - "act_group_size": 32, - "act_sym": true - }, - "model.layers.21.mlp.down_proj": { - "data_type": "mx_fp", - "act_data_type": "mx_fp_rceil", - "bits": 8, - "group_size": 32, - "sym": true, - "act_bits": 8, - "act_group_size": 32, - "act_sym": true - }, - "model.layers.22.self_attn.q_proj": { - "data_type": "mx_fp", - "act_data_type": "mx_fp_rceil", - "bits": 8, - "group_size": 32, - "sym": true, - "act_bits": 8, - "act_group_size": 32, - "act_sym": true - }, - "model.layers.22.self_attn.k_proj": { - "data_type": "mx_fp", - "act_data_type": "mx_fp_rceil", - "bits": 8, - "group_size": 32, - "sym": true, - "act_bits": 8, - "act_group_size": 32, - "act_sym": true - }, - "model.layers.22.self_attn.v_proj": { - "data_type": "mx_fp", - "act_data_type": "mx_fp_rceil", - "bits": 8, - "group_size": 32, - "sym": true, - "act_bits": 8, - "act_group_size": 32, - "act_sym": true - }, - "model.layers.22.self_attn.o_proj": { - "data_type": "mx_fp", - "act_data_type": "mx_fp_rceil", - "bits": 8, - "group_size": 32, - "sym": true, - "act_bits": 8, - 
"act_group_size": 32, - "act_sym": true - }, - "model.layers.22.mlp.gate_proj": { - "data_type": "mx_fp", - "act_data_type": "mx_fp_rceil", - "bits": 8, - "group_size": 32, - "sym": true, - "act_bits": 8, - "act_group_size": 32, - "act_sym": true - }, - "model.layers.22.mlp.up_proj": { - "data_type": "mx_fp", - "act_data_type": "mx_fp_rceil", - "bits": 8, - "group_size": 32, - "sym": true, - "act_bits": 8, - "act_group_size": 32, - "act_sym": true - }, - "model.layers.22.mlp.down_proj": { - "data_type": "mx_fp", - "act_data_type": "mx_fp_rceil", - "bits": 8, - "group_size": 32, - "sym": true, - "act_bits": 8, - "act_group_size": 32, - "act_sym": true - }, - "model.layers.23.self_attn.q_proj": { - "data_type": "mx_fp", - "act_data_type": "mx_fp_rceil", - "bits": 4, - "group_size": 32, - "sym": true, - "act_bits": 4, - "act_group_size": 32, - "act_sym": true - }, - "model.layers.23.self_attn.k_proj": { - "data_type": "mx_fp", - "act_data_type": "mx_fp_rceil", - "bits": 8, - "group_size": 32, - "sym": true, - "act_bits": 8, - "act_group_size": 32, - "act_sym": true - }, - "model.layers.23.self_attn.v_proj": { - "data_type": "mx_fp", - "act_data_type": "mx_fp_rceil", - "bits": 8, - "group_size": 32, - "sym": true, - "act_bits": 8, - "act_group_size": 32, - "act_sym": true - }, - "model.layers.23.self_attn.o_proj": { - "data_type": "mx_fp", - "act_data_type": "mx_fp_rceil", - "bits": 4, - "group_size": 32, - "sym": true, - "act_bits": 4, - "act_group_size": 32, - "act_sym": true - }, - "model.layers.23.mlp.gate_proj": { - "data_type": "mx_fp", - "act_data_type": "mx_fp_rceil", - "bits": 8, - "group_size": 32, - "sym": true, - "act_bits": 8, - "act_group_size": 32, - "act_sym": true - }, - "model.layers.23.mlp.up_proj": { - "data_type": "mx_fp", - "act_data_type": "mx_fp_rceil", - "bits": 8, - "group_size": 32, - "sym": true, - "act_bits": 8, - "act_group_size": 32, - "act_sym": true - }, - "model.layers.23.mlp.down_proj": { - "data_type": "mx_fp", - "act_data_type": "mx_fp_rceil", - "bits": 8, - "group_size": 32, - "sym": true, - "act_bits": 8, - "act_group_size": 32, - "act_sym": true - }, - "model.layers.24.self_attn.q_proj": { - "data_type": "mx_fp", - "act_data_type": "mx_fp_rceil", - "bits": 8, - "group_size": 32, - "sym": true, - "act_bits": 8, - "act_group_size": 32, - "act_sym": true - }, - "model.layers.24.self_attn.k_proj": { - "data_type": "mx_fp", - "act_data_type": "mx_fp_rceil", - "bits": 8, - "group_size": 32, - "sym": true, - "act_bits": 8, - "act_group_size": 32, - "act_sym": true - }, - "model.layers.24.self_attn.v_proj": { - "data_type": "mx_fp", - "act_data_type": "mx_fp_rceil", - "bits": 8, - "group_size": 32, - "sym": true, - "act_bits": 8, - "act_group_size": 32, - "act_sym": true - }, - "model.layers.24.self_attn.o_proj": { - "data_type": "mx_fp", - "act_data_type": "mx_fp_rceil", - "bits": 8, - "group_size": 32, - "sym": true, - "act_bits": 8, - "act_group_size": 32, - "act_sym": true - }, - "model.layers.24.mlp.gate_proj": { - "data_type": "mx_fp", - "act_data_type": "mx_fp_rceil", - "bits": 8, - "group_size": 32, - "sym": true, - "act_bits": 8, - "act_group_size": 32, - "act_sym": true - }, - "model.layers.24.mlp.up_proj": { - "data_type": "mx_fp", - "act_data_type": "mx_fp_rceil", - "bits": 8, - "group_size": 32, - "sym": true, - "act_bits": 8, - "act_group_size": 32, - "act_sym": true - }, - "model.layers.24.mlp.down_proj": { - "data_type": "mx_fp", - "act_data_type": "mx_fp_rceil", - "bits": 8, - "group_size": 32, - "sym": true, - "act_bits": 8, - "act_group_size": 
32, - "act_sym": true - }, - "model.layers.25.self_attn.q_proj": { - "data_type": "mx_fp", - "act_data_type": "mx_fp_rceil", - "bits": 8, - "group_size": 32, - "sym": true, - "act_bits": 8, - "act_group_size": 32, - "act_sym": true - }, - "model.layers.25.self_attn.k_proj": { - "data_type": "mx_fp", - "act_data_type": "mx_fp_rceil", - "bits": 8, - "group_size": 32, - "sym": true, - "act_bits": 8, - "act_group_size": 32, - "act_sym": true - }, - "model.layers.25.self_attn.v_proj": { - "data_type": "mx_fp", - "act_data_type": "mx_fp_rceil", - "bits": 8, - "group_size": 32, - "sym": true, - "act_bits": 8, - "act_group_size": 32, - "act_sym": true - }, - "model.layers.25.self_attn.o_proj": { - "data_type": "mx_fp", - "act_data_type": "mx_fp_rceil", - "bits": 8, - "group_size": 32, - "sym": true, - "act_bits": 8, - "act_group_size": 32, - "act_sym": true - }, - "model.layers.25.mlp.gate_proj": { - "data_type": "mx_fp", - "act_data_type": "mx_fp_rceil", - "bits": 4, - "group_size": 32, - "sym": true, - "act_bits": 4, - "act_group_size": 32, - "act_sym": true - }, - "model.layers.25.mlp.up_proj": { - "data_type": "mx_fp", - "act_data_type": "mx_fp_rceil", - "bits": 8, - "group_size": 32, - "sym": true, - "act_bits": 8, - "act_group_size": 32, - "act_sym": true - }, - "model.layers.25.mlp.down_proj": { - "data_type": "mx_fp", - "act_data_type": "mx_fp_rceil", - "bits": 8, - "group_size": 32, - "sym": true, - "act_bits": 8, - "act_group_size": 32, - "act_sym": true - }, - "model.layers.26.self_attn.q_proj": { - "data_type": "mx_fp", - "act_data_type": "mx_fp_rceil", - "bits": 8, - "group_size": 32, - "sym": true, - "act_bits": 8, - "act_group_size": 32, - "act_sym": true - }, - "model.layers.26.self_attn.k_proj": { - "data_type": "mx_fp", - "act_data_type": "mx_fp_rceil", - "bits": 8, - "group_size": 32, - "sym": true, - "act_bits": 8, - "act_group_size": 32, - "act_sym": true - }, - "model.layers.26.self_attn.v_proj": { - "data_type": "mx_fp", - "act_data_type": "mx_fp_rceil", - "bits": 8, - "group_size": 32, - "sym": true, - "act_bits": 8, - "act_group_size": 32, - "act_sym": true - }, - "model.layers.26.self_attn.o_proj": { - "data_type": "mx_fp", - "act_data_type": "mx_fp_rceil", - "bits": 8, - "group_size": 32, - "sym": true, - "act_bits": 8, - "act_group_size": 32, - "act_sym": true - }, - "model.layers.26.mlp.gate_proj": { - "data_type": "mx_fp", - "act_data_type": "mx_fp_rceil", - "bits": 4, - "group_size": 32, - "sym": true, - "act_bits": 4, - "act_group_size": 32, - "act_sym": true - }, - "model.layers.26.mlp.up_proj": { - "data_type": "mx_fp", - "act_data_type": "mx_fp_rceil", - "bits": 4, - "group_size": 32, - "sym": true, - "act_bits": 4, - "act_group_size": 32, - "act_sym": true - }, - "model.layers.26.mlp.down_proj": { - "data_type": "mx_fp", - "act_data_type": "mx_fp_rceil", - "bits": 8, - "group_size": 32, - "sym": true, - "act_bits": 8, - "act_group_size": 32, - "act_sym": true - }, - "model.layers.27.self_attn.q_proj": { - "data_type": "mx_fp", - "act_data_type": "mx_fp_rceil", - "bits": 8, - "group_size": 32, - "sym": true, - "act_bits": 8, - "act_group_size": 32, - "act_sym": true - }, - "model.layers.27.self_attn.k_proj": { - "data_type": "mx_fp", - "act_data_type": "mx_fp_rceil", - "bits": 8, - "group_size": 32, - "sym": true, - "act_bits": 8, - "act_group_size": 32, - "act_sym": true - }, - "model.layers.27.self_attn.v_proj": { - "data_type": "mx_fp", - "act_data_type": "mx_fp_rceil", - "bits": 8, - "group_size": 32, - "sym": true, - "act_bits": 8, - "act_group_size": 32, - 
"act_sym": true - }, - "model.layers.27.self_attn.o_proj": { - "data_type": "mx_fp", - "act_data_type": "mx_fp_rceil", - "bits": 8, - "group_size": 32, - "sym": true, - "act_bits": 8, - "act_group_size": 32, - "act_sym": true - }, - "model.layers.27.mlp.gate_proj": { - "data_type": "mx_fp", - "act_data_type": "mx_fp_rceil", - "bits": 4, - "group_size": 32, - "sym": true, - "act_bits": 4, - "act_group_size": 32, - "act_sym": true - }, - "model.layers.27.mlp.up_proj": { - "data_type": "mx_fp", - "act_data_type": "mx_fp_rceil", - "bits": 4, - "group_size": 32, - "sym": true, - "act_bits": 4, - "act_group_size": 32, - "act_sym": true - }, - "model.layers.27.mlp.down_proj": { - "data_type": "mx_fp", - "act_data_type": "mx_fp_rceil", - "bits": 8, - "group_size": 32, - "sym": true, - "act_bits": 8, - "act_group_size": 32, - "act_sym": true - }, - "model.layers.28.self_attn.q_proj": { - "data_type": "mx_fp", - "act_data_type": "mx_fp_rceil", - "bits": 8, - "group_size": 32, - "sym": true, - "act_bits": 8, - "act_group_size": 32, - "act_sym": true - }, - "model.layers.28.self_attn.k_proj": { - "data_type": "mx_fp", - "act_data_type": "mx_fp_rceil", - "bits": 8, - "group_size": 32, - "sym": true, - "act_bits": 8, - "act_group_size": 32, - "act_sym": true - }, - "model.layers.28.self_attn.v_proj": { - "data_type": "mx_fp", - "act_data_type": "mx_fp_rceil", - "bits": 8, - "group_size": 32, - "sym": true, - "act_bits": 8, - "act_group_size": 32, - "act_sym": true - }, - "model.layers.28.self_attn.o_proj": { - "data_type": "mx_fp", - "act_data_type": "mx_fp_rceil", - "bits": 8, - "group_size": 32, - "sym": true, - "act_bits": 8, - "act_group_size": 32, - "act_sym": true - }, - "model.layers.28.mlp.gate_proj": { - "data_type": "mx_fp", - "act_data_type": "mx_fp_rceil", - "bits": 4, - "group_size": 32, - "sym": true, - "act_bits": 4, - "act_group_size": 32, - "act_sym": true - }, - "model.layers.28.mlp.up_proj": { - "data_type": "mx_fp", - "act_data_type": "mx_fp_rceil", - "bits": 4, - "group_size": 32, - "sym": true, - "act_bits": 4, - "act_group_size": 32, - "act_sym": true - }, - "model.layers.28.mlp.down_proj": { - "data_type": "mx_fp", - "act_data_type": "mx_fp_rceil", - "bits": 8, - "group_size": 32, - "sym": true, - "act_bits": 8, - "act_group_size": 32, - "act_sym": true - }, - "model.layers.29.self_attn.q_proj": { - "data_type": "mx_fp", - "act_data_type": "mx_fp_rceil", - "bits": 8, - "group_size": 32, - "sym": true, - "act_bits": 8, - "act_group_size": 32, - "act_sym": true - }, - "model.layers.29.self_attn.k_proj": { - "data_type": "mx_fp", - "act_data_type": "mx_fp_rceil", - "bits": 8, - "group_size": 32, - "sym": true, - "act_bits": 8, - "act_group_size": 32, - "act_sym": true - }, - "model.layers.29.self_attn.v_proj": { - "data_type": "mx_fp", - "act_data_type": "mx_fp_rceil", - "bits": 8, - "group_size": 32, - "sym": true, - "act_bits": 8, - "act_group_size": 32, - "act_sym": true - }, - "model.layers.29.self_attn.o_proj": { - "data_type": "mx_fp", - "act_data_type": "mx_fp_rceil", - "bits": 8, - "group_size": 32, - "sym": true, - "act_bits": 8, - "act_group_size": 32, - "act_sym": true - }, - "model.layers.29.mlp.gate_proj": { - "data_type": "mx_fp", - "act_data_type": "mx_fp_rceil", - "bits": 4, - "group_size": 32, - "sym": true, - "act_bits": 4, - "act_group_size": 32, - "act_sym": true - }, - "model.layers.29.mlp.up_proj": { - "data_type": "mx_fp", - "act_data_type": "mx_fp_rceil", - "bits": 8, - "group_size": 32, - "sym": true, - "act_bits": 8, - "act_group_size": 32, - "act_sym": 
true - }, - "model.layers.29.mlp.down_proj": { - "data_type": "mx_fp", - "act_data_type": "mx_fp_rceil", - "bits": 8, - "group_size": 32, - "sym": true, - "act_bits": 8, - "act_group_size": 32, - "act_sym": true - }, - "model.layers.30.self_attn.q_proj": { - "data_type": "mx_fp", - "act_data_type": "mx_fp_rceil", - "bits": 8, - "group_size": 32, - "sym": true, - "act_bits": 8, - "act_group_size": 32, - "act_sym": true - }, - "model.layers.30.self_attn.k_proj": { - "data_type": "mx_fp", - "act_data_type": "mx_fp_rceil", - "bits": 8, - "group_size": 32, - "sym": true, - "act_bits": 8, - "act_group_size": 32, - "act_sym": true - }, - "model.layers.30.self_attn.v_proj": { - "data_type": "mx_fp", - "act_data_type": "mx_fp_rceil", - "bits": 8, - "group_size": 32, - "sym": true, - "act_bits": 8, - "act_group_size": 32, - "act_sym": true - }, - "model.layers.30.self_attn.o_proj": { - "data_type": "mx_fp", - "act_data_type": "mx_fp_rceil", - "bits": 8, - "group_size": 32, - "sym": true, - "act_bits": 8, - "act_group_size": 32, - "act_sym": true - }, - "model.layers.30.mlp.gate_proj": { - "data_type": "mx_fp", - "act_data_type": "mx_fp_rceil", - "bits": 4, - "group_size": 32, - "sym": true, - "act_bits": 4, - "act_group_size": 32, - "act_sym": true - }, - "model.layers.30.mlp.up_proj": { - "data_type": "mx_fp", - "act_data_type": "mx_fp_rceil", - "bits": 8, - "group_size": 32, - "sym": true, - "act_bits": 8, - "act_group_size": 32, - "act_sym": true - }, - "model.layers.30.mlp.down_proj": { - "data_type": "mx_fp", - "act_data_type": "mx_fp_rceil", - "bits": 8, - "group_size": 32, - "sym": true, - "act_bits": 8, - "act_group_size": 32, - "act_sym": true - }, - "model.layers.31.self_attn.q_proj": { - "data_type": "mx_fp", - "act_data_type": "mx_fp_rceil", - "bits": 8, - "group_size": 32, - "sym": true, - "act_bits": 8, - "act_group_size": 32, - "act_sym": true - }, - "model.layers.31.self_attn.k_proj": { - "data_type": "mx_fp", - "act_data_type": "mx_fp_rceil", - "bits": 8, - "group_size": 32, - "sym": true, - "act_bits": 8, - "act_group_size": 32, - "act_sym": true - }, - "model.layers.31.self_attn.v_proj": { - "data_type": "mx_fp", - "act_data_type": "mx_fp_rceil", - "bits": 8, - "group_size": 32, - "sym": true, - "act_bits": 8, - "act_group_size": 32, - "act_sym": true - }, - "model.layers.31.self_attn.o_proj": { - "data_type": "mx_fp", - "act_data_type": "mx_fp_rceil", - "bits": 8, - "group_size": 32, - "sym": true, - "act_bits": 8, - "act_group_size": 32, - "act_sym": true - }, - "model.layers.31.mlp.gate_proj": { - "data_type": "mx_fp", - "act_data_type": "mx_fp_rceil", - "bits": 8, - "group_size": 32, - "sym": true, - "act_bits": 8, - "act_group_size": 32, - "act_sym": true - }, - "model.layers.31.mlp.up_proj": { - "data_type": "mx_fp", - "act_data_type": "mx_fp_rceil", - "bits": 8, - "group_size": 32, - "sym": true, - "act_bits": 8, - "act_group_size": 32, - "act_sym": true - }, - "model.layers.31.mlp.down_proj": { - "data_type": "mx_fp", - "act_data_type": "mx_fp_rceil", - "bits": 8, - "group_size": 32, - "sym": true, - "act_bits": 8, - "act_group_size": 32, - "act_sym": true - } -} \ No newline at end of file diff --git a/examples/pytorch/nlp/huggingface_models/language-modeling/quantization/mix-precision/recipes/Meta-Llama-3.3-70B-Instruct_5bits.json b/examples/pytorch/nlp/huggingface_models/language-modeling/quantization/mix-precision/recipes/Meta-Llama-3.3-70B-Instruct_5bits.json deleted file mode 100644 index 105c6daa492..00000000000 --- 
+++ /dev/null
@@ -1,5602 +0,0 @@
-{
-    "model.layers.0.self_attn.q_proj": {
-        "data_type": "mx_fp",
-        "act_data_type": "mx_fp_rceil",
-        "bits": 4,
-        "group_size": 32,
-        "sym": true,
-        "act_bits": 4,
-        "act_group_size": 32,
-        "act_sym": true
-    },
[... 70B recipe continues with one identically-shaped entry per linear module; within the portion excerpted here (model.layers.0-31), k_proj and v_proj are always MXFP8 and gate_proj is MXFP4 in every layer shown, up_proj is MXFP4 through layer 21 and MXFP8 from layer 22 on, down_proj is MXFP4 through layer 9 and MXFP8 from layer 10 on, q_proj is MXFP4 in layers 0-3, 5-9, and 11-12, and o_proj is MXFP4 in layers 3, 5-7, and 11; the hunk breaks off mid-entry at model.layers.31.self_attn.o_proj and covers the model's remaining layers beyond this excerpt ...]
"act_group_size": 32, - "act_sym": true - }, - "model.layers.31.mlp.gate_proj": { - "data_type": "mx_fp", - "act_data_type": "mx_fp_rceil", - "bits": 4, - "group_size": 32, - "sym": true, - "act_bits": 4, - "act_group_size": 32, - "act_sym": true - }, - "model.layers.31.mlp.up_proj": { - "data_type": "mx_fp", - "act_data_type": "mx_fp_rceil", - "bits": 4, - "group_size": 32, - "sym": true, - "act_bits": 4, - "act_group_size": 32, - "act_sym": true - }, - "model.layers.31.mlp.down_proj": { - "data_type": "mx_fp", - "act_data_type": "mx_fp_rceil", - "bits": 8, - "group_size": 32, - "sym": true, - "act_bits": 8, - "act_group_size": 32, - "act_sym": true - }, - "model.layers.32.self_attn.q_proj": { - "data_type": "mx_fp", - "act_data_type": "mx_fp_rceil", - "bits": 8, - "group_size": 32, - "sym": true, - "act_bits": 8, - "act_group_size": 32, - "act_sym": true - }, - "model.layers.32.self_attn.k_proj": { - "data_type": "mx_fp", - "act_data_type": "mx_fp_rceil", - "bits": 8, - "group_size": 32, - "sym": true, - "act_bits": 8, - "act_group_size": 32, - "act_sym": true - }, - "model.layers.32.self_attn.v_proj": { - "data_type": "mx_fp", - "act_data_type": "mx_fp_rceil", - "bits": 8, - "group_size": 32, - "sym": true, - "act_bits": 8, - "act_group_size": 32, - "act_sym": true - }, - "model.layers.32.self_attn.o_proj": { - "data_type": "mx_fp", - "act_data_type": "mx_fp_rceil", - "bits": 8, - "group_size": 32, - "sym": true, - "act_bits": 8, - "act_group_size": 32, - "act_sym": true - }, - "model.layers.32.mlp.gate_proj": { - "data_type": "mx_fp", - "act_data_type": "mx_fp_rceil", - "bits": 4, - "group_size": 32, - "sym": true, - "act_bits": 4, - "act_group_size": 32, - "act_sym": true - }, - "model.layers.32.mlp.up_proj": { - "data_type": "mx_fp", - "act_data_type": "mx_fp_rceil", - "bits": 4, - "group_size": 32, - "sym": true, - "act_bits": 4, - "act_group_size": 32, - "act_sym": true - }, - "model.layers.32.mlp.down_proj": { - "data_type": "mx_fp", - "act_data_type": "mx_fp_rceil", - "bits": 8, - "group_size": 32, - "sym": true, - "act_bits": 8, - "act_group_size": 32, - "act_sym": true - }, - "model.layers.33.self_attn.q_proj": { - "data_type": "mx_fp", - "act_data_type": "mx_fp_rceil", - "bits": 8, - "group_size": 32, - "sym": true, - "act_bits": 8, - "act_group_size": 32, - "act_sym": true - }, - "model.layers.33.self_attn.k_proj": { - "data_type": "mx_fp", - "act_data_type": "mx_fp_rceil", - "bits": 8, - "group_size": 32, - "sym": true, - "act_bits": 8, - "act_group_size": 32, - "act_sym": true - }, - "model.layers.33.self_attn.v_proj": { - "data_type": "mx_fp", - "act_data_type": "mx_fp_rceil", - "bits": 8, - "group_size": 32, - "sym": true, - "act_bits": 8, - "act_group_size": 32, - "act_sym": true - }, - "model.layers.33.self_attn.o_proj": { - "data_type": "mx_fp", - "act_data_type": "mx_fp_rceil", - "bits": 8, - "group_size": 32, - "sym": true, - "act_bits": 8, - "act_group_size": 32, - "act_sym": true - }, - "model.layers.33.mlp.gate_proj": { - "data_type": "mx_fp", - "act_data_type": "mx_fp_rceil", - "bits": 4, - "group_size": 32, - "sym": true, - "act_bits": 4, - "act_group_size": 32, - "act_sym": true - }, - "model.layers.33.mlp.up_proj": { - "data_type": "mx_fp", - "act_data_type": "mx_fp_rceil", - "bits": 4, - "group_size": 32, - "sym": true, - "act_bits": 4, - "act_group_size": 32, - "act_sym": true - }, - "model.layers.33.mlp.down_proj": { - "data_type": "mx_fp", - "act_data_type": "mx_fp_rceil", - "bits": 8, - "group_size": 32, - "sym": true, - "act_bits": 8, - "act_group_size": 
32, - "act_sym": true - }, - "model.layers.34.self_attn.q_proj": { - "data_type": "mx_fp", - "act_data_type": "mx_fp_rceil", - "bits": 8, - "group_size": 32, - "sym": true, - "act_bits": 8, - "act_group_size": 32, - "act_sym": true - }, - "model.layers.34.self_attn.k_proj": { - "data_type": "mx_fp", - "act_data_type": "mx_fp_rceil", - "bits": 8, - "group_size": 32, - "sym": true, - "act_bits": 8, - "act_group_size": 32, - "act_sym": true - }, - "model.layers.34.self_attn.v_proj": { - "data_type": "mx_fp", - "act_data_type": "mx_fp_rceil", - "bits": 8, - "group_size": 32, - "sym": true, - "act_bits": 8, - "act_group_size": 32, - "act_sym": true - }, - "model.layers.34.self_attn.o_proj": { - "data_type": "mx_fp", - "act_data_type": "mx_fp_rceil", - "bits": 8, - "group_size": 32, - "sym": true, - "act_bits": 8, - "act_group_size": 32, - "act_sym": true - }, - "model.layers.34.mlp.gate_proj": { - "data_type": "mx_fp", - "act_data_type": "mx_fp_rceil", - "bits": 4, - "group_size": 32, - "sym": true, - "act_bits": 4, - "act_group_size": 32, - "act_sym": true - }, - "model.layers.34.mlp.up_proj": { - "data_type": "mx_fp", - "act_data_type": "mx_fp_rceil", - "bits": 8, - "group_size": 32, - "sym": true, - "act_bits": 8, - "act_group_size": 32, - "act_sym": true - }, - "model.layers.34.mlp.down_proj": { - "data_type": "mx_fp", - "act_data_type": "mx_fp_rceil", - "bits": 8, - "group_size": 32, - "sym": true, - "act_bits": 8, - "act_group_size": 32, - "act_sym": true - }, - "model.layers.35.self_attn.q_proj": { - "data_type": "mx_fp", - "act_data_type": "mx_fp_rceil", - "bits": 8, - "group_size": 32, - "sym": true, - "act_bits": 8, - "act_group_size": 32, - "act_sym": true - }, - "model.layers.35.self_attn.k_proj": { - "data_type": "mx_fp", - "act_data_type": "mx_fp_rceil", - "bits": 8, - "group_size": 32, - "sym": true, - "act_bits": 8, - "act_group_size": 32, - "act_sym": true - }, - "model.layers.35.self_attn.v_proj": { - "data_type": "mx_fp", - "act_data_type": "mx_fp_rceil", - "bits": 8, - "group_size": 32, - "sym": true, - "act_bits": 8, - "act_group_size": 32, - "act_sym": true - }, - "model.layers.35.self_attn.o_proj": { - "data_type": "mx_fp", - "act_data_type": "mx_fp_rceil", - "bits": 8, - "group_size": 32, - "sym": true, - "act_bits": 8, - "act_group_size": 32, - "act_sym": true - }, - "model.layers.35.mlp.gate_proj": { - "data_type": "mx_fp", - "act_data_type": "mx_fp_rceil", - "bits": 4, - "group_size": 32, - "sym": true, - "act_bits": 4, - "act_group_size": 32, - "act_sym": true - }, - "model.layers.35.mlp.up_proj": { - "data_type": "mx_fp", - "act_data_type": "mx_fp_rceil", - "bits": 8, - "group_size": 32, - "sym": true, - "act_bits": 8, - "act_group_size": 32, - "act_sym": true - }, - "model.layers.35.mlp.down_proj": { - "data_type": "mx_fp", - "act_data_type": "mx_fp_rceil", - "bits": 8, - "group_size": 32, - "sym": true, - "act_bits": 8, - "act_group_size": 32, - "act_sym": true - }, - "model.layers.36.self_attn.q_proj": { - "data_type": "mx_fp", - "act_data_type": "mx_fp_rceil", - "bits": 8, - "group_size": 32, - "sym": true, - "act_bits": 8, - "act_group_size": 32, - "act_sym": true - }, - "model.layers.36.self_attn.k_proj": { - "data_type": "mx_fp", - "act_data_type": "mx_fp_rceil", - "bits": 8, - "group_size": 32, - "sym": true, - "act_bits": 8, - "act_group_size": 32, - "act_sym": true - }, - "model.layers.36.self_attn.v_proj": { - "data_type": "mx_fp", - "act_data_type": "mx_fp_rceil", - "bits": 8, - "group_size": 32, - "sym": true, - "act_bits": 8, - "act_group_size": 32, - 
"act_sym": true - }, - "model.layers.36.self_attn.o_proj": { - "data_type": "mx_fp", - "act_data_type": "mx_fp_rceil", - "bits": 8, - "group_size": 32, - "sym": true, - "act_bits": 8, - "act_group_size": 32, - "act_sym": true - }, - "model.layers.36.mlp.gate_proj": { - "data_type": "mx_fp", - "act_data_type": "mx_fp_rceil", - "bits": 4, - "group_size": 32, - "sym": true, - "act_bits": 4, - "act_group_size": 32, - "act_sym": true - }, - "model.layers.36.mlp.up_proj": { - "data_type": "mx_fp", - "act_data_type": "mx_fp_rceil", - "bits": 4, - "group_size": 32, - "sym": true, - "act_bits": 4, - "act_group_size": 32, - "act_sym": true - }, - "model.layers.36.mlp.down_proj": { - "data_type": "mx_fp", - "act_data_type": "mx_fp_rceil", - "bits": 8, - "group_size": 32, - "sym": true, - "act_bits": 8, - "act_group_size": 32, - "act_sym": true - }, - "model.layers.37.self_attn.q_proj": { - "data_type": "mx_fp", - "act_data_type": "mx_fp_rceil", - "bits": 8, - "group_size": 32, - "sym": true, - "act_bits": 8, - "act_group_size": 32, - "act_sym": true - }, - "model.layers.37.self_attn.k_proj": { - "data_type": "mx_fp", - "act_data_type": "mx_fp_rceil", - "bits": 8, - "group_size": 32, - "sym": true, - "act_bits": 8, - "act_group_size": 32, - "act_sym": true - }, - "model.layers.37.self_attn.v_proj": { - "data_type": "mx_fp", - "act_data_type": "mx_fp_rceil", - "bits": 8, - "group_size": 32, - "sym": true, - "act_bits": 8, - "act_group_size": 32, - "act_sym": true - }, - "model.layers.37.self_attn.o_proj": { - "data_type": "mx_fp", - "act_data_type": "mx_fp_rceil", - "bits": 8, - "group_size": 32, - "sym": true, - "act_bits": 8, - "act_group_size": 32, - "act_sym": true - }, - "model.layers.37.mlp.gate_proj": { - "data_type": "mx_fp", - "act_data_type": "mx_fp_rceil", - "bits": 4, - "group_size": 32, - "sym": true, - "act_bits": 4, - "act_group_size": 32, - "act_sym": true - }, - "model.layers.37.mlp.up_proj": { - "data_type": "mx_fp", - "act_data_type": "mx_fp_rceil", - "bits": 4, - "group_size": 32, - "sym": true, - "act_bits": 4, - "act_group_size": 32, - "act_sym": true - }, - "model.layers.37.mlp.down_proj": { - "data_type": "mx_fp", - "act_data_type": "mx_fp_rceil", - "bits": 8, - "group_size": 32, - "sym": true, - "act_bits": 8, - "act_group_size": 32, - "act_sym": true - }, - "model.layers.38.self_attn.q_proj": { - "data_type": "mx_fp", - "act_data_type": "mx_fp_rceil", - "bits": 8, - "group_size": 32, - "sym": true, - "act_bits": 8, - "act_group_size": 32, - "act_sym": true - }, - "model.layers.38.self_attn.k_proj": { - "data_type": "mx_fp", - "act_data_type": "mx_fp_rceil", - "bits": 8, - "group_size": 32, - "sym": true, - "act_bits": 8, - "act_group_size": 32, - "act_sym": true - }, - "model.layers.38.self_attn.v_proj": { - "data_type": "mx_fp", - "act_data_type": "mx_fp_rceil", - "bits": 8, - "group_size": 32, - "sym": true, - "act_bits": 8, - "act_group_size": 32, - "act_sym": true - }, - "model.layers.38.self_attn.o_proj": { - "data_type": "mx_fp", - "act_data_type": "mx_fp_rceil", - "bits": 8, - "group_size": 32, - "sym": true, - "act_bits": 8, - "act_group_size": 32, - "act_sym": true - }, - "model.layers.38.mlp.gate_proj": { - "data_type": "mx_fp", - "act_data_type": "mx_fp_rceil", - "bits": 4, - "group_size": 32, - "sym": true, - "act_bits": 4, - "act_group_size": 32, - "act_sym": true - }, - "model.layers.38.mlp.up_proj": { - "data_type": "mx_fp", - "act_data_type": "mx_fp_rceil", - "bits": 8, - "group_size": 32, - "sym": true, - "act_bits": 8, - "act_group_size": 32, - "act_sym": 
true - }, - "model.layers.38.mlp.down_proj": { - "data_type": "mx_fp", - "act_data_type": "mx_fp_rceil", - "bits": 8, - "group_size": 32, - "sym": true, - "act_bits": 8, - "act_group_size": 32, - "act_sym": true - }, - "model.layers.39.self_attn.q_proj": { - "data_type": "mx_fp", - "act_data_type": "mx_fp_rceil", - "bits": 8, - "group_size": 32, - "sym": true, - "act_bits": 8, - "act_group_size": 32, - "act_sym": true - }, - "model.layers.39.self_attn.k_proj": { - "data_type": "mx_fp", - "act_data_type": "mx_fp_rceil", - "bits": 8, - "group_size": 32, - "sym": true, - "act_bits": 8, - "act_group_size": 32, - "act_sym": true - }, - "model.layers.39.self_attn.v_proj": { - "data_type": "mx_fp", - "act_data_type": "mx_fp_rceil", - "bits": 8, - "group_size": 32, - "sym": true, - "act_bits": 8, - "act_group_size": 32, - "act_sym": true - }, - "model.layers.39.self_attn.o_proj": { - "data_type": "mx_fp", - "act_data_type": "mx_fp_rceil", - "bits": 8, - "group_size": 32, - "sym": true, - "act_bits": 8, - "act_group_size": 32, - "act_sym": true - }, - "model.layers.39.mlp.gate_proj": { - "data_type": "mx_fp", - "act_data_type": "mx_fp_rceil", - "bits": 4, - "group_size": 32, - "sym": true, - "act_bits": 4, - "act_group_size": 32, - "act_sym": true - }, - "model.layers.39.mlp.up_proj": { - "data_type": "mx_fp", - "act_data_type": "mx_fp_rceil", - "bits": 8, - "group_size": 32, - "sym": true, - "act_bits": 8, - "act_group_size": 32, - "act_sym": true - }, - "model.layers.39.mlp.down_proj": { - "data_type": "mx_fp", - "act_data_type": "mx_fp_rceil", - "bits": 8, - "group_size": 32, - "sym": true, - "act_bits": 8, - "act_group_size": 32, - "act_sym": true - }, - "model.layers.40.self_attn.q_proj": { - "data_type": "mx_fp", - "act_data_type": "mx_fp_rceil", - "bits": 8, - "group_size": 32, - "sym": true, - "act_bits": 8, - "act_group_size": 32, - "act_sym": true - }, - "model.layers.40.self_attn.k_proj": { - "data_type": "mx_fp", - "act_data_type": "mx_fp_rceil", - "bits": 8, - "group_size": 32, - "sym": true, - "act_bits": 8, - "act_group_size": 32, - "act_sym": true - }, - "model.layers.40.self_attn.v_proj": { - "data_type": "mx_fp", - "act_data_type": "mx_fp_rceil", - "bits": 8, - "group_size": 32, - "sym": true, - "act_bits": 8, - "act_group_size": 32, - "act_sym": true - }, - "model.layers.40.self_attn.o_proj": { - "data_type": "mx_fp", - "act_data_type": "mx_fp_rceil", - "bits": 8, - "group_size": 32, - "sym": true, - "act_bits": 8, - "act_group_size": 32, - "act_sym": true - }, - "model.layers.40.mlp.gate_proj": { - "data_type": "mx_fp", - "act_data_type": "mx_fp_rceil", - "bits": 4, - "group_size": 32, - "sym": true, - "act_bits": 4, - "act_group_size": 32, - "act_sym": true - }, - "model.layers.40.mlp.up_proj": { - "data_type": "mx_fp", - "act_data_type": "mx_fp_rceil", - "bits": 8, - "group_size": 32, - "sym": true, - "act_bits": 8, - "act_group_size": 32, - "act_sym": true - }, - "model.layers.40.mlp.down_proj": { - "data_type": "mx_fp", - "act_data_type": "mx_fp_rceil", - "bits": 8, - "group_size": 32, - "sym": true, - "act_bits": 8, - "act_group_size": 32, - "act_sym": true - }, - "model.layers.41.self_attn.q_proj": { - "data_type": "mx_fp", - "act_data_type": "mx_fp_rceil", - "bits": 8, - "group_size": 32, - "sym": true, - "act_bits": 8, - "act_group_size": 32, - "act_sym": true - }, - "model.layers.41.self_attn.k_proj": { - "data_type": "mx_fp", - "act_data_type": "mx_fp_rceil", - "bits": 8, - "group_size": 32, - "sym": true, - "act_bits": 8, - "act_group_size": 32, - "act_sym": true - }, 
- "model.layers.41.self_attn.v_proj": { - "data_type": "mx_fp", - "act_data_type": "mx_fp_rceil", - "bits": 8, - "group_size": 32, - "sym": true, - "act_bits": 8, - "act_group_size": 32, - "act_sym": true - }, - "model.layers.41.self_attn.o_proj": { - "data_type": "mx_fp", - "act_data_type": "mx_fp_rceil", - "bits": 8, - "group_size": 32, - "sym": true, - "act_bits": 8, - "act_group_size": 32, - "act_sym": true - }, - "model.layers.41.mlp.gate_proj": { - "data_type": "mx_fp", - "act_data_type": "mx_fp_rceil", - "bits": 4, - "group_size": 32, - "sym": true, - "act_bits": 4, - "act_group_size": 32, - "act_sym": true - }, - "model.layers.41.mlp.up_proj": { - "data_type": "mx_fp", - "act_data_type": "mx_fp_rceil", - "bits": 4, - "group_size": 32, - "sym": true, - "act_bits": 4, - "act_group_size": 32, - "act_sym": true - }, - "model.layers.41.mlp.down_proj": { - "data_type": "mx_fp", - "act_data_type": "mx_fp_rceil", - "bits": 8, - "group_size": 32, - "sym": true, - "act_bits": 8, - "act_group_size": 32, - "act_sym": true - }, - "model.layers.42.self_attn.q_proj": { - "data_type": "mx_fp", - "act_data_type": "mx_fp_rceil", - "bits": 8, - "group_size": 32, - "sym": true, - "act_bits": 8, - "act_group_size": 32, - "act_sym": true - }, - "model.layers.42.self_attn.k_proj": { - "data_type": "mx_fp", - "act_data_type": "mx_fp_rceil", - "bits": 8, - "group_size": 32, - "sym": true, - "act_bits": 8, - "act_group_size": 32, - "act_sym": true - }, - "model.layers.42.self_attn.v_proj": { - "data_type": "mx_fp", - "act_data_type": "mx_fp_rceil", - "bits": 8, - "group_size": 32, - "sym": true, - "act_bits": 8, - "act_group_size": 32, - "act_sym": true - }, - "model.layers.42.self_attn.o_proj": { - "data_type": "mx_fp", - "act_data_type": "mx_fp_rceil", - "bits": 4, - "group_size": 32, - "sym": true, - "act_bits": 4, - "act_group_size": 32, - "act_sym": true - }, - "model.layers.42.mlp.gate_proj": { - "data_type": "mx_fp", - "act_data_type": "mx_fp_rceil", - "bits": 4, - "group_size": 32, - "sym": true, - "act_bits": 4, - "act_group_size": 32, - "act_sym": true - }, - "model.layers.42.mlp.up_proj": { - "data_type": "mx_fp", - "act_data_type": "mx_fp_rceil", - "bits": 4, - "group_size": 32, - "sym": true, - "act_bits": 4, - "act_group_size": 32, - "act_sym": true - }, - "model.layers.42.mlp.down_proj": { - "data_type": "mx_fp", - "act_data_type": "mx_fp_rceil", - "bits": 8, - "group_size": 32, - "sym": true, - "act_bits": 8, - "act_group_size": 32, - "act_sym": true - }, - "model.layers.43.self_attn.q_proj": { - "data_type": "mx_fp", - "act_data_type": "mx_fp_rceil", - "bits": 8, - "group_size": 32, - "sym": true, - "act_bits": 8, - "act_group_size": 32, - "act_sym": true - }, - "model.layers.43.self_attn.k_proj": { - "data_type": "mx_fp", - "act_data_type": "mx_fp_rceil", - "bits": 8, - "group_size": 32, - "sym": true, - "act_bits": 8, - "act_group_size": 32, - "act_sym": true - }, - "model.layers.43.self_attn.v_proj": { - "data_type": "mx_fp", - "act_data_type": "mx_fp_rceil", - "bits": 8, - "group_size": 32, - "sym": true, - "act_bits": 8, - "act_group_size": 32, - "act_sym": true - }, - "model.layers.43.self_attn.o_proj": { - "data_type": "mx_fp", - "act_data_type": "mx_fp_rceil", - "bits": 8, - "group_size": 32, - "sym": true, - "act_bits": 8, - "act_group_size": 32, - "act_sym": true - }, - "model.layers.43.mlp.gate_proj": { - "data_type": "mx_fp", - "act_data_type": "mx_fp_rceil", - "bits": 4, - "group_size": 32, - "sym": true, - "act_bits": 4, - "act_group_size": 32, - "act_sym": true - }, - 
"model.layers.43.mlp.up_proj": { - "data_type": "mx_fp", - "act_data_type": "mx_fp_rceil", - "bits": 4, - "group_size": 32, - "sym": true, - "act_bits": 4, - "act_group_size": 32, - "act_sym": true - }, - "model.layers.43.mlp.down_proj": { - "data_type": "mx_fp", - "act_data_type": "mx_fp_rceil", - "bits": 8, - "group_size": 32, - "sym": true, - "act_bits": 8, - "act_group_size": 32, - "act_sym": true - }, - "model.layers.44.self_attn.q_proj": { - "data_type": "mx_fp", - "act_data_type": "mx_fp_rceil", - "bits": 4, - "group_size": 32, - "sym": true, - "act_bits": 4, - "act_group_size": 32, - "act_sym": true - }, - "model.layers.44.self_attn.k_proj": { - "data_type": "mx_fp", - "act_data_type": "mx_fp_rceil", - "bits": 8, - "group_size": 32, - "sym": true, - "act_bits": 8, - "act_group_size": 32, - "act_sym": true - }, - "model.layers.44.self_attn.v_proj": { - "data_type": "mx_fp", - "act_data_type": "mx_fp_rceil", - "bits": 8, - "group_size": 32, - "sym": true, - "act_bits": 8, - "act_group_size": 32, - "act_sym": true - }, - "model.layers.44.self_attn.o_proj": { - "data_type": "mx_fp", - "act_data_type": "mx_fp_rceil", - "bits": 4, - "group_size": 32, - "sym": true, - "act_bits": 4, - "act_group_size": 32, - "act_sym": true - }, - "model.layers.44.mlp.gate_proj": { - "data_type": "mx_fp", - "act_data_type": "mx_fp_rceil", - "bits": 4, - "group_size": 32, - "sym": true, - "act_bits": 4, - "act_group_size": 32, - "act_sym": true - }, - "model.layers.44.mlp.up_proj": { - "data_type": "mx_fp", - "act_data_type": "mx_fp_rceil", - "bits": 4, - "group_size": 32, - "sym": true, - "act_bits": 4, - "act_group_size": 32, - "act_sym": true - }, - "model.layers.44.mlp.down_proj": { - "data_type": "mx_fp", - "act_data_type": "mx_fp_rceil", - "bits": 8, - "group_size": 32, - "sym": true, - "act_bits": 8, - "act_group_size": 32, - "act_sym": true - }, - "model.layers.45.self_attn.q_proj": { - "data_type": "mx_fp", - "act_data_type": "mx_fp_rceil", - "bits": 4, - "group_size": 32, - "sym": true, - "act_bits": 4, - "act_group_size": 32, - "act_sym": true - }, - "model.layers.45.self_attn.k_proj": { - "data_type": "mx_fp", - "act_data_type": "mx_fp_rceil", - "bits": 8, - "group_size": 32, - "sym": true, - "act_bits": 8, - "act_group_size": 32, - "act_sym": true - }, - "model.layers.45.self_attn.v_proj": { - "data_type": "mx_fp", - "act_data_type": "mx_fp_rceil", - "bits": 8, - "group_size": 32, - "sym": true, - "act_bits": 8, - "act_group_size": 32, - "act_sym": true - }, - "model.layers.45.self_attn.o_proj": { - "data_type": "mx_fp", - "act_data_type": "mx_fp_rceil", - "bits": 4, - "group_size": 32, - "sym": true, - "act_bits": 4, - "act_group_size": 32, - "act_sym": true - }, - "model.layers.45.mlp.gate_proj": { - "data_type": "mx_fp", - "act_data_type": "mx_fp_rceil", - "bits": 4, - "group_size": 32, - "sym": true, - "act_bits": 4, - "act_group_size": 32, - "act_sym": true - }, - "model.layers.45.mlp.up_proj": { - "data_type": "mx_fp", - "act_data_type": "mx_fp_rceil", - "bits": 4, - "group_size": 32, - "sym": true, - "act_bits": 4, - "act_group_size": 32, - "act_sym": true - }, - "model.layers.45.mlp.down_proj": { - "data_type": "mx_fp", - "act_data_type": "mx_fp_rceil", - "bits": 8, - "group_size": 32, - "sym": true, - "act_bits": 8, - "act_group_size": 32, - "act_sym": true - }, - "model.layers.46.self_attn.q_proj": { - "data_type": "mx_fp", - "act_data_type": "mx_fp_rceil", - "bits": 4, - "group_size": 32, - "sym": true, - "act_bits": 4, - "act_group_size": 32, - "act_sym": true - }, - 
"model.layers.46.self_attn.k_proj": { - "data_type": "mx_fp", - "act_data_type": "mx_fp_rceil", - "bits": 8, - "group_size": 32, - "sym": true, - "act_bits": 8, - "act_group_size": 32, - "act_sym": true - }, - "model.layers.46.self_attn.v_proj": { - "data_type": "mx_fp", - "act_data_type": "mx_fp_rceil", - "bits": 8, - "group_size": 32, - "sym": true, - "act_bits": 8, - "act_group_size": 32, - "act_sym": true - }, - "model.layers.46.self_attn.o_proj": { - "data_type": "mx_fp", - "act_data_type": "mx_fp_rceil", - "bits": 4, - "group_size": 32, - "sym": true, - "act_bits": 4, - "act_group_size": 32, - "act_sym": true - }, - "model.layers.46.mlp.gate_proj": { - "data_type": "mx_fp", - "act_data_type": "mx_fp_rceil", - "bits": 4, - "group_size": 32, - "sym": true, - "act_bits": 4, - "act_group_size": 32, - "act_sym": true - }, - "model.layers.46.mlp.up_proj": { - "data_type": "mx_fp", - "act_data_type": "mx_fp_rceil", - "bits": 4, - "group_size": 32, - "sym": true, - "act_bits": 4, - "act_group_size": 32, - "act_sym": true - }, - "model.layers.46.mlp.down_proj": { - "data_type": "mx_fp", - "act_data_type": "mx_fp_rceil", - "bits": 8, - "group_size": 32, - "sym": true, - "act_bits": 8, - "act_group_size": 32, - "act_sym": true - }, - "model.layers.47.self_attn.q_proj": { - "data_type": "mx_fp", - "act_data_type": "mx_fp_rceil", - "bits": 4, - "group_size": 32, - "sym": true, - "act_bits": 4, - "act_group_size": 32, - "act_sym": true - }, - "model.layers.47.self_attn.k_proj": { - "data_type": "mx_fp", - "act_data_type": "mx_fp_rceil", - "bits": 8, - "group_size": 32, - "sym": true, - "act_bits": 8, - "act_group_size": 32, - "act_sym": true - }, - "model.layers.47.self_attn.v_proj": { - "data_type": "mx_fp", - "act_data_type": "mx_fp_rceil", - "bits": 8, - "group_size": 32, - "sym": true, - "act_bits": 8, - "act_group_size": 32, - "act_sym": true - }, - "model.layers.47.self_attn.o_proj": { - "data_type": "mx_fp", - "act_data_type": "mx_fp_rceil", - "bits": 4, - "group_size": 32, - "sym": true, - "act_bits": 4, - "act_group_size": 32, - "act_sym": true - }, - "model.layers.47.mlp.gate_proj": { - "data_type": "mx_fp", - "act_data_type": "mx_fp_rceil", - "bits": 4, - "group_size": 32, - "sym": true, - "act_bits": 4, - "act_group_size": 32, - "act_sym": true - }, - "model.layers.47.mlp.up_proj": { - "data_type": "mx_fp", - "act_data_type": "mx_fp_rceil", - "bits": 4, - "group_size": 32, - "sym": true, - "act_bits": 4, - "act_group_size": 32, - "act_sym": true - }, - "model.layers.47.mlp.down_proj": { - "data_type": "mx_fp", - "act_data_type": "mx_fp_rceil", - "bits": 8, - "group_size": 32, - "sym": true, - "act_bits": 8, - "act_group_size": 32, - "act_sym": true - }, - "model.layers.48.self_attn.q_proj": { - "data_type": "mx_fp", - "act_data_type": "mx_fp_rceil", - "bits": 4, - "group_size": 32, - "sym": true, - "act_bits": 4, - "act_group_size": 32, - "act_sym": true - }, - "model.layers.48.self_attn.k_proj": { - "data_type": "mx_fp", - "act_data_type": "mx_fp_rceil", - "bits": 8, - "group_size": 32, - "sym": true, - "act_bits": 8, - "act_group_size": 32, - "act_sym": true - }, - "model.layers.48.self_attn.v_proj": { - "data_type": "mx_fp", - "act_data_type": "mx_fp_rceil", - "bits": 8, - "group_size": 32, - "sym": true, - "act_bits": 8, - "act_group_size": 32, - "act_sym": true - }, - "model.layers.48.self_attn.o_proj": { - "data_type": "mx_fp", - "act_data_type": "mx_fp_rceil", - "bits": 4, - "group_size": 32, - "sym": true, - "act_bits": 4, - "act_group_size": 32, - "act_sym": true - }, - 
"model.layers.48.mlp.gate_proj": { - "data_type": "mx_fp", - "act_data_type": "mx_fp_rceil", - "bits": 4, - "group_size": 32, - "sym": true, - "act_bits": 4, - "act_group_size": 32, - "act_sym": true - }, - "model.layers.48.mlp.up_proj": { - "data_type": "mx_fp", - "act_data_type": "mx_fp_rceil", - "bits": 4, - "group_size": 32, - "sym": true, - "act_bits": 4, - "act_group_size": 32, - "act_sym": true - }, - "model.layers.48.mlp.down_proj": { - "data_type": "mx_fp", - "act_data_type": "mx_fp_rceil", - "bits": 8, - "group_size": 32, - "sym": true, - "act_bits": 8, - "act_group_size": 32, - "act_sym": true - }, - "model.layers.49.self_attn.q_proj": { - "data_type": "mx_fp", - "act_data_type": "mx_fp_rceil", - "bits": 4, - "group_size": 32, - "sym": true, - "act_bits": 4, - "act_group_size": 32, - "act_sym": true - }, - "model.layers.49.self_attn.k_proj": { - "data_type": "mx_fp", - "act_data_type": "mx_fp_rceil", - "bits": 8, - "group_size": 32, - "sym": true, - "act_bits": 8, - "act_group_size": 32, - "act_sym": true - }, - "model.layers.49.self_attn.v_proj": { - "data_type": "mx_fp", - "act_data_type": "mx_fp_rceil", - "bits": 8, - "group_size": 32, - "sym": true, - "act_bits": 8, - "act_group_size": 32, - "act_sym": true - }, - "model.layers.49.self_attn.o_proj": { - "data_type": "mx_fp", - "act_data_type": "mx_fp_rceil", - "bits": 4, - "group_size": 32, - "sym": true, - "act_bits": 4, - "act_group_size": 32, - "act_sym": true - }, - "model.layers.49.mlp.gate_proj": { - "data_type": "mx_fp", - "act_data_type": "mx_fp_rceil", - "bits": 4, - "group_size": 32, - "sym": true, - "act_bits": 4, - "act_group_size": 32, - "act_sym": true - }, - "model.layers.49.mlp.up_proj": { - "data_type": "mx_fp", - "act_data_type": "mx_fp_rceil", - "bits": 4, - "group_size": 32, - "sym": true, - "act_bits": 4, - "act_group_size": 32, - "act_sym": true - }, - "model.layers.49.mlp.down_proj": { - "data_type": "mx_fp", - "act_data_type": "mx_fp_rceil", - "bits": 8, - "group_size": 32, - "sym": true, - "act_bits": 8, - "act_group_size": 32, - "act_sym": true - }, - "model.layers.50.self_attn.q_proj": { - "data_type": "mx_fp", - "act_data_type": "mx_fp_rceil", - "bits": 4, - "group_size": 32, - "sym": true, - "act_bits": 4, - "act_group_size": 32, - "act_sym": true - }, - "model.layers.50.self_attn.k_proj": { - "data_type": "mx_fp", - "act_data_type": "mx_fp_rceil", - "bits": 8, - "group_size": 32, - "sym": true, - "act_bits": 8, - "act_group_size": 32, - "act_sym": true - }, - "model.layers.50.self_attn.v_proj": { - "data_type": "mx_fp", - "act_data_type": "mx_fp_rceil", - "bits": 8, - "group_size": 32, - "sym": true, - "act_bits": 8, - "act_group_size": 32, - "act_sym": true - }, - "model.layers.50.self_attn.o_proj": { - "data_type": "mx_fp", - "act_data_type": "mx_fp_rceil", - "bits": 4, - "group_size": 32, - "sym": true, - "act_bits": 4, - "act_group_size": 32, - "act_sym": true - }, - "model.layers.50.mlp.gate_proj": { - "data_type": "mx_fp", - "act_data_type": "mx_fp_rceil", - "bits": 4, - "group_size": 32, - "sym": true, - "act_bits": 4, - "act_group_size": 32, - "act_sym": true - }, - "model.layers.50.mlp.up_proj": { - "data_type": "mx_fp", - "act_data_type": "mx_fp_rceil", - "bits": 4, - "group_size": 32, - "sym": true, - "act_bits": 4, - "act_group_size": 32, - "act_sym": true - }, - "model.layers.50.mlp.down_proj": { - "data_type": "mx_fp", - "act_data_type": "mx_fp_rceil", - "bits": 8, - "group_size": 32, - "sym": true, - "act_bits": 8, - "act_group_size": 32, - "act_sym": true - }, - 
"model.layers.51.self_attn.q_proj": { - "data_type": "mx_fp", - "act_data_type": "mx_fp_rceil", - "bits": 4, - "group_size": 32, - "sym": true, - "act_bits": 4, - "act_group_size": 32, - "act_sym": true - }, - "model.layers.51.self_attn.k_proj": { - "data_type": "mx_fp", - "act_data_type": "mx_fp_rceil", - "bits": 8, - "group_size": 32, - "sym": true, - "act_bits": 8, - "act_group_size": 32, - "act_sym": true - }, - "model.layers.51.self_attn.v_proj": { - "data_type": "mx_fp", - "act_data_type": "mx_fp_rceil", - "bits": 8, - "group_size": 32, - "sym": true, - "act_bits": 8, - "act_group_size": 32, - "act_sym": true - }, - "model.layers.51.self_attn.o_proj": { - "data_type": "mx_fp", - "act_data_type": "mx_fp_rceil", - "bits": 4, - "group_size": 32, - "sym": true, - "act_bits": 4, - "act_group_size": 32, - "act_sym": true - }, - "model.layers.51.mlp.gate_proj": { - "data_type": "mx_fp", - "act_data_type": "mx_fp_rceil", - "bits": 4, - "group_size": 32, - "sym": true, - "act_bits": 4, - "act_group_size": 32, - "act_sym": true - }, - "model.layers.51.mlp.up_proj": { - "data_type": "mx_fp", - "act_data_type": "mx_fp_rceil", - "bits": 4, - "group_size": 32, - "sym": true, - "act_bits": 4, - "act_group_size": 32, - "act_sym": true - }, - "model.layers.51.mlp.down_proj": { - "data_type": "mx_fp", - "act_data_type": "mx_fp_rceil", - "bits": 8, - "group_size": 32, - "sym": true, - "act_bits": 8, - "act_group_size": 32, - "act_sym": true - }, - "model.layers.52.self_attn.q_proj": { - "data_type": "mx_fp", - "act_data_type": "mx_fp_rceil", - "bits": 8, - "group_size": 32, - "sym": true, - "act_bits": 8, - "act_group_size": 32, - "act_sym": true - }, - "model.layers.52.self_attn.k_proj": { - "data_type": "mx_fp", - "act_data_type": "mx_fp_rceil", - "bits": 8, - "group_size": 32, - "sym": true, - "act_bits": 8, - "act_group_size": 32, - "act_sym": true - }, - "model.layers.52.self_attn.v_proj": { - "data_type": "mx_fp", - "act_data_type": "mx_fp_rceil", - "bits": 8, - "group_size": 32, - "sym": true, - "act_bits": 8, - "act_group_size": 32, - "act_sym": true - }, - "model.layers.52.self_attn.o_proj": { - "data_type": "mx_fp", - "act_data_type": "mx_fp_rceil", - "bits": 4, - "group_size": 32, - "sym": true, - "act_bits": 4, - "act_group_size": 32, - "act_sym": true - }, - "model.layers.52.mlp.gate_proj": { - "data_type": "mx_fp", - "act_data_type": "mx_fp_rceil", - "bits": 4, - "group_size": 32, - "sym": true, - "act_bits": 4, - "act_group_size": 32, - "act_sym": true - }, - "model.layers.52.mlp.up_proj": { - "data_type": "mx_fp", - "act_data_type": "mx_fp_rceil", - "bits": 4, - "group_size": 32, - "sym": true, - "act_bits": 4, - "act_group_size": 32, - "act_sym": true - }, - "model.layers.52.mlp.down_proj": { - "data_type": "mx_fp", - "act_data_type": "mx_fp_rceil", - "bits": 8, - "group_size": 32, - "sym": true, - "act_bits": 8, - "act_group_size": 32, - "act_sym": true - }, - "model.layers.53.self_attn.q_proj": { - "data_type": "mx_fp", - "act_data_type": "mx_fp_rceil", - "bits": 4, - "group_size": 32, - "sym": true, - "act_bits": 4, - "act_group_size": 32, - "act_sym": true - }, - "model.layers.53.self_attn.k_proj": { - "data_type": "mx_fp", - "act_data_type": "mx_fp_rceil", - "bits": 8, - "group_size": 32, - "sym": true, - "act_bits": 8, - "act_group_size": 32, - "act_sym": true - }, - "model.layers.53.self_attn.v_proj": { - "data_type": "mx_fp", - "act_data_type": "mx_fp_rceil", - "bits": 8, - "group_size": 32, - "sym": true, - "act_bits": 8, - "act_group_size": 32, - "act_sym": true - }, - 
"model.layers.53.self_attn.o_proj": { - "data_type": "mx_fp", - "act_data_type": "mx_fp_rceil", - "bits": 4, - "group_size": 32, - "sym": true, - "act_bits": 4, - "act_group_size": 32, - "act_sym": true - }, - "model.layers.53.mlp.gate_proj": { - "data_type": "mx_fp", - "act_data_type": "mx_fp_rceil", - "bits": 4, - "group_size": 32, - "sym": true, - "act_bits": 4, - "act_group_size": 32, - "act_sym": true - }, - "model.layers.53.mlp.up_proj": { - "data_type": "mx_fp", - "act_data_type": "mx_fp_rceil", - "bits": 4, - "group_size": 32, - "sym": true, - "act_bits": 4, - "act_group_size": 32, - "act_sym": true - }, - "model.layers.53.mlp.down_proj": { - "data_type": "mx_fp", - "act_data_type": "mx_fp_rceil", - "bits": 8, - "group_size": 32, - "sym": true, - "act_bits": 8, - "act_group_size": 32, - "act_sym": true - }, - "model.layers.54.self_attn.q_proj": { - "data_type": "mx_fp", - "act_data_type": "mx_fp_rceil", - "bits": 4, - "group_size": 32, - "sym": true, - "act_bits": 4, - "act_group_size": 32, - "act_sym": true - }, - "model.layers.54.self_attn.k_proj": { - "data_type": "mx_fp", - "act_data_type": "mx_fp_rceil", - "bits": 8, - "group_size": 32, - "sym": true, - "act_bits": 8, - "act_group_size": 32, - "act_sym": true - }, - "model.layers.54.self_attn.v_proj": { - "data_type": "mx_fp", - "act_data_type": "mx_fp_rceil", - "bits": 8, - "group_size": 32, - "sym": true, - "act_bits": 8, - "act_group_size": 32, - "act_sym": true - }, - "model.layers.54.self_attn.o_proj": { - "data_type": "mx_fp", - "act_data_type": "mx_fp_rceil", - "bits": 4, - "group_size": 32, - "sym": true, - "act_bits": 4, - "act_group_size": 32, - "act_sym": true - }, - "model.layers.54.mlp.gate_proj": { - "data_type": "mx_fp", - "act_data_type": "mx_fp_rceil", - "bits": 4, - "group_size": 32, - "sym": true, - "act_bits": 4, - "act_group_size": 32, - "act_sym": true - }, - "model.layers.54.mlp.up_proj": { - "data_type": "mx_fp", - "act_data_type": "mx_fp_rceil", - "bits": 4, - "group_size": 32, - "sym": true, - "act_bits": 4, - "act_group_size": 32, - "act_sym": true - }, - "model.layers.54.mlp.down_proj": { - "data_type": "mx_fp", - "act_data_type": "mx_fp_rceil", - "bits": 8, - "group_size": 32, - "sym": true, - "act_bits": 8, - "act_group_size": 32, - "act_sym": true - }, - "model.layers.55.self_attn.q_proj": { - "data_type": "mx_fp", - "act_data_type": "mx_fp_rceil", - "bits": 4, - "group_size": 32, - "sym": true, - "act_bits": 4, - "act_group_size": 32, - "act_sym": true - }, - "model.layers.55.self_attn.k_proj": { - "data_type": "mx_fp", - "act_data_type": "mx_fp_rceil", - "bits": 8, - "group_size": 32, - "sym": true, - "act_bits": 8, - "act_group_size": 32, - "act_sym": true - }, - "model.layers.55.self_attn.v_proj": { - "data_type": "mx_fp", - "act_data_type": "mx_fp_rceil", - "bits": 8, - "group_size": 32, - "sym": true, - "act_bits": 8, - "act_group_size": 32, - "act_sym": true - }, - "model.layers.55.self_attn.o_proj": { - "data_type": "mx_fp", - "act_data_type": "mx_fp_rceil", - "bits": 4, - "group_size": 32, - "sym": true, - "act_bits": 4, - "act_group_size": 32, - "act_sym": true - }, - "model.layers.55.mlp.gate_proj": { - "data_type": "mx_fp", - "act_data_type": "mx_fp_rceil", - "bits": 4, - "group_size": 32, - "sym": true, - "act_bits": 4, - "act_group_size": 32, - "act_sym": true - }, - "model.layers.55.mlp.up_proj": { - "data_type": "mx_fp", - "act_data_type": "mx_fp_rceil", - "bits": 4, - "group_size": 32, - "sym": true, - "act_bits": 4, - "act_group_size": 32, - "act_sym": true - }, - 
"model.layers.55.mlp.down_proj": { - "data_type": "mx_fp", - "act_data_type": "mx_fp_rceil", - "bits": 8, - "group_size": 32, - "sym": true, - "act_bits": 8, - "act_group_size": 32, - "act_sym": true - }, - "model.layers.56.self_attn.q_proj": { - "data_type": "mx_fp", - "act_data_type": "mx_fp_rceil", - "bits": 4, - "group_size": 32, - "sym": true, - "act_bits": 4, - "act_group_size": 32, - "act_sym": true - }, - "model.layers.56.self_attn.k_proj": { - "data_type": "mx_fp", - "act_data_type": "mx_fp_rceil", - "bits": 8, - "group_size": 32, - "sym": true, - "act_bits": 8, - "act_group_size": 32, - "act_sym": true - }, - "model.layers.56.self_attn.v_proj": { - "data_type": "mx_fp", - "act_data_type": "mx_fp_rceil", - "bits": 8, - "group_size": 32, - "sym": true, - "act_bits": 8, - "act_group_size": 32, - "act_sym": true - }, - "model.layers.56.self_attn.o_proj": { - "data_type": "mx_fp", - "act_data_type": "mx_fp_rceil", - "bits": 4, - "group_size": 32, - "sym": true, - "act_bits": 4, - "act_group_size": 32, - "act_sym": true - }, - "model.layers.56.mlp.gate_proj": { - "data_type": "mx_fp", - "act_data_type": "mx_fp_rceil", - "bits": 4, - "group_size": 32, - "sym": true, - "act_bits": 4, - "act_group_size": 32, - "act_sym": true - }, - "model.layers.56.mlp.up_proj": { - "data_type": "mx_fp", - "act_data_type": "mx_fp_rceil", - "bits": 4, - "group_size": 32, - "sym": true, - "act_bits": 4, - "act_group_size": 32, - "act_sym": true - }, - "model.layers.56.mlp.down_proj": { - "data_type": "mx_fp", - "act_data_type": "mx_fp_rceil", - "bits": 8, - "group_size": 32, - "sym": true, - "act_bits": 8, - "act_group_size": 32, - "act_sym": true - }, - "model.layers.57.self_attn.q_proj": { - "data_type": "mx_fp", - "act_data_type": "mx_fp_rceil", - "bits": 4, - "group_size": 32, - "sym": true, - "act_bits": 4, - "act_group_size": 32, - "act_sym": true - }, - "model.layers.57.self_attn.k_proj": { - "data_type": "mx_fp", - "act_data_type": "mx_fp_rceil", - "bits": 8, - "group_size": 32, - "sym": true, - "act_bits": 8, - "act_group_size": 32, - "act_sym": true - }, - "model.layers.57.self_attn.v_proj": { - "data_type": "mx_fp", - "act_data_type": "mx_fp_rceil", - "bits": 8, - "group_size": 32, - "sym": true, - "act_bits": 8, - "act_group_size": 32, - "act_sym": true - }, - "model.layers.57.self_attn.o_proj": { - "data_type": "mx_fp", - "act_data_type": "mx_fp_rceil", - "bits": 4, - "group_size": 32, - "sym": true, - "act_bits": 4, - "act_group_size": 32, - "act_sym": true - }, - "model.layers.57.mlp.gate_proj": { - "data_type": "mx_fp", - "act_data_type": "mx_fp_rceil", - "bits": 4, - "group_size": 32, - "sym": true, - "act_bits": 4, - "act_group_size": 32, - "act_sym": true - }, - "model.layers.57.mlp.up_proj": { - "data_type": "mx_fp", - "act_data_type": "mx_fp_rceil", - "bits": 4, - "group_size": 32, - "sym": true, - "act_bits": 4, - "act_group_size": 32, - "act_sym": true - }, - "model.layers.57.mlp.down_proj": { - "data_type": "mx_fp", - "act_data_type": "mx_fp_rceil", - "bits": 8, - "group_size": 32, - "sym": true, - "act_bits": 8, - "act_group_size": 32, - "act_sym": true - }, - "model.layers.58.self_attn.q_proj": { - "data_type": "mx_fp", - "act_data_type": "mx_fp_rceil", - "bits": 4, - "group_size": 32, - "sym": true, - "act_bits": 4, - "act_group_size": 32, - "act_sym": true - }, - "model.layers.58.self_attn.k_proj": { - "data_type": "mx_fp", - "act_data_type": "mx_fp_rceil", - "bits": 8, - "group_size": 32, - "sym": true, - "act_bits": 8, - "act_group_size": 32, - "act_sym": true - }, - 
"model.layers.58.self_attn.v_proj": { - "data_type": "mx_fp", - "act_data_type": "mx_fp_rceil", - "bits": 8, - "group_size": 32, - "sym": true, - "act_bits": 8, - "act_group_size": 32, - "act_sym": true - }, - "model.layers.58.self_attn.o_proj": { - "data_type": "mx_fp", - "act_data_type": "mx_fp_rceil", - "bits": 4, - "group_size": 32, - "sym": true, - "act_bits": 4, - "act_group_size": 32, - "act_sym": true - }, - "model.layers.58.mlp.gate_proj": { - "data_type": "mx_fp", - "act_data_type": "mx_fp_rceil", - "bits": 4, - "group_size": 32, - "sym": true, - "act_bits": 4, - "act_group_size": 32, - "act_sym": true - }, - "model.layers.58.mlp.up_proj": { - "data_type": "mx_fp", - "act_data_type": "mx_fp_rceil", - "bits": 4, - "group_size": 32, - "sym": true, - "act_bits": 4, - "act_group_size": 32, - "act_sym": true - }, - "model.layers.58.mlp.down_proj": { - "data_type": "mx_fp", - "act_data_type": "mx_fp_rceil", - "bits": 8, - "group_size": 32, - "sym": true, - "act_bits": 8, - "act_group_size": 32, - "act_sym": true - }, - "model.layers.59.self_attn.q_proj": { - "data_type": "mx_fp", - "act_data_type": "mx_fp_rceil", - "bits": 4, - "group_size": 32, - "sym": true, - "act_bits": 4, - "act_group_size": 32, - "act_sym": true - }, - "model.layers.59.self_attn.k_proj": { - "data_type": "mx_fp", - "act_data_type": "mx_fp_rceil", - "bits": 8, - "group_size": 32, - "sym": true, - "act_bits": 8, - "act_group_size": 32, - "act_sym": true - }, - "model.layers.59.self_attn.v_proj": { - "data_type": "mx_fp", - "act_data_type": "mx_fp_rceil", - "bits": 8, - "group_size": 32, - "sym": true, - "act_bits": 8, - "act_group_size": 32, - "act_sym": true - }, - "model.layers.59.self_attn.o_proj": { - "data_type": "mx_fp", - "act_data_type": "mx_fp_rceil", - "bits": 4, - "group_size": 32, - "sym": true, - "act_bits": 4, - "act_group_size": 32, - "act_sym": true - }, - "model.layers.59.mlp.gate_proj": { - "data_type": "mx_fp", - "act_data_type": "mx_fp_rceil", - "bits": 4, - "group_size": 32, - "sym": true, - "act_bits": 4, - "act_group_size": 32, - "act_sym": true - }, - "model.layers.59.mlp.up_proj": { - "data_type": "mx_fp", - "act_data_type": "mx_fp_rceil", - "bits": 4, - "group_size": 32, - "sym": true, - "act_bits": 4, - "act_group_size": 32, - "act_sym": true - }, - "model.layers.59.mlp.down_proj": { - "data_type": "mx_fp", - "act_data_type": "mx_fp_rceil", - "bits": 8, - "group_size": 32, - "sym": true, - "act_bits": 8, - "act_group_size": 32, - "act_sym": true - }, - "model.layers.60.self_attn.q_proj": { - "data_type": "mx_fp", - "act_data_type": "mx_fp_rceil", - "bits": 4, - "group_size": 32, - "sym": true, - "act_bits": 4, - "act_group_size": 32, - "act_sym": true - }, - "model.layers.60.self_attn.k_proj": { - "data_type": "mx_fp", - "act_data_type": "mx_fp_rceil", - "bits": 8, - "group_size": 32, - "sym": true, - "act_bits": 8, - "act_group_size": 32, - "act_sym": true - }, - "model.layers.60.self_attn.v_proj": { - "data_type": "mx_fp", - "act_data_type": "mx_fp_rceil", - "bits": 8, - "group_size": 32, - "sym": true, - "act_bits": 8, - "act_group_size": 32, - "act_sym": true - }, - "model.layers.60.self_attn.o_proj": { - "data_type": "mx_fp", - "act_data_type": "mx_fp_rceil", - "bits": 4, - "group_size": 32, - "sym": true, - "act_bits": 4, - "act_group_size": 32, - "act_sym": true - }, - "model.layers.60.mlp.gate_proj": { - "data_type": "mx_fp", - "act_data_type": "mx_fp_rceil", - "bits": 4, - "group_size": 32, - "sym": true, - "act_bits": 4, - "act_group_size": 32, - "act_sym": true - }, - 
"model.layers.60.mlp.up_proj": { - "data_type": "mx_fp", - "act_data_type": "mx_fp_rceil", - "bits": 4, - "group_size": 32, - "sym": true, - "act_bits": 4, - "act_group_size": 32, - "act_sym": true - }, - "model.layers.60.mlp.down_proj": { - "data_type": "mx_fp", - "act_data_type": "mx_fp_rceil", - "bits": 8, - "group_size": 32, - "sym": true, - "act_bits": 8, - "act_group_size": 32, - "act_sym": true - }, - "model.layers.61.self_attn.q_proj": { - "data_type": "mx_fp", - "act_data_type": "mx_fp_rceil", - "bits": 4, - "group_size": 32, - "sym": true, - "act_bits": 4, - "act_group_size": 32, - "act_sym": true - }, - "model.layers.61.self_attn.k_proj": { - "data_type": "mx_fp", - "act_data_type": "mx_fp_rceil", - "bits": 8, - "group_size": 32, - "sym": true, - "act_bits": 8, - "act_group_size": 32, - "act_sym": true - }, - "model.layers.61.self_attn.v_proj": { - "data_type": "mx_fp", - "act_data_type": "mx_fp_rceil", - "bits": 8, - "group_size": 32, - "sym": true, - "act_bits": 8, - "act_group_size": 32, - "act_sym": true - }, - "model.layers.61.self_attn.o_proj": { - "data_type": "mx_fp", - "act_data_type": "mx_fp_rceil", - "bits": 4, - "group_size": 32, - "sym": true, - "act_bits": 4, - "act_group_size": 32, - "act_sym": true - }, - "model.layers.61.mlp.gate_proj": { - "data_type": "mx_fp", - "act_data_type": "mx_fp_rceil", - "bits": 4, - "group_size": 32, - "sym": true, - "act_bits": 4, - "act_group_size": 32, - "act_sym": true - }, - "model.layers.61.mlp.up_proj": { - "data_type": "mx_fp", - "act_data_type": "mx_fp_rceil", - "bits": 4, - "group_size": 32, - "sym": true, - "act_bits": 4, - "act_group_size": 32, - "act_sym": true - }, - "model.layers.61.mlp.down_proj": { - "data_type": "mx_fp", - "act_data_type": "mx_fp_rceil", - "bits": 8, - "group_size": 32, - "sym": true, - "act_bits": 8, - "act_group_size": 32, - "act_sym": true - }, - "model.layers.62.self_attn.q_proj": { - "data_type": "mx_fp", - "act_data_type": "mx_fp_rceil", - "bits": 4, - "group_size": 32, - "sym": true, - "act_bits": 4, - "act_group_size": 32, - "act_sym": true - }, - "model.layers.62.self_attn.k_proj": { - "data_type": "mx_fp", - "act_data_type": "mx_fp_rceil", - "bits": 8, - "group_size": 32, - "sym": true, - "act_bits": 8, - "act_group_size": 32, - "act_sym": true - }, - "model.layers.62.self_attn.v_proj": { - "data_type": "mx_fp", - "act_data_type": "mx_fp_rceil", - "bits": 8, - "group_size": 32, - "sym": true, - "act_bits": 8, - "act_group_size": 32, - "act_sym": true - }, - "model.layers.62.self_attn.o_proj": { - "data_type": "mx_fp", - "act_data_type": "mx_fp_rceil", - "bits": 4, - "group_size": 32, - "sym": true, - "act_bits": 4, - "act_group_size": 32, - "act_sym": true - }, - "model.layers.62.mlp.gate_proj": { - "data_type": "mx_fp", - "act_data_type": "mx_fp_rceil", - "bits": 4, - "group_size": 32, - "sym": true, - "act_bits": 4, - "act_group_size": 32, - "act_sym": true - }, - "model.layers.62.mlp.up_proj": { - "data_type": "mx_fp", - "act_data_type": "mx_fp_rceil", - "bits": 4, - "group_size": 32, - "sym": true, - "act_bits": 4, - "act_group_size": 32, - "act_sym": true - }, - "model.layers.62.mlp.down_proj": { - "data_type": "mx_fp", - "act_data_type": "mx_fp_rceil", - "bits": 8, - "group_size": 32, - "sym": true, - "act_bits": 8, - "act_group_size": 32, - "act_sym": true - }, - "model.layers.63.self_attn.q_proj": { - "data_type": "mx_fp", - "act_data_type": "mx_fp_rceil", - "bits": 4, - "group_size": 32, - "sym": true, - "act_bits": 4, - "act_group_size": 32, - "act_sym": true - }, - 
"model.layers.63.self_attn.k_proj": { - "data_type": "mx_fp", - "act_data_type": "mx_fp_rceil", - "bits": 8, - "group_size": 32, - "sym": true, - "act_bits": 8, - "act_group_size": 32, - "act_sym": true - }, - "model.layers.63.self_attn.v_proj": { - "data_type": "mx_fp", - "act_data_type": "mx_fp_rceil", - "bits": 8, - "group_size": 32, - "sym": true, - "act_bits": 8, - "act_group_size": 32, - "act_sym": true - }, - "model.layers.63.self_attn.o_proj": { - "data_type": "mx_fp", - "act_data_type": "mx_fp_rceil", - "bits": 4, - "group_size": 32, - "sym": true, - "act_bits": 4, - "act_group_size": 32, - "act_sym": true - }, - "model.layers.63.mlp.gate_proj": { - "data_type": "mx_fp", - "act_data_type": "mx_fp_rceil", - "bits": 4, - "group_size": 32, - "sym": true, - "act_bits": 4, - "act_group_size": 32, - "act_sym": true - }, - "model.layers.63.mlp.up_proj": { - "data_type": "mx_fp", - "act_data_type": "mx_fp_rceil", - "bits": 4, - "group_size": 32, - "sym": true, - "act_bits": 4, - "act_group_size": 32, - "act_sym": true - }, - "model.layers.63.mlp.down_proj": { - "data_type": "mx_fp", - "act_data_type": "mx_fp_rceil", - "bits": 8, - "group_size": 32, - "sym": true, - "act_bits": 8, - "act_group_size": 32, - "act_sym": true - }, - "model.layers.64.self_attn.q_proj": { - "data_type": "mx_fp", - "act_data_type": "mx_fp_rceil", - "bits": 4, - "group_size": 32, - "sym": true, - "act_bits": 4, - "act_group_size": 32, - "act_sym": true - }, - "model.layers.64.self_attn.k_proj": { - "data_type": "mx_fp", - "act_data_type": "mx_fp_rceil", - "bits": 8, - "group_size": 32, - "sym": true, - "act_bits": 8, - "act_group_size": 32, - "act_sym": true - }, - "model.layers.64.self_attn.v_proj": { - "data_type": "mx_fp", - "act_data_type": "mx_fp_rceil", - "bits": 8, - "group_size": 32, - "sym": true, - "act_bits": 8, - "act_group_size": 32, - "act_sym": true - }, - "model.layers.64.self_attn.o_proj": { - "data_type": "mx_fp", - "act_data_type": "mx_fp_rceil", - "bits": 4, - "group_size": 32, - "sym": true, - "act_bits": 4, - "act_group_size": 32, - "act_sym": true - }, - "model.layers.64.mlp.gate_proj": { - "data_type": "mx_fp", - "act_data_type": "mx_fp_rceil", - "bits": 4, - "group_size": 32, - "sym": true, - "act_bits": 4, - "act_group_size": 32, - "act_sym": true - }, - "model.layers.64.mlp.up_proj": { - "data_type": "mx_fp", - "act_data_type": "mx_fp_rceil", - "bits": 4, - "group_size": 32, - "sym": true, - "act_bits": 4, - "act_group_size": 32, - "act_sym": true - }, - "model.layers.64.mlp.down_proj": { - "data_type": "mx_fp", - "act_data_type": "mx_fp_rceil", - "bits": 8, - "group_size": 32, - "sym": true, - "act_bits": 8, - "act_group_size": 32, - "act_sym": true - }, - "model.layers.65.self_attn.q_proj": { - "data_type": "mx_fp", - "act_data_type": "mx_fp_rceil", - "bits": 4, - "group_size": 32, - "sym": true, - "act_bits": 4, - "act_group_size": 32, - "act_sym": true - }, - "model.layers.65.self_attn.k_proj": { - "data_type": "mx_fp", - "act_data_type": "mx_fp_rceil", - "bits": 8, - "group_size": 32, - "sym": true, - "act_bits": 8, - "act_group_size": 32, - "act_sym": true - }, - "model.layers.65.self_attn.v_proj": { - "data_type": "mx_fp", - "act_data_type": "mx_fp_rceil", - "bits": 8, - "group_size": 32, - "sym": true, - "act_bits": 8, - "act_group_size": 32, - "act_sym": true - }, - "model.layers.65.self_attn.o_proj": { - "data_type": "mx_fp", - "act_data_type": "mx_fp_rceil", - "bits": 4, - "group_size": 32, - "sym": true, - "act_bits": 4, - "act_group_size": 32, - "act_sym": true - }, - 
"model.layers.65.mlp.gate_proj": { - "data_type": "mx_fp", - "act_data_type": "mx_fp_rceil", - "bits": 4, - "group_size": 32, - "sym": true, - "act_bits": 4, - "act_group_size": 32, - "act_sym": true - }, - "model.layers.65.mlp.up_proj": { - "data_type": "mx_fp", - "act_data_type": "mx_fp_rceil", - "bits": 4, - "group_size": 32, - "sym": true, - "act_bits": 4, - "act_group_size": 32, - "act_sym": true - }, - "model.layers.65.mlp.down_proj": { - "data_type": "mx_fp", - "act_data_type": "mx_fp_rceil", - "bits": 8, - "group_size": 32, - "sym": true, - "act_bits": 8, - "act_group_size": 32, - "act_sym": true - }, - "model.layers.66.self_attn.q_proj": { - "data_type": "mx_fp", - "act_data_type": "mx_fp_rceil", - "bits": 4, - "group_size": 32, - "sym": true, - "act_bits": 4, - "act_group_size": 32, - "act_sym": true - }, - "model.layers.66.self_attn.k_proj": { - "data_type": "mx_fp", - "act_data_type": "mx_fp_rceil", - "bits": 8, - "group_size": 32, - "sym": true, - "act_bits": 8, - "act_group_size": 32, - "act_sym": true - }, - "model.layers.66.self_attn.v_proj": { - "data_type": "mx_fp", - "act_data_type": "mx_fp_rceil", - "bits": 8, - "group_size": 32, - "sym": true, - "act_bits": 8, - "act_group_size": 32, - "act_sym": true - }, - "model.layers.66.self_attn.o_proj": { - "data_type": "mx_fp", - "act_data_type": "mx_fp_rceil", - "bits": 4, - "group_size": 32, - "sym": true, - "act_bits": 4, - "act_group_size": 32, - "act_sym": true - }, - "model.layers.66.mlp.gate_proj": { - "data_type": "mx_fp", - "act_data_type": "mx_fp_rceil", - "bits": 4, - "group_size": 32, - "sym": true, - "act_bits": 4, - "act_group_size": 32, - "act_sym": true - }, - "model.layers.66.mlp.up_proj": { - "data_type": "mx_fp", - "act_data_type": "mx_fp_rceil", - "bits": 4, - "group_size": 32, - "sym": true, - "act_bits": 4, - "act_group_size": 32, - "act_sym": true - }, - "model.layers.66.mlp.down_proj": { - "data_type": "mx_fp", - "act_data_type": "mx_fp_rceil", - "bits": 8, - "group_size": 32, - "sym": true, - "act_bits": 8, - "act_group_size": 32, - "act_sym": true - }, - "model.layers.67.self_attn.q_proj": { - "data_type": "mx_fp", - "act_data_type": "mx_fp_rceil", - "bits": 4, - "group_size": 32, - "sym": true, - "act_bits": 4, - "act_group_size": 32, - "act_sym": true - }, - "model.layers.67.self_attn.k_proj": { - "data_type": "mx_fp", - "act_data_type": "mx_fp_rceil", - "bits": 8, - "group_size": 32, - "sym": true, - "act_bits": 8, - "act_group_size": 32, - "act_sym": true - }, - "model.layers.67.self_attn.v_proj": { - "data_type": "mx_fp", - "act_data_type": "mx_fp_rceil", - "bits": 8, - "group_size": 32, - "sym": true, - "act_bits": 8, - "act_group_size": 32, - "act_sym": true - }, - "model.layers.67.self_attn.o_proj": { - "data_type": "mx_fp", - "act_data_type": "mx_fp_rceil", - "bits": 4, - "group_size": 32, - "sym": true, - "act_bits": 4, - "act_group_size": 32, - "act_sym": true - }, - "model.layers.67.mlp.gate_proj": { - "data_type": "mx_fp", - "act_data_type": "mx_fp_rceil", - "bits": 4, - "group_size": 32, - "sym": true, - "act_bits": 4, - "act_group_size": 32, - "act_sym": true - }, - "model.layers.67.mlp.up_proj": { - "data_type": "mx_fp", - "act_data_type": "mx_fp_rceil", - "bits": 4, - "group_size": 32, - "sym": true, - "act_bits": 4, - "act_group_size": 32, - "act_sym": true - }, - "model.layers.67.mlp.down_proj": { - "data_type": "mx_fp", - "act_data_type": "mx_fp_rceil", - "bits": 8, - "group_size": 32, - "sym": true, - "act_bits": 8, - "act_group_size": 32, - "act_sym": true - }, - 
"model.layers.68.self_attn.q_proj": { - "data_type": "mx_fp", - "act_data_type": "mx_fp_rceil", - "bits": 4, - "group_size": 32, - "sym": true, - "act_bits": 4, - "act_group_size": 32, - "act_sym": true - }, - "model.layers.68.self_attn.k_proj": { - "data_type": "mx_fp", - "act_data_type": "mx_fp_rceil", - "bits": 8, - "group_size": 32, - "sym": true, - "act_bits": 8, - "act_group_size": 32, - "act_sym": true - }, - "model.layers.68.self_attn.v_proj": { - "data_type": "mx_fp", - "act_data_type": "mx_fp_rceil", - "bits": 8, - "group_size": 32, - "sym": true, - "act_bits": 8, - "act_group_size": 32, - "act_sym": true - }, - "model.layers.68.self_attn.o_proj": { - "data_type": "mx_fp", - "act_data_type": "mx_fp_rceil", - "bits": 4, - "group_size": 32, - "sym": true, - "act_bits": 4, - "act_group_size": 32, - "act_sym": true - }, - "model.layers.68.mlp.gate_proj": { - "data_type": "mx_fp", - "act_data_type": "mx_fp_rceil", - "bits": 4, - "group_size": 32, - "sym": true, - "act_bits": 4, - "act_group_size": 32, - "act_sym": true - }, - "model.layers.68.mlp.up_proj": { - "data_type": "mx_fp", - "act_data_type": "mx_fp_rceil", - "bits": 4, - "group_size": 32, - "sym": true, - "act_bits": 4, - "act_group_size": 32, - "act_sym": true - }, - "model.layers.68.mlp.down_proj": { - "data_type": "mx_fp", - "act_data_type": "mx_fp_rceil", - "bits": 8, - "group_size": 32, - "sym": true, - "act_bits": 8, - "act_group_size": 32, - "act_sym": true - }, - "model.layers.69.self_attn.q_proj": { - "data_type": "mx_fp", - "act_data_type": "mx_fp_rceil", - "bits": 4, - "group_size": 32, - "sym": true, - "act_bits": 4, - "act_group_size": 32, - "act_sym": true - }, - "model.layers.69.self_attn.k_proj": { - "data_type": "mx_fp", - "act_data_type": "mx_fp_rceil", - "bits": 8, - "group_size": 32, - "sym": true, - "act_bits": 8, - "act_group_size": 32, - "act_sym": true - }, - "model.layers.69.self_attn.v_proj": { - "data_type": "mx_fp", - "act_data_type": "mx_fp_rceil", - "bits": 8, - "group_size": 32, - "sym": true, - "act_bits": 8, - "act_group_size": 32, - "act_sym": true - }, - "model.layers.69.self_attn.o_proj": { - "data_type": "mx_fp", - "act_data_type": "mx_fp_rceil", - "bits": 4, - "group_size": 32, - "sym": true, - "act_bits": 4, - "act_group_size": 32, - "act_sym": true - }, - "model.layers.69.mlp.gate_proj": { - "data_type": "mx_fp", - "act_data_type": "mx_fp_rceil", - "bits": 4, - "group_size": 32, - "sym": true, - "act_bits": 4, - "act_group_size": 32, - "act_sym": true - }, - "model.layers.69.mlp.up_proj": { - "data_type": "mx_fp", - "act_data_type": "mx_fp_rceil", - "bits": 4, - "group_size": 32, - "sym": true, - "act_bits": 4, - "act_group_size": 32, - "act_sym": true - }, - "model.layers.69.mlp.down_proj": { - "data_type": "mx_fp", - "act_data_type": "mx_fp_rceil", - "bits": 8, - "group_size": 32, - "sym": true, - "act_bits": 8, - "act_group_size": 32, - "act_sym": true - }, - "model.layers.70.self_attn.q_proj": { - "data_type": "mx_fp", - "act_data_type": "mx_fp_rceil", - "bits": 4, - "group_size": 32, - "sym": true, - "act_bits": 4, - "act_group_size": 32, - "act_sym": true - }, - "model.layers.70.self_attn.k_proj": { - "data_type": "mx_fp", - "act_data_type": "mx_fp_rceil", - "bits": 8, - "group_size": 32, - "sym": true, - "act_bits": 8, - "act_group_size": 32, - "act_sym": true - }, - "model.layers.70.self_attn.v_proj": { - "data_type": "mx_fp", - "act_data_type": "mx_fp_rceil", - "bits": 8, - "group_size": 32, - "sym": true, - "act_bits": 8, - "act_group_size": 32, - "act_sym": true - }, - 
"model.layers.70.self_attn.o_proj": { - "data_type": "mx_fp", - "act_data_type": "mx_fp_rceil", - "bits": 8, - "group_size": 32, - "sym": true, - "act_bits": 8, - "act_group_size": 32, - "act_sym": true - }, - "model.layers.70.mlp.gate_proj": { - "data_type": "mx_fp", - "act_data_type": "mx_fp_rceil", - "bits": 4, - "group_size": 32, - "sym": true, - "act_bits": 4, - "act_group_size": 32, - "act_sym": true - }, - "model.layers.70.mlp.up_proj": { - "data_type": "mx_fp", - "act_data_type": "mx_fp_rceil", - "bits": 4, - "group_size": 32, - "sym": true, - "act_bits": 4, - "act_group_size": 32, - "act_sym": true - }, - "model.layers.70.mlp.down_proj": { - "data_type": "mx_fp", - "act_data_type": "mx_fp_rceil", - "bits": 8, - "group_size": 32, - "sym": true, - "act_bits": 8, - "act_group_size": 32, - "act_sym": true - }, - "model.layers.71.self_attn.q_proj": { - "data_type": "mx_fp", - "act_data_type": "mx_fp_rceil", - "bits": 8, - "group_size": 32, - "sym": true, - "act_bits": 8, - "act_group_size": 32, - "act_sym": true - }, - "model.layers.71.self_attn.k_proj": { - "data_type": "mx_fp", - "act_data_type": "mx_fp_rceil", - "bits": 8, - "group_size": 32, - "sym": true, - "act_bits": 8, - "act_group_size": 32, - "act_sym": true - }, - "model.layers.71.self_attn.v_proj": { - "data_type": "mx_fp", - "act_data_type": "mx_fp_rceil", - "bits": 8, - "group_size": 32, - "sym": true, - "act_bits": 8, - "act_group_size": 32, - "act_sym": true - }, - "model.layers.71.self_attn.o_proj": { - "data_type": "mx_fp", - "act_data_type": "mx_fp_rceil", - "bits": 8, - "group_size": 32, - "sym": true, - "act_bits": 8, - "act_group_size": 32, - "act_sym": true - }, - "model.layers.71.mlp.gate_proj": { - "data_type": "mx_fp", - "act_data_type": "mx_fp_rceil", - "bits": 4, - "group_size": 32, - "sym": true, - "act_bits": 4, - "act_group_size": 32, - "act_sym": true - }, - "model.layers.71.mlp.up_proj": { - "data_type": "mx_fp", - "act_data_type": "mx_fp_rceil", - "bits": 4, - "group_size": 32, - "sym": true, - "act_bits": 4, - "act_group_size": 32, - "act_sym": true - }, - "model.layers.71.mlp.down_proj": { - "data_type": "mx_fp", - "act_data_type": "mx_fp_rceil", - "bits": 8, - "group_size": 32, - "sym": true, - "act_bits": 8, - "act_group_size": 32, - "act_sym": true - }, - "model.layers.72.self_attn.q_proj": { - "data_type": "mx_fp", - "act_data_type": "mx_fp_rceil", - "bits": 8, - "group_size": 32, - "sym": true, - "act_bits": 8, - "act_group_size": 32, - "act_sym": true - }, - "model.layers.72.self_attn.k_proj": { - "data_type": "mx_fp", - "act_data_type": "mx_fp_rceil", - "bits": 8, - "group_size": 32, - "sym": true, - "act_bits": 8, - "act_group_size": 32, - "act_sym": true - }, - "model.layers.72.self_attn.v_proj": { - "data_type": "mx_fp", - "act_data_type": "mx_fp_rceil", - "bits": 8, - "group_size": 32, - "sym": true, - "act_bits": 8, - "act_group_size": 32, - "act_sym": true - }, - "model.layers.72.self_attn.o_proj": { - "data_type": "mx_fp", - "act_data_type": "mx_fp_rceil", - "bits": 8, - "group_size": 32, - "sym": true, - "act_bits": 8, - "act_group_size": 32, - "act_sym": true - }, - "model.layers.72.mlp.gate_proj": { - "data_type": "mx_fp", - "act_data_type": "mx_fp_rceil", - "bits": 4, - "group_size": 32, - "sym": true, - "act_bits": 4, - "act_group_size": 32, - "act_sym": true - }, - "model.layers.72.mlp.up_proj": { - "data_type": "mx_fp", - "act_data_type": "mx_fp_rceil", - "bits": 4, - "group_size": 32, - "sym": true, - "act_bits": 4, - "act_group_size": 32, - "act_sym": true - }, - 
"model.layers.72.mlp.down_proj": { - "data_type": "mx_fp", - "act_data_type": "mx_fp_rceil", - "bits": 8, - "group_size": 32, - "sym": true, - "act_bits": 8, - "act_group_size": 32, - "act_sym": true - }, - "model.layers.73.self_attn.q_proj": { - "data_type": "mx_fp", - "act_data_type": "mx_fp_rceil", - "bits": 8, - "group_size": 32, - "sym": true, - "act_bits": 8, - "act_group_size": 32, - "act_sym": true - }, - "model.layers.73.self_attn.k_proj": { - "data_type": "mx_fp", - "act_data_type": "mx_fp_rceil", - "bits": 8, - "group_size": 32, - "sym": true, - "act_bits": 8, - "act_group_size": 32, - "act_sym": true - }, - "model.layers.73.self_attn.v_proj": { - "data_type": "mx_fp", - "act_data_type": "mx_fp_rceil", - "bits": 8, - "group_size": 32, - "sym": true, - "act_bits": 8, - "act_group_size": 32, - "act_sym": true - }, - "model.layers.73.self_attn.o_proj": { - "data_type": "mx_fp", - "act_data_type": "mx_fp_rceil", - "bits": 8, - "group_size": 32, - "sym": true, - "act_bits": 8, - "act_group_size": 32, - "act_sym": true - }, - "model.layers.73.mlp.gate_proj": { - "data_type": "mx_fp", - "act_data_type": "mx_fp_rceil", - "bits": 4, - "group_size": 32, - "sym": true, - "act_bits": 4, - "act_group_size": 32, - "act_sym": true - }, - "model.layers.73.mlp.up_proj": { - "data_type": "mx_fp", - "act_data_type": "mx_fp_rceil", - "bits": 4, - "group_size": 32, - "sym": true, - "act_bits": 4, - "act_group_size": 32, - "act_sym": true - }, - "model.layers.73.mlp.down_proj": { - "data_type": "mx_fp", - "act_data_type": "mx_fp_rceil", - "bits": 8, - "group_size": 32, - "sym": true, - "act_bits": 8, - "act_group_size": 32, - "act_sym": true - }, - "model.layers.74.self_attn.q_proj": { - "data_type": "mx_fp", - "act_data_type": "mx_fp_rceil", - "bits": 8, - "group_size": 32, - "sym": true, - "act_bits": 8, - "act_group_size": 32, - "act_sym": true - }, - "model.layers.74.self_attn.k_proj": { - "data_type": "mx_fp", - "act_data_type": "mx_fp_rceil", - "bits": 8, - "group_size": 32, - "sym": true, - "act_bits": 8, - "act_group_size": 32, - "act_sym": true - }, - "model.layers.74.self_attn.v_proj": { - "data_type": "mx_fp", - "act_data_type": "mx_fp_rceil", - "bits": 8, - "group_size": 32, - "sym": true, - "act_bits": 8, - "act_group_size": 32, - "act_sym": true - }, - "model.layers.74.self_attn.o_proj": { - "data_type": "mx_fp", - "act_data_type": "mx_fp_rceil", - "bits": 8, - "group_size": 32, - "sym": true, - "act_bits": 8, - "act_group_size": 32, - "act_sym": true - }, - "model.layers.74.mlp.gate_proj": { - "data_type": "mx_fp", - "act_data_type": "mx_fp_rceil", - "bits": 4, - "group_size": 32, - "sym": true, - "act_bits": 4, - "act_group_size": 32, - "act_sym": true - }, - "model.layers.74.mlp.up_proj": { - "data_type": "mx_fp", - "act_data_type": "mx_fp_rceil", - "bits": 4, - "group_size": 32, - "sym": true, - "act_bits": 4, - "act_group_size": 32, - "act_sym": true - }, - "model.layers.74.mlp.down_proj": { - "data_type": "mx_fp", - "act_data_type": "mx_fp_rceil", - "bits": 8, - "group_size": 32, - "sym": true, - "act_bits": 8, - "act_group_size": 32, - "act_sym": true - }, - "model.layers.75.self_attn.q_proj": { - "data_type": "mx_fp", - "act_data_type": "mx_fp_rceil", - "bits": 8, - "group_size": 32, - "sym": true, - "act_bits": 8, - "act_group_size": 32, - "act_sym": true - }, - "model.layers.75.self_attn.k_proj": { - "data_type": "mx_fp", - "act_data_type": "mx_fp_rceil", - "bits": 8, - "group_size": 32, - "sym": true, - "act_bits": 8, - "act_group_size": 32, - "act_sym": true - }, - 
"model.layers.75.self_attn.v_proj": { - "data_type": "mx_fp", - "act_data_type": "mx_fp_rceil", - "bits": 8, - "group_size": 32, - "sym": true, - "act_bits": 8, - "act_group_size": 32, - "act_sym": true - }, - "model.layers.75.self_attn.o_proj": { - "data_type": "mx_fp", - "act_data_type": "mx_fp_rceil", - "bits": 8, - "group_size": 32, - "sym": true, - "act_bits": 8, - "act_group_size": 32, - "act_sym": true - }, - "model.layers.75.mlp.gate_proj": { - "data_type": "mx_fp", - "act_data_type": "mx_fp_rceil", - "bits": 4, - "group_size": 32, - "sym": true, - "act_bits": 4, - "act_group_size": 32, - "act_sym": true - }, - "model.layers.75.mlp.up_proj": { - "data_type": "mx_fp", - "act_data_type": "mx_fp_rceil", - "bits": 4, - "group_size": 32, - "sym": true, - "act_bits": 4, - "act_group_size": 32, - "act_sym": true - }, - "model.layers.75.mlp.down_proj": { - "data_type": "mx_fp", - "act_data_type": "mx_fp_rceil", - "bits": 8, - "group_size": 32, - "sym": true, - "act_bits": 8, - "act_group_size": 32, - "act_sym": true - }, - "model.layers.76.self_attn.q_proj": { - "data_type": "mx_fp", - "act_data_type": "mx_fp_rceil", - "bits": 8, - "group_size": 32, - "sym": true, - "act_bits": 8, - "act_group_size": 32, - "act_sym": true - }, - "model.layers.76.self_attn.k_proj": { - "data_type": "mx_fp", - "act_data_type": "mx_fp_rceil", - "bits": 8, - "group_size": 32, - "sym": true, - "act_bits": 8, - "act_group_size": 32, - "act_sym": true - }, - "model.layers.76.self_attn.v_proj": { - "data_type": "mx_fp", - "act_data_type": "mx_fp_rceil", - "bits": 8, - "group_size": 32, - "sym": true, - "act_bits": 8, - "act_group_size": 32, - "act_sym": true - }, - "model.layers.76.self_attn.o_proj": { - "data_type": "mx_fp", - "act_data_type": "mx_fp_rceil", - "bits": 8, - "group_size": 32, - "sym": true, - "act_bits": 8, - "act_group_size": 32, - "act_sym": true - }, - "model.layers.76.mlp.gate_proj": { - "data_type": "mx_fp", - "act_data_type": "mx_fp_rceil", - "bits": 4, - "group_size": 32, - "sym": true, - "act_bits": 4, - "act_group_size": 32, - "act_sym": true - }, - "model.layers.76.mlp.up_proj": { - "data_type": "mx_fp", - "act_data_type": "mx_fp_rceil", - "bits": 4, - "group_size": 32, - "sym": true, - "act_bits": 4, - "act_group_size": 32, - "act_sym": true - }, - "model.layers.76.mlp.down_proj": { - "data_type": "mx_fp", - "act_data_type": "mx_fp_rceil", - "bits": 8, - "group_size": 32, - "sym": true, - "act_bits": 8, - "act_group_size": 32, - "act_sym": true - }, - "model.layers.77.self_attn.q_proj": { - "data_type": "mx_fp", - "act_data_type": "mx_fp_rceil", - "bits": 4, - "group_size": 32, - "sym": true, - "act_bits": 4, - "act_group_size": 32, - "act_sym": true - }, - "model.layers.77.self_attn.k_proj": { - "data_type": "mx_fp", - "act_data_type": "mx_fp_rceil", - "bits": 8, - "group_size": 32, - "sym": true, - "act_bits": 8, - "act_group_size": 32, - "act_sym": true - }, - "model.layers.77.self_attn.v_proj": { - "data_type": "mx_fp", - "act_data_type": "mx_fp_rceil", - "bits": 8, - "group_size": 32, - "sym": true, - "act_bits": 8, - "act_group_size": 32, - "act_sym": true - }, - "model.layers.77.self_attn.o_proj": { - "data_type": "mx_fp", - "act_data_type": "mx_fp_rceil", - "bits": 8, - "group_size": 32, - "sym": true, - "act_bits": 8, - "act_group_size": 32, - "act_sym": true - }, - "model.layers.77.mlp.gate_proj": { - "data_type": "mx_fp", - "act_data_type": "mx_fp_rceil", - "bits": 4, - "group_size": 32, - "sym": true, - "act_bits": 4, - "act_group_size": 32, - "act_sym": true - }, - 
"model.layers.77.mlp.up_proj": { - "data_type": "mx_fp", - "act_data_type": "mx_fp_rceil", - "bits": 4, - "group_size": 32, - "sym": true, - "act_bits": 4, - "act_group_size": 32, - "act_sym": true - }, - "model.layers.77.mlp.down_proj": { - "data_type": "mx_fp", - "act_data_type": "mx_fp_rceil", - "bits": 8, - "group_size": 32, - "sym": true, - "act_bits": 8, - "act_group_size": 32, - "act_sym": true - }, - "model.layers.78.self_attn.q_proj": { - "data_type": "mx_fp", - "act_data_type": "mx_fp_rceil", - "bits": 4, - "group_size": 32, - "sym": true, - "act_bits": 4, - "act_group_size": 32, - "act_sym": true - }, - "model.layers.78.self_attn.k_proj": { - "data_type": "mx_fp", - "act_data_type": "mx_fp_rceil", - "bits": 8, - "group_size": 32, - "sym": true, - "act_bits": 8, - "act_group_size": 32, - "act_sym": true - }, - "model.layers.78.self_attn.v_proj": { - "data_type": "mx_fp", - "act_data_type": "mx_fp_rceil", - "bits": 8, - "group_size": 32, - "sym": true, - "act_bits": 8, - "act_group_size": 32, - "act_sym": true - }, - "model.layers.78.self_attn.o_proj": { - "data_type": "mx_fp", - "act_data_type": "mx_fp_rceil", - "bits": 8, - "group_size": 32, - "sym": true, - "act_bits": 8, - "act_group_size": 32, - "act_sym": true - }, - "model.layers.78.mlp.gate_proj": { - "data_type": "mx_fp", - "act_data_type": "mx_fp_rceil", - "bits": 4, - "group_size": 32, - "sym": true, - "act_bits": 4, - "act_group_size": 32, - "act_sym": true - }, - "model.layers.78.mlp.up_proj": { - "data_type": "mx_fp", - "act_data_type": "mx_fp_rceil", - "bits": 4, - "group_size": 32, - "sym": true, - "act_bits": 4, - "act_group_size": 32, - "act_sym": true - }, - "model.layers.78.mlp.down_proj": { - "data_type": "mx_fp", - "act_data_type": "mx_fp_rceil", - "bits": 8, - "group_size": 32, - "sym": true, - "act_bits": 8, - "act_group_size": 32, - "act_sym": true - }, - "model.layers.79.self_attn.q_proj": { - "data_type": "mx_fp", - "act_data_type": "mx_fp_rceil", - "bits": 4, - "group_size": 32, - "sym": true, - "act_bits": 4, - "act_group_size": 32, - "act_sym": true - }, - "model.layers.79.self_attn.k_proj": { - "data_type": "mx_fp", - "act_data_type": "mx_fp_rceil", - "bits": 8, - "group_size": 32, - "sym": true, - "act_bits": 8, - "act_group_size": 32, - "act_sym": true - }, - "model.layers.79.self_attn.v_proj": { - "data_type": "mx_fp", - "act_data_type": "mx_fp_rceil", - "bits": 8, - "group_size": 32, - "sym": true, - "act_bits": 8, - "act_group_size": 32, - "act_sym": true - }, - "model.layers.79.self_attn.o_proj": { - "data_type": "mx_fp", - "act_data_type": "mx_fp_rceil", - "bits": 4, - "group_size": 32, - "sym": true, - "act_bits": 4, - "act_group_size": 32, - "act_sym": true - }, - "model.layers.79.mlp.gate_proj": { - "data_type": "mx_fp", - "act_data_type": "mx_fp_rceil", - "bits": 8, - "group_size": 32, - "sym": true, - "act_bits": 8, - "act_group_size": 32, - "act_sym": true - }, - "model.layers.79.mlp.up_proj": { - "data_type": "mx_fp", - "act_data_type": "mx_fp_rceil", - "bits": 8, - "group_size": 32, - "sym": true, - "act_bits": 8, - "act_group_size": 32, - "act_sym": true - }, - "model.layers.79.mlp.down_proj": { - "data_type": "mx_fp", - "act_data_type": "mx_fp_rceil", - "bits": 8, - "group_size": 32, - "sym": true, - "act_bits": 8, - "act_group_size": 32, - "act_sym": true - } -} \ No newline at end of file diff --git a/examples/pytorch/nlp/huggingface_models/language-modeling/quantization/mix-precision/run_hf_inf.py 
b/examples/pytorch/nlp/huggingface_models/language-modeling/quantization/mix-precision/run_hf_inf.py deleted file mode 100644 index 06f479609ec..00000000000 --- a/examples/pytorch/nlp/huggingface_models/language-modeling/quantization/mix-precision/run_hf_inf.py +++ /dev/null @@ -1,29 +0,0 @@ -import torch -import sys - - -quantized_model_path = sys.argv[1] -print("model name or path:", quantized_model_path) -with torch.no_grad(), torch.device("cuda"): - import transformers - - model = transformers.AutoModelForCausalLM.from_pretrained( - quantized_model_path, - torch_dtype=torch.float16, - low_cpu_mem_usage=True, - trust_remote_code=True, - device_map="auto", - ) - tokenizer = transformers.AutoTokenizer.from_pretrained(quantized_model_path) - prompt = "Solve the following math problem step by step: What is 25 + 37? Please answer directly with the result." - - encode = tokenizer.encode(prompt, return_tensors="pt") - with torch.no_grad(): - output_tokens = model.generate( - encode, - max_length=200, - ) - output = tokenizer.decode(output_tokens[0], skip_special_tokens=True) - print(f"Prompt: {prompt}") - print(f"Output: {output}") - assert output is not None, "Output should not be None" \ No newline at end of file diff --git a/neural_compressor/common/base_config.py b/neural_compressor/common/base_config.py index b8c1eac83ab..2ad0eb61744 100644 --- a/neural_compressor/common/base_config.py +++ b/neural_compressor/common/base_config.py @@ -321,6 +321,7 @@ def to_dict(self): result[GLOBAL] = global_config else: result = global_config + result.pop("params_list", None) # Internal parameters return result def get_params_dict(self): @@ -531,7 +532,7 @@ def expand(self) -> List[BaseConfig]: # Assign the options to the `TuningParam` instance param_val = getattr(config, tuning_param.name) if param_val is not None: - if tuning_param.is_tunable(param_val): + if param not in self.non_tunable_params and tuning_param.is_tunable(param_val): tuning_param.options = param_val tuning_param_list.append(tuning_param) else: diff --git a/neural_compressor/torch/algorithms/weight_only/autoround.py b/neural_compressor/torch/algorithms/weight_only/autoround.py index 2ab9aead9e2..1c1821c3891 100644 --- a/neural_compressor/torch/algorithms/weight_only/autoround.py +++ b/neural_compressor/torch/algorithms/weight_only/autoround.py @@ -211,7 +211,8 @@ def __init__( self.super_group_size = super_group_size self.batch_size = batch_size self.amp = amp - self.device = get_accelerator(kwargs.pop("device", "auto")).name() + self.accelerator = get_accelerator(kwargs.pop("device", "auto")) + self.device = self.accelerator.name() self.lr_scheduler = lr_scheduler self.dataset = dataset self.enable_quanted_input = enable_quanted_input @@ -302,6 +303,7 @@ def convert(self, model: torch.nn.Module, *args, **kwargs): device_map=self.auto_scheme_device_map, low_gpu_mem_usage=self.low_gpu_mem_usage, ) + rounder = AutoRound( model, layer_config=self.layer_config, @@ -349,6 +351,7 @@ def convert(self, model: torch.nn.Module, *args, **kwargs): enable_norm_bias_tuning=self.enable_norm_bias_tuning, truncation=self.truncation, enable_torch_compile=self.enable_torch_compile, + # TODO: AutoRound is using layer_config to quantize lm_head, remove it. 
quant_lm_head=self.quant_lm_head, guidance_scale=self.guidance_scale, num_inference_steps=self.num_inference_steps, @@ -370,6 +373,18 @@ def convert(self, model: torch.nn.Module, *args, **kwargs): dump_model_op_stats(rounder.layer_config) + if self.export_format in ["auto_round", "llm_compressor"]: + # the directly returned model is QuantLinear, which is used for packing. + try: + del model + self.accelerator.empty_cache() + logger.info("Quantization is done, reloading model from saved directory...") + import transformers # pylint: disable=E0401 + + model = transformers.AutoModelForCausalLM.from_pretrained(self.output_dir) + except: + pass + return model diff --git a/neural_compressor/torch/quantization/config.py b/neural_compressor/torch/quantization/config.py index dd1bc132776..b1183ef3c64 100644 --- a/neural_compressor/torch/quantization/config.py +++ b/neural_compressor/torch/quantization/config.py @@ -85,6 +85,10 @@ class TorchBaseConfig(BaseConfig): # re-write func _get_op_name_op_type_config to fallback op_type with string # because there are some special op_types for IPEX backend: `Linear&Relu`, `Linear&add`, ... + def __init__(self, white_list): + super().__init__(white_list) + self.non_tunable_params: List[str] = ["white_list"] + def _get_op_name_op_type_config(self): op_type_config_dict = dict() op_name_config_dict = dict() @@ -103,8 +107,6 @@ def _get_op_name_op_type_config(self): def _generate_params_list(cls) -> List[str]: sig = inspect.signature(cls.__init__) params_list = list(sig.parameters.keys())[1:] - if "white_list" in params_list: - params_list.remove("white_list") if "args" in params_list: params_list.remove("args") if "kwargs" in params_list: @@ -118,26 +120,6 @@ class RTNConfig(TorchBaseConfig): """Config class for round-to-nearest weight-only quantization.""" name = RTN - params_list = [ - "dtype", - "bits", - "use_sym", - "group_size", - "group_dim", - "use_full_range", - "use_mse_search", - # layer wise params - "use_layer_wise", - "model_path", - # double quant - "use_double_quant", - "double_quant_dtype", - "double_quant_bits", - "double_quant_use_sym", - "double_quant_group_size", - # quant_lm_head - "quant_lm_head", - ] supported_configs: List[OperatorConfig] = [] def __init__( @@ -346,32 +328,6 @@ class GPTQConfig(TorchBaseConfig): name = GPTQ supported_configs: List[OperatorConfig] = [] - params_list = [ - "dtype", - "bits", - "use_sym", - "group_size", - "use_mse_search", - "use_double_quant", - "double_quant_dtype", - "double_quant_bits", - "double_quant_use_sym", - "double_quant_group_size", - # layer wise params - "use_layer_wise", - "use_block_wise", - "model_path", - # quant lm_head - "quant_lm_head", - # gptq params - "act_order", - "hybrid_order", - "fp8_aware", - "percdamp", - "block_size", - "static_groups", - "true_sequential", - ] def __init__( self, @@ -574,28 +530,6 @@ class AWQConfig(TorchBaseConfig): """ supported_configs: List[OperatorConfig] = [] - params_list = [ - "dtype", - "bits", - "group_size", - "group_dim", - "use_sym", - "use_full_range", - "use_mse_search", - "use_layer_wise", - "use_double_quant", - "double_quant_dtype", - "double_quant_bits", - "double_quant_use_sym", - "double_quant_group_size", - # quant_lm_head - "quant_lm_head", - # AWQ params - "use_auto_scale", - "use_auto_clip", - "folding", - "absorb_layer_dict", - ] name = AWQ def __init__( @@ -756,26 +690,6 @@ class TEQConfig(TorchBaseConfig): """ supported_configs: List[OperatorConfig] = [] - params_list = [ - "dtype", - "bits", - "group_size", - "group_dim", - 
"use_sym", - "use_full_range", - "use_mse_search", - "use_layer_wise", - "use_double_quant", - "double_quant_dtype", - "double_quant_bits", - "double_quant_use_sym", - "double_quant_group_size", - # quant_lm_head - "quant_lm_head", - # TEQ params - "absorb_to_layer", - "folding", - ] name = TEQ def __init__( @@ -926,25 +840,6 @@ class AutoRoundConfig(TorchBaseConfig): """ supported_configs: List[OperatorConfig] = [] - params_list = [ - "dtype", - "bits", - "group_size", - "use_sym", - # autoround params - "enable_full_range", - "batch_size", - "enable_minmax_tuning", - "lr", - "minmax_lr", - "iters", - "seqlen", - "nsamples", - "nblocks", - "gradient_accumulate_steps", - "not_use_best_mse", - "dynamic_max_gap", - ] name = AUTOROUND def __init__( @@ -1000,13 +895,14 @@ def __init__( # v0.8 enable_adam: bool = False, # v0.9: auto scheme parameters - target_bits: int = None, + target_bits: float = None, options: Union[str, list[Union[str]], tuple[Union[str], ...]] = ("MXFP4", "MXFP8"), shared_layers: Optional[Iterable[Iterable[str]]] = None, ignore_scale_zp_bits: bool = False, auto_scheme_method: str = "default", auto_scheme_device_map: str = None, auto_scheme_batch_size: int = None, + output_dir: str = "./temp_auto_round", # Tuning space white_list: Optional[List[OP_NAME_OR_MODULE_TYPE]] = DEFAULT_WHITE_LIST, **kwargs, @@ -1060,17 +956,19 @@ def __init__( device_map: The device to be used for tuning. scheme (str| dict | QuantizationScheme ): A preset scheme that defines the quantization configurations. white_list (Optional[List[OP_NAME_OR_MODULE_TYPE]]): White list of operator names or module types. - target_bits (int): The target bit width for quantization (default is None). + target_bits (float): The target bit width for quantization (default is None). options (Union[str, list[Union[str]], tuple[Union[str], ...]]): The options for mixed-precision quantization. shared_layers (Optional[Iterable[Iterable[str]]]): The shared layers for mixed-precision quantization. ignore_scale_zp_bits (bool): Whether to ignore scale and zero-point bits (default is False). auto_scheme_method (str): The method for automatic scheme selection (default is "default"). auto_scheme_device_map (str): The device map for automatic scheme selection (default is None). auto_scheme_batch_size (int): The batch size for automatic scheme selection (default is 8). + output_dir (str): The output directory for temporary files (default is "./temp_auto_round"). 
""" super().__init__(white_list=white_list) self.params_list = self.__class__._generate_params_list() - self.params_list.remove("options") # option is a list but not a tunable parameter + # these two params are lists but not tunable + self.non_tunable_params.extend(["options", "shared_layers"]) self.enable_full_range = enable_full_range self.batch_size = batch_size @@ -1124,6 +1022,7 @@ def __init__( self.auto_scheme_method = auto_scheme_method self.auto_scheme_device_map = auto_scheme_device_map self.auto_scheme_batch_size = auto_scheme_batch_size + self.output_dir = output_dir # add kwargs for k, v in kwargs.items(): setattr(self, k, v) @@ -1236,14 +1135,6 @@ class MXQuantConfig(TorchBaseConfig): """Config class for MX quantization.""" supported_configs: List[OperatorConfig] = [] - params_list = [ - "w_dtype", - "act_dtype", - "out_dtype", - "blocksize", - "round_method", - "weight_only", - ] name = MX_QUANT def __init__( @@ -1362,16 +1253,6 @@ class DynamicQuantConfig(TorchBaseConfig): """Config class for dynamic quantization.""" name = PT2E_DYNAMIC_QUANT - params_list = [ - "w_dtype", - "w_sym", - "w_granularity", - "w_algo", - "act_dtype", - "act_sym", - "act_granularity", - "act_algo", - ] supported_configs: List[OperatorConfig] = [] def __init__( @@ -1457,17 +1338,6 @@ class INT8StaticQuantConfig(TorchBaseConfig): """Config class for static quantization.""" name = STATIC_QUANT - params_list = [ - "w_dtype", - "w_sym", - "w_granularity", - "w_algo", - "act_dtype", - "act_sym", - "act_granularity", - "act_algo", - "excluded_precisions", - ] supported_configs: List[OperatorConfig] = [] def __init__( @@ -1616,21 +1486,6 @@ class SmoothQuantConfig(TorchBaseConfig): """Config class for smooth quantization.""" name = SMOOTH_QUANT - params_list = [ - "w_dtype", - "w_sym", - "w_granularity", - "w_algo", - "act_dtype", - "act_sym", - "act_granularity", - "act_algo", - "excluded_precisions", - "alpha", - "folding", - "scale_sharing", - "auto_alpha_args", - ] supported_configs: List[OperatorConfig] = [] def __init__( @@ -1777,14 +1632,6 @@ class HQQConfig(TorchBaseConfig): """ name = HQQ - params_list = [ - "bits", - "group_size", - "quant_zero", - "quant_scale", - "scale_quant_group_size", - "quant_lm_head", - ] supported_configs: List[OperatorConfig] = [] def __init__( @@ -2101,9 +1948,6 @@ class MixedPrecisionConfig(TorchBaseConfig): name = MIXED_PRECISION supported_configs: List[OperatorConfig] = [] - params_list = [ - "dtype", - ] supported_half_precision_ops = ( torch.nn.Linear, torch.nn.Conv1d, diff --git a/neural_compressor/torch/utils/auto_accelerator.py b/neural_compressor/torch/utils/auto_accelerator.py index f66e1da9d54..ac9b25aeeee 100644 --- a/neural_compressor/torch/utils/auto_accelerator.py +++ b/neural_compressor/torch/utils/auto_accelerator.py @@ -24,6 +24,7 @@ # To keep it simply, only add the APIs we need. 
+import gc
 import os
 from abc import ABC, abstractmethod
 from enum import Enum, auto
@@ -206,7 +207,7 @@ def device(self, device_index=None):

     def empty_cache(self):
-        """Do nothing."""
-        pass
+        """Nothing device-specific to clear; run Python garbage collection instead."""
+        gc.collect()

     def synchronize(self):
         """Do nothing."""

From 709cc71876243e856caf713ee6969c3123b49b7f Mon Sep 17 00:00:00 2001
From: "He, Xin3" 
Date: Tue, 25 Nov 2025 03:50:08 -0500
Subject: [PATCH 02/10] update requirement

Signed-off-by: He, Xin3 
---
 .../quantization/auto_round/llama3/README.md | 16 ++++++++++++++--
 1 file changed, 14 insertions(+), 2 deletions(-)

diff --git a/examples/pytorch/nlp/huggingface_models/language-modeling/quantization/auto_round/llama3/README.md b/examples/pytorch/nlp/huggingface_models/language-modeling/quantization/auto_round/llama3/README.md
index d33d8090ed3..8c5ae9a028d 100644
--- a/examples/pytorch/nlp/huggingface_models/language-modeling/quantization/auto_round/llama3/README.md
+++ b/examples/pytorch/nlp/huggingface_models/language-modeling/quantization/auto_round/llama3/README.md
@@ -6,13 +6,25 @@ In this example, you can verify the accuracy on HPU/CUDA device with emulation o

 ```bash
 # neural-compressor-pt
-pip install neural-compressor-pt>=3.6
+pip install neural-compressor-pt==3.7
 # auto-round
-pip install auto-round>=0.8.0
+pip install auto-round==0.9.1
 # other requirements
 pip install -r requirements.txt
 ```

+**Until the neural-compressor v3.7 and auto-round v0.9.1 releases are available, please install from source for the latest updates:**
+
+```bash
+# neural-compressor-pt
+INC_PT_ONLY=1 pip install git+https://github.com/intel/neural-compressor.git@master
+# auto-round
+pip install git+https://github.com/intel/auto-round.git@main
+# other requirements
+pip install -r requirements.txt
+```
+
+
 ## Quantization

 ### Demo (`MXFP4`, `MXFP8`, `NVFP4`, `uNVFP4`)

From cc25af55de9420e7e8b18768b11324bcd937ee04 Mon Sep 17 00:00:00 2001
From: "He, Xin3" 
Date: Wed, 26 Nov 2025 00:55:59 -0500
Subject: [PATCH 03/10] add run_quant run_benchmark

Signed-off-by: He, Xin3 
---
 .../quantization/auto_round/llama3/README.md | 127 +++++--------
 .../auto_round/llama3/quantize.py | 2 +-
 .../auto_round/llama3/run_benchmark.sh | 119 +++++++++++++
 .../auto_round/llama3/run_quant.sh | 167 ++++++++++++++++++
 .../torch/algorithms/weight_only/autoround.py | 2 +
 5 files changed, 338 insertions(+), 79 deletions(-)
 create mode 100644 examples/pytorch/nlp/huggingface_models/language-modeling/quantization/auto_round/llama3/run_benchmark.sh
 create mode 100644 examples/pytorch/nlp/huggingface_models/language-modeling/quantization/auto_round/llama3/run_quant.sh

diff --git a/examples/pytorch/nlp/huggingface_models/language-modeling/quantization/auto_round/llama3/README.md b/examples/pytorch/nlp/huggingface_models/language-modeling/quantization/auto_round/llama3/README.md
index 8c5ae9a028d..d8c842eaab2 100644
--- a/examples/pytorch/nlp/huggingface_models/language-modeling/quantization/auto_round/llama3/README.md
+++ b/examples/pytorch/nlp/huggingface_models/language-modeling/quantization/auto_round/llama3/README.md
@@ -84,81 +84,37 @@ CUDA_VISIBLE_DEVICES=0 python quantize.py \

 AutoRound helps improve the accuracy; `iters` and `nsamples` are set higher than the defaults.
 ```bash
 # Quantize and export AutoRound format
-CUDA_VISIBLE_DEVICES=0 python quantize.py \
-    --model_name_or_path /models/Meta-Llama-3.1-8B-Instruct \
-    --quantize \
-    --dtype MXFP8 \
-    --iters 1000 \
-    --nsamples 512 \
-    --enable_torch_compile \
-    --low_gpu_mem_usage \
-    --export_format auto_round \
-    --export_path Llama-3.1-8B-MXFP8
+CUDA_VISIBLE_DEVICES=0 bash run_quant.sh --topology=Llama-3.1-8B --dtype=mxfp8 --input_model=/models/Meta-Llama-3.1-8B-Instruct --output_model=Llama-3.1-8B-MXFP8
 ```
-
-#### Llama 3.1 8B MXFP4 (Mixed with MXFP8)
+#### Llama 3.1 8B MXFP4 (Mixed with MXFP8, Target_bits=7.8)

 ```bash
-CUDA_VISIBLE_DEVICES=0 python quantize.py \
-    --model_name_or_path /models/Meta-Llama-3.1-8B-Instruct \
-    --quantize \
-    --target_bits 7.8 \
-    --options "MXFP4" "MXFP8" \
-    --shared_layer "k_proj" "v_proj" "q_proj" \
-    --shared_layer "gate_proj" "up_proj" \
-    --enable_torch_compile \
-    --low_gpu_mem_usage \
-    --export_format auto_round \
-    --export_path Llama-3.1-8B-MXFP4-MXFP8
+CUDA_VISIBLE_DEVICES=0 bash run_quant.sh --topology=Llama-3.1-8B --dtype=mxfp4_mixed --input_model=/models/Meta-Llama-3.1-8B-Instruct --output_model=Llama-3.1-8B-MXFP4-MXFP8
 ```

 #### Llama 3.3 70B MXFP8

 ```bash
-CUDA_VISIBLE_DEVICES=0 python quantize.py \
-    --model_name_or_path /models/Llama-3.3-70B-Instruct/ \
-    --quantize \
-    --dtype MXFP8 \
-    --quant_lm_head \
-    --iters 0 \
-    --enable_torch_compile \
-    --low_gpu_mem_usage \
-    --export_format auto_round \
-    --export_path Llama-3.3-70B-MXFP8
+CUDA_VISIBLE_DEVICES=0 bash run_quant.sh --topology=Llama-3.3-70B --dtype=mxfp8 --input_model=/models/Llama-3.3-70B-Instruct/ --output_model=Llama-3.3-70B-MXFP8
 ```

-#### Llama 3.3 70B MXFP4 (Mixed with MXFP8)
+#### Llama 3.3 70B MXFP4 (Mixed with MXFP8, Target_bits=5.8)
 ```bash
-CUDA_VISIBLE_DEVICES=0 python quantize.py \
-    --model_name_or_path /models/Llama-3.3-70B-Instruct/ \
-    --quantize \
-    --target_bits 5.8 \
-    --options "MXFP4" "MXFP8" \
-    --shared_layer "k_proj" "v_proj" "q_proj" \
-    --shared_layer "gate_proj" "up_proj" \
-    --enable_torch_compile \
-    --low_gpu_mem_usage \
-    --export_format auto_round \
-    --export_path Llama-3.3-70B-MXFP4-MXFP8
+CUDA_VISIBLE_DEVICES=0 bash run_quant.sh --topology=Llama-3.3-70B --dtype=mxfp4_mixed --input_model=/models/Llama-3.3-70B-Instruct/ --output_model=Llama-3.3-70B-MXFP4-MXFP8
 ```

-#### Llama 3.1 70B uNVFP4
+#### Llama 3.1 70B MXFP8

 ```bash
-CUDA_VISIBLE_DEVICES=0,1,2,3 python quantize.py \
-    --model_name_or_path /models/Llama-3.1-70B-Instruct/ \
-    --quantize \
-    --dtype uNVFP4 \
-    --quant_lm_head \
-    --iters 0 \
-    --enable_torch_compile \
-    --low_gpu_mem_usage \
-    --export_format fake \
-    --export_path Llama-3.1-70B-uNVFP4 \
-    --accuracy
+CUDA_VISIBLE_DEVICES=0 bash run_quant.sh --topology=Llama-3.1-70B --dtype=mxfp8 --input_model=/models/Llama-3.1-70B-Instruct/ --output_model=Llama-3.1-70B-MXFP8
 ```

+#### Llama 3.1 70B uNVFP4
+```bash
+CUDA_VISIBLE_DEVICES=0,1,2,3 bash run_quant.sh --topology=Llama-3.1-70B --dtype=unvfp4 --input_model=/models/Llama-3.1-70B-Instruct/ --output_model=Llama-3.1-70B-uNVFP4
+```

 Note: If you hit an OOM issue, it is suggested to either increase `CUDA_VISIBLE_DEVICES` or reduce `eval_batch_size`.

 ## Inference

@@ -177,28 +133,43 @@ git checkout fused-moe-ar
 VLLM_USE_PRECOMPILED=1 pip install -e .
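# Optional sanity check before evaluating (assumes the editable install above succeeded)
python -c "import vllm; print(vllm.__version__)"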
 ```

-#### Accuracy Evaluation
-```bash
-# add_bos_token=True helps accuracy for general tasks
-VLLM_ENABLE_AR_EXT=1 \
-TORCH_COMPILE_DISABLE=1 \
-CUDA_VISIBLE_DEVICES=0 \
-lm_eval --model vllm \
-    --model_args pretrained=Llama-3.1-8B-MXFP4-MXFP8,add_bos_token=True,tensor_parallel_size=1,data_parallel_size=1 \
-    --tasks piqa,hellaswag,mmlu \
-    --batch_size 8 &
-wait
-# add_bos_token=True helps accuracy for GSM8K
-VLLM_ENABLE_AR_EXT=1 \
-TORCH_COMPILE_DISABLE=1 \
-CUDA_VISIBLE_DEVICES=0 \
-lm_eval --model vllm \
-    --model_args pretrained=Llama-3.1-8B-MXFP4-MXFP8,add_bos_token=False,tensor_parallel_size=1,data_parallel_size=1 \
-    --tasks gsm8k \
-    --batch_size 8
-```
-
-Note: If you got OOM issue, either increasing `CUDA_VISIBLE_DEVICES`+`tensor_parallel_size` or reducing `batch_size` is suggested.
+#### MXFP Benchmark Script
+
+For convenience, we provide a benchmark script that automatically handles GPU detection and tensor-parallelism configuration:
+
+**All 5 MXFP benchmark cases:**
+
+1. **Llama 3.1 8B MXFP8** (1 GPU):
+```bash
+CUDA_VISIBLE_DEVICES=0 bash run_benchmark.sh --model_path=Llama-3.1-8B-MXFP8
+```
+
+2. **Llama 3.1 8B MXFP4 Mixed** (1 GPU):
+```bash
+CUDA_VISIBLE_DEVICES=0 bash run_benchmark.sh --model_path=Llama-3.1-8B-MXFP4-MXFP8
+```
+
+3. **Llama 3.3 70B MXFP8** (4 GPUs):
+```bash
+CUDA_VISIBLE_DEVICES=0,1,2,3 bash run_benchmark.sh --model_path=Llama-3.3-70B-MXFP8
+```
+
+4. **Llama 3.3 70B MXFP4 Mixed** (4 GPUs):
+```bash
+CUDA_VISIBLE_DEVICES=0,1,2,3 bash run_benchmark.sh --model_path=Llama-3.3-70B-MXFP4-MXFP8
+```
+
+5. **Llama 3.1 70B MXFP8** (4 GPUs):
+```bash
+CUDA_VISIBLE_DEVICES=0,1,2,3 bash run_benchmark.sh --model_path=Llama-3.1-70B-MXFP8
+```
+
+The script automatically:
+- Detects available GPUs from `CUDA_VISIBLE_DEVICES` and sets `tensor_parallel_size` accordingly
+- Handles different `add_bos_token` settings for different tasks (GSM8K requires `False`, others use `True`)
+- Runs default tasks: `piqa,hellaswag,mmlu,gsm8k` with batch size 8
+- Supports custom task selection and batch size adjustment
+
 ### NVFP4

 NVFP4 is supported by vLLM already; please set the `llm_compressor` format for exporting during quantization.
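As a minimal sketch of that flow (it reuses the demo flags from the Quantization section; the model and export paths are illustrative, not from the original recipes):

```bash
# Quantize with NVFP4 and export in llm_compressor format for public vLLM
CUDA_VISIBLE_DEVICES=0 python quantize.py \
    --model_name_or_path facebook/opt-125m \
    --quantize \
    --dtype NVFP4 \
    --export_format llm_compressor \
    --export_path OPT-125M-NVFP4
```

A checkpoint exported this way should load in public vLLM directly, without the `VLLM_ENABLE_AR_EXT` fork extension used above.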
diff --git a/examples/pytorch/nlp/huggingface_models/language-modeling/quantization/auto_round/llama3/quantize.py b/examples/pytorch/nlp/huggingface_models/language-modeling/quantization/auto_round/llama3/quantize.py
index 39d51edfc8f..30878475fb3 100644
--- a/examples/pytorch/nlp/huggingface_models/language-modeling/quantization/auto_round/llama3/quantize.py
+++ b/examples/pytorch/nlp/huggingface_models/language-modeling/quantization/auto_round/llama3/quantize.py
@@ -28,7 +28,7 @@ if int(os.getenv("WORLD_SIZE", "0")) > 0:
     os.environ.setdefault("PT_HPU_LAZY_ACC_PAR_MODE", "0")
     os.environ.setdefault("PT_HPU_ENABLE_LAZY_COLLECTIVES", "true")
-from neural_compressor.torch.utils import is_hpex_available, world_size
+from neural_compressor.torch.utils import is_hpex_available
 from neural_compressor.torch.quantization import autotune, prepare, convert, AutoRoundConfig, TuningConfig

 if is_hpex_available():
diff --git a/examples/pytorch/nlp/huggingface_models/language-modeling/quantization/auto_round/llama3/run_benchmark.sh b/examples/pytorch/nlp/huggingface_models/language-modeling/quantization/auto_round/llama3/run_benchmark.sh
new file mode 100644
index 00000000000..87b635be52f
--- /dev/null
+++ b/examples/pytorch/nlp/huggingface_models/language-modeling/quantization/auto_round/llama3/run_benchmark.sh
@@ -0,0 +1,119 @@
+#!/bin/bash
+
+# Usage: CUDA_VISIBLE_DEVICES=0 bash run_benchmark.sh --model_path=<model_path> [--tasks=<tasks>] [--batch_size=<batch_size>]
+
+# Parse command line arguments
+TASKS="piqa,hellaswag,mmlu,gsm8k"
+BATCH_SIZE=8
+
+while [[ $# -gt 0 ]]; do
+    case $1 in
+        --model_path=*)
+            MODEL_PATH="${1#*=}"
+            shift
+            ;;
+        --tasks=*)
+            TASKS="${1#*=}"
+            shift
+            ;;
+        --batch_size=*)
+            BATCH_SIZE="${1#*=}"
+            shift
+            ;;
+        *)
+            echo "Unknown parameter: $1"
+            exit 1
+            ;;
+    esac
+done
+
+# Validate required parameters
+if [[ -z "$MODEL_PATH" ]]; then
+    echo "Usage: bash run_benchmark.sh --model_path=<model_path> [--tasks=<tasks>] [--batch_size=<batch_size>]"
+    echo "Example: CUDA_VISIBLE_DEVICES=0 bash run_benchmark.sh --model_path=Llama-3.1-8B-MXFP8"
+    exit 1
+fi
+
+# Count available GPUs and set tensor_parallel_size
+if [[ -n "$CUDA_VISIBLE_DEVICES" ]]; then
+    # Count comma-separated GPU IDs
+    IFS=',' read -ra GPU_ARRAY <<< "$CUDA_VISIBLE_DEVICES"
+    TENSOR_PARALLEL_SIZE=${#GPU_ARRAY[@]}
+else
+    TENSOR_PARALLEL_SIZE=1
+fi
+
+echo "Running benchmark with parameters:"
+echo "  Model Path: $MODEL_PATH"
+echo "  Tasks: $TASKS"
+echo "  Batch Size: $BATCH_SIZE"
+echo "  Tensor Parallel Size: $TENSOR_PARALLEL_SIZE"
+echo "  CUDA_VISIBLE_DEVICES: $CUDA_VISIBLE_DEVICES"
+
+# Check if the model exists
+if [[ ! -d "$MODEL_PATH" ]]; then
+    echo "Error: Model path '$MODEL_PATH' does not exist!"
+    exit 1
+fi
+
+# Set common environment variables
+export VLLM_ENABLE_AR_EXT=1
+export TORCH_COMPILE_DISABLE=1
+
+# Function to run evaluation for specific tasks
+run_evaluation() {
+    local tasks=$1
+    local add_bos_token=$2
+
+    echo "Running evaluation for tasks: $tasks (add_bos_token=$add_bos_token)"
+
+    # Print the command being executed
+    local cmd="lm_eval --model vllm --model_args pretrained=\"$MODEL_PATH\",add_bos_token=$add_bos_token,tensor_parallel_size=$TENSOR_PARALLEL_SIZE,data_parallel_size=1 --tasks $tasks --batch_size $BATCH_SIZE"
+    echo "Executing command: $cmd"
+
+    lm_eval --model vllm \
+        --model_args pretrained="$MODEL_PATH",add_bos_token=$add_bos_token,tensor_parallel_size=$TENSOR_PARALLEL_SIZE,data_parallel_size=1 \
+        --tasks $tasks \
+        --batch_size $BATCH_SIZE
+
+    if [[ $?
 -ne 0 ]]; then
+        echo "Error: Evaluation failed for tasks: $tasks"
+        return 1
+    fi
+}
+
+# Check if tasks contain gsm8k (requires add_bos_token=False)
+if [[ "$TASKS" == *"gsm8k"* ]]; then
+    # If gsm8k is the only task
+    if [[ "$TASKS" == "gsm8k" ]]; then
+        run_evaluation "$TASKS" false
+    else
+        # Split tasks: run gsm8k separately with add_bos_token=False
+        # (match gsm8k only as a whole list entry so other task names are not clipped)
+        OTHER_TASKS=$(echo "$TASKS" | sed -E 's/(^|,)gsm8k(,|$)/\1/' | sed 's/^,//' | sed 's/,$//')
+
+        if [[ -n "$OTHER_TASKS" ]]; then
+            echo "Running general tasks with add_bos_token=True"
+            run_evaluation "$OTHER_TASKS" true
+
+            if [[ $? -eq 0 ]]; then
+                echo "Running GSM8K with add_bos_token=False"
+                run_evaluation "gsm8k" false
+            else
+                echo "Skipping GSM8K due to previous failure"
+                exit 1
+            fi
+        else
+            run_evaluation "gsm8k" false
+        fi
+    fi
+else
+    # No gsm8k task, use add_bos_token=True for all tasks
+    run_evaluation "$TASKS" true
+fi
+
+if [[ $? -eq 0 ]]; then
+    echo "Benchmark completed successfully!"
+else
+    echo "Benchmark failed!"
+    exit 1
+fi
\ No newline at end of file
diff --git a/examples/pytorch/nlp/huggingface_models/language-modeling/quantization/auto_round/llama3/run_quant.sh b/examples/pytorch/nlp/huggingface_models/language-modeling/quantization/auto_round/llama3/run_quant.sh
new file mode 100644
index 00000000000..43ba9c2d18d
--- /dev/null
+++ b/examples/pytorch/nlp/huggingface_models/language-modeling/quantization/auto_round/llama3/run_quant.sh
@@ -0,0 +1,167 @@
+#!/bin/bash
+
+# Usage: CUDA_VISIBLE_DEVICES=0 bash run_quant.sh --topology=Llama-3.1-8B --dtype=mxfp8 --input_model=/models/Meta-Llama-3.1-8B-Instruct --output_model=Llama-3.1-8B-MXFP8
+
+# Parse command line arguments
+while [[ $# -gt 0 ]]; do
+    case $1 in
+        --topology=*)
+            TOPOLOGY="${1#*=}"
+            shift
+            ;;
+        --dtype=*)
+            DTYPE="${1#*=}"
+            shift
+            ;;
+        --input_model=*)
+            INPUT_MODEL="${1#*=}"
+            shift
+            ;;
+        --output_model=*)
+            OUTPUT_MODEL="${1#*=}"
+            shift
+            ;;
+        *)
+            echo "Unknown parameter: $1"
+            exit 1
+            ;;
+    esac
+done
+
+# Validate required parameters
+if [[ -z "$TOPOLOGY" || -z "$DTYPE" || -z "$INPUT_MODEL" || -z "$OUTPUT_MODEL" ]]; then
+    echo "Usage: bash run_quant.sh --topology=<topology> --dtype=<dtype> --input_model=<model_path> --output_model=<output_path>"
+    echo "Supported topologies: Llama-3.1-8B, Llama-3.3-70B, Llama-3.1-70B"
+    echo "Supported dtypes: mxfp8, mxfp4_mixed, unvfp4"
+    exit 1
+fi
+
+echo "Starting quantization with parameters:"
+echo "  Topology: $TOPOLOGY"
+echo "  Data Type: $DTYPE"
+echo "  Input Model: $INPUT_MODEL"
+echo "  Output Model: $OUTPUT_MODEL"
+
+# Set common parameters
+COMMON_ARGS="--quantize --enable_torch_compile --low_gpu_mem_usage --export_format auto_round"
+
+case "$TOPOLOGY" in
+    "Llama-3.1-8B")
+        case "$DTYPE" in
+            "mxfp8")
+                echo "Running Llama 3.1 8B MXFP8 quantization..."
+                CMD="python quantize.py --model_name_or_path \"$INPUT_MODEL\" $COMMON_ARGS --dtype MXFP8 --iters 1000 --nsamples 512 --export_path \"$OUTPUT_MODEL\""
+                echo "Executing command: $CMD"
+                python quantize.py \
+                    --model_name_or_path "$INPUT_MODEL" \
+                    $COMMON_ARGS \
+                    --dtype MXFP8 \
+                    --iters 1000 \
+                    --nsamples 512 \
+                    --export_path "$OUTPUT_MODEL"
+                ;;
+            "mxfp4_mixed")
+                echo "Running Llama 3.1 8B MXFP4 (Mixed with MXFP8) quantization..."
+ CMD="python quantize.py --model_name_or_path \"$INPUT_MODEL\" $COMMON_ARGS --target_bits 7.8 --options \"MXFP4\" \"MXFP8\" --shared_layer \"k_proj\" \"v_proj\" \"q_proj\" --shared_layer \"gate_proj\" \"up_proj\" --export_path \"$OUTPUT_MODEL\"" + echo "Executing command: $CMD" + python quantize.py \ + --model_name_or_path "$INPUT_MODEL" \ + $COMMON_ARGS \ + --target_bits 7.8 \ + --options "MXFP4" "MXFP8" \ + --shared_layer "k_proj" "v_proj" "q_proj" \ + --shared_layer "gate_proj" "up_proj" \ + --export_path "$OUTPUT_MODEL" + ;; + *) + echo "Error: Unsupported dtype '$DTYPE' for topology '$TOPOLOGY'" + echo "Supported dtypes for Llama-3.1-8B: mxfp8, mxfp4_mixed" + exit 1 + ;; + esac + ;; + "Llama-3.3-70B") + case "$DTYPE" in + "mxfp8") + echo "Running Llama 3.3 70B MXFP8 quantization..." + CMD="python quantize.py --model_name_or_path \"$INPUT_MODEL\" $COMMON_ARGS --dtype MXFP8 --quant_lm_head --iters 0 --export_path \"$OUTPUT_MODEL\"" + echo "Executing command: $CMD" + python quantize.py \ + --model_name_or_path "$INPUT_MODEL" \ + $COMMON_ARGS \ + --dtype MXFP8 \ + --quant_lm_head \ + --iters 0 \ + --export_path "$OUTPUT_MODEL" + ;; + "mxfp4_mixed") + echo "Running Llama 3.3 70B MXFP4 (Mixed with MXFP8) quantization..." + CMD="python quantize.py --model_name_or_path \"$INPUT_MODEL\" $COMMON_ARGS --target_bits 5.8 --options \"MXFP4\" \"MXFP8\" --shared_layer \"k_proj\" \"v_proj\" \"q_proj\" --shared_layer \"gate_proj\" \"up_proj\" --export_path \"$OUTPUT_MODEL\"" + echo "Executing command: $CMD" + python quantize.py \ + --model_name_or_path "$INPUT_MODEL" \ + $COMMON_ARGS \ + --target_bits 5.8 \ + --options "MXFP4" "MXFP8" \ + --shared_layer "k_proj" "v_proj" "q_proj" \ + --shared_layer "gate_proj" "up_proj" \ + --export_path "$OUTPUT_MODEL" + ;; + *) + echo "Error: Unsupported dtype '$DTYPE' for topology '$TOPOLOGY'" + echo "Supported dtypes for Llama-3.3-70B: mxfp8, mxfp4_mixed" + exit 1 + ;; + esac + ;; + "Llama-3.1-70B") + case "$DTYPE" in + "mxfp8") + echo "Running Llama 3.1 70B MXFP8 quantization..." + CMD="python quantize.py --model_name_or_path \"$INPUT_MODEL\" $COMMON_ARGS --dtype MXFP8 --quant_lm_head --iters 0 --export_path \"$OUTPUT_MODEL\"" + echo "Executing command: $CMD" + python quantize.py \ + --model_name_or_path "$INPUT_MODEL" \ + $COMMON_ARGS \ + --dtype MXFP8 \ + --quant_lm_head \ + --iters 0 \ + --export_path "$OUTPUT_MODEL" + ;; + "unvfp4") + echo "Running Llama 3.1 70B uNVFP4 quantization..." + CMD="python quantize.py --model_name_or_path \"$INPUT_MODEL\" --quantize --dtype uNVFP4 --quant_lm_head --iters 0 --enable_torch_compile --low_gpu_mem_usage --export_format fake --export_path \"$OUTPUT_MODEL\" --accuracy" + echo "Executing command: $CMD" + python quantize.py \ + --model_name_or_path "$INPUT_MODEL" \ + --quantize \ + --dtype uNVFP4 \ + --quant_lm_head \ + --iters 0 \ + --enable_torch_compile \ + --low_gpu_mem_usage \ + --export_format fake \ + --export_path "$OUTPUT_MODEL" \ + --accuracy + ;; + *) + echo "Error: Unsupported dtype '$DTYPE' for topology '$TOPOLOGY'" + echo "Supported dtypes for Llama-3.3-70B: mxfp8, mxfp4_mixed" + exit 1 + ;; + esac + ;; + *) + echo "Error: Unsupported topology '$TOPOLOGY'" + echo "Supported topologies: Llama-3.1-8B, Llama-3.3-70B" + exit 1 + ;; +esac + +if [[ $? -eq 0 ]]; then + echo "Quantization completed successfully!" + echo "Output model saved to: $OUTPUT_MODEL" +else + echo "Quantization failed!" 
+ exit 1 +fi \ No newline at end of file diff --git a/neural_compressor/torch/algorithms/weight_only/autoround.py b/neural_compressor/torch/algorithms/weight_only/autoround.py index 1c1821c3891..f84ff7e7871 100644 --- a/neural_compressor/torch/algorithms/weight_only/autoround.py +++ b/neural_compressor/torch/algorithms/weight_only/autoround.py @@ -384,6 +384,8 @@ def convert(self, model: torch.nn.Module, *args, **kwargs): model = transformers.AutoModelForCausalLM.from_pretrained(self.output_dir) except: pass + else: + self.accelerator.empty_cache() return model From dcd69a27db5c7ff7133df674ca4c070412172755 Mon Sep 17 00:00:00 2001 From: "He, Xin3" Date: Wed, 26 Nov 2025 01:02:04 -0500 Subject: [PATCH 04/10] update readme Signed-off-by: He, Xin3 --- .../quantization/auto_round/llama3/README.md | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/examples/pytorch/nlp/huggingface_models/language-modeling/quantization/auto_round/llama3/README.md b/examples/pytorch/nlp/huggingface_models/language-modeling/quantization/auto_round/llama3/README.md index d8c842eaab2..2673f7bf3f2 100644 --- a/examples/pytorch/nlp/huggingface_models/language-modeling/quantization/auto_round/llama3/README.md +++ b/examples/pytorch/nlp/huggingface_models/language-modeling/quantization/auto_round/llama3/README.md @@ -127,9 +127,7 @@ Note: If you got OOM issue, either increasing `CUDA_VISIBLE_DEVICES` or reducing ```bash # Install the forked vLLM -git clone https://github.com/yiliu30/vllm-fork.git -cd vllm-fork -git checkout fused-moe-ar +git clone -b fused-moe-ar --single-branch --quiet https://github.com/yiliu30/vllm-fork.git && cd vllm-fork VLLM_USE_PRECOMPILED=1 pip install -e . ``` From f07ca2da95e8db6ecd6255aa8da44b88456178ac Mon Sep 17 00:00:00 2001 From: Xin He Date: Wed, 26 Nov 2025 14:03:04 +0800 Subject: [PATCH 05/10] Update neural_compressor/torch/algorithms/weight_only/autoround.py --- neural_compressor/torch/algorithms/weight_only/autoround.py | 1 - 1 file changed, 1 deletion(-) diff --git a/neural_compressor/torch/algorithms/weight_only/autoround.py b/neural_compressor/torch/algorithms/weight_only/autoround.py index f84ff7e7871..f236a8dba42 100644 --- a/neural_compressor/torch/algorithms/weight_only/autoround.py +++ b/neural_compressor/torch/algorithms/weight_only/autoround.py @@ -351,7 +351,6 @@ def convert(self, model: torch.nn.Module, *args, **kwargs): enable_norm_bias_tuning=self.enable_norm_bias_tuning, truncation=self.truncation, enable_torch_compile=self.enable_torch_compile, - # TODO: AutoRound is using layer_config to quantize lm_head, remove it. 
quant_lm_head=self.quant_lm_head, guidance_scale=self.guidance_scale, num_inference_steps=self.num_inference_steps, From bca20632edeb4dccf1a294afad7d45ad2152573a Mon Sep 17 00:00:00 2001 From: Xin He Date: Wed, 26 Nov 2025 14:03:33 +0800 Subject: [PATCH 06/10] Update neural_compressor/common/base_config.py Co-authored-by: Copilot <175728472+Copilot@users.noreply.github.com> --- neural_compressor/common/base_config.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/neural_compressor/common/base_config.py b/neural_compressor/common/base_config.py index 2ad0eb61744..17ccd7f6457 100644 --- a/neural_compressor/common/base_config.py +++ b/neural_compressor/common/base_config.py @@ -532,7 +532,7 @@ def expand(self) -> List[BaseConfig]: # Assign the options to the `TuningParam` instance param_val = getattr(config, tuning_param.name) if param_val is not None: - if param not in self.non_tunable_params and tuning_param.is_tunable(param_val): + if tuning_param.name not in self.non_tunable_params and tuning_param.is_tunable(param_val): tuning_param.options = param_val tuning_param_list.append(tuning_param) else: From 1d812a0917e1e2130585071582f5d100cc24935a Mon Sep 17 00:00:00 2001 From: Xin He Date: Wed, 26 Nov 2025 14:03:48 +0800 Subject: [PATCH 07/10] Update neural_compressor/torch/algorithms/weight_only/autoround.py Co-authored-by: Tang Kaihui --- neural_compressor/torch/algorithms/weight_only/autoround.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/neural_compressor/torch/algorithms/weight_only/autoround.py b/neural_compressor/torch/algorithms/weight_only/autoround.py index f236a8dba42..860358f9224 100644 --- a/neural_compressor/torch/algorithms/weight_only/autoround.py +++ b/neural_compressor/torch/algorithms/weight_only/autoround.py @@ -377,7 +377,7 @@ def convert(self, model: torch.nn.Module, *args, **kwargs): try: del model self.accelerator.empty_cache() - logger.info("Quantization is done, reloading model from saved directory...") + logger.info(f"Quantization is done, reloading model from saved directory({self.output_dir})...") import transformers # pylint: disable=E0401 model = transformers.AutoModelForCausalLM.from_pretrained(self.output_dir) From 99b8fffc51faad1700e9420f1f612dfe21bfd690 Mon Sep 17 00:00:00 2001 From: "He, Xin3" Date: Wed, 26 Nov 2025 21:05:14 -0500 Subject: [PATCH 08/10] fix bug Signed-off-by: He, Xin3 --- neural_compressor/torch/quantization/config.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/neural_compressor/torch/quantization/config.py b/neural_compressor/torch/quantization/config.py index b1183ef3c64..5a86ddd4fb0 100644 --- a/neural_compressor/torch/quantization/config.py +++ b/neural_compressor/torch/quantization/config.py @@ -85,8 +85,9 @@ class TorchBaseConfig(BaseConfig): # re-write func _get_op_name_op_type_config to fallback op_type with string # because there are some special op_types for IPEX backend: `Linear&Relu`, `Linear&add`, ... - def __init__(self, white_list): + def __init__(self, white_list=DEFAULT_WHITE_LIST): super().__init__(white_list) + self.params_list = self.__class__._generate_params_list() self.non_tunable_params: List[str] = ["white_list"] def _get_op_name_op_type_config(self): @@ -966,7 +967,6 @@ def __init__( output_dir (str): The output directory for temporary files (default is "./temp_auto_round"). 
""" super().__init__(white_list=white_list) - self.params_list = self.__class__._generate_params_list() # these two params are lists but not tunable self.non_tunable_params.extend(["options", "shared_layers"]) From 3ffb650cb8ac9a00176fe32bc13f705bf771eb66 Mon Sep 17 00:00:00 2001 From: "He, Xin3" Date: Thu, 27 Nov 2025 21:27:05 -0500 Subject: [PATCH 09/10] update readme and fix CI Signed-off-by: He, Xin3 --- .../quantization/auto_round/llama3/README.md | 9 ++++----- neural_compressor/common/base_config.py | 1 + .../torch/algorithms/weight_only/autoround.py | 5 +---- 3 files changed, 6 insertions(+), 9 deletions(-) diff --git a/examples/pytorch/nlp/huggingface_models/language-modeling/quantization/auto_round/llama3/README.md b/examples/pytorch/nlp/huggingface_models/language-modeling/quantization/auto_round/llama3/README.md index 2673f7bf3f2..3b22bd723a1 100644 --- a/examples/pytorch/nlp/huggingface_models/language-modeling/quantization/auto_round/llama3/README.md +++ b/examples/pytorch/nlp/huggingface_models/language-modeling/quantization/auto_round/llama3/README.md @@ -8,7 +8,7 @@ In this example, you can verify the accuracy on HPU/CUDA device with emulation o # neural-compressor-pt pip install neural-compressor-pt==3.7 # auto-round -pip install auto-round==0.9.1 +pip install auto-round==0.9.2 # other requirements pip install -r requirements.txt ``` @@ -19,7 +19,7 @@ pip install -r requirements.txt # neural-compressor-pt INC_PT_ONLY=1 pip install git+https://github.com/intel/neural-compressor.git@master # auto-round -pip install git+https://github.com/intel/auto-round.git@main +pip install git+https://github.com/intel/auto-round.git@more-ar-ext # other requirements pip install -r requirements.txt ``` @@ -44,7 +44,7 @@ CUDA_VISIBLE_DEVICES=0 python quantize.py \ ``` Notes: -- Use `--export_format auto_round` for `MXFP4`, `MXFP8` data type and do inference as [below](#mxfp4--mxfp8) +- Use `--export_format auto_round` for `MXFP4`, `MXFP8` data type and do inference as below. - Use `--export_format llm_compressor` for `NVFP4` data type since public vLLM supports it. - Use `--export_format fake` for `uNVFP4` data type since it's not fully supported. - Setting `--quant_lm_head` applies `--dtype` for the lm_head layer. @@ -87,7 +87,6 @@ AutoRound helps improve the accuracy, `iters` and `nsamples` is higher than defa CUDA_VISIBLE_DEVICES=0 bash run_quant.sh --topology=Llama-3.1-8B --dtype=mxfp8 --input_model=/models/Meta-Llama-3.1-8B-Instruct --output_model=Llama-3.1-8B-MXFP8 ``` - #### Llama 3.1 8B MXFP4 (Mixed with MXFP8, Target_bits=7.8) ```bash @@ -119,7 +118,7 @@ Note: If you got OOM issue, either increasing `CUDA_VISIBLE_DEVICES` or reducing ## Inference -### MXFP4 / MXFP8 +### MXFP4 & MXFP8 - Both pure MXFP4/MXFP8 and mix-precision model generated by target bits are supported. diff --git a/neural_compressor/common/base_config.py b/neural_compressor/common/base_config.py index 17ccd7f6457..4531f0115ff 100644 --- a/neural_compressor/common/base_config.py +++ b/neural_compressor/common/base_config.py @@ -190,6 +190,7 @@ class BaseConfig(ABC): name = BASE_CONFIG params_list = [] _is_initialized = False + non_tunable_params = ["white_list"] def __init__(self, white_list: Optional[List[OP_NAME_OR_MODULE_TYPE]] = DEFAULT_WHITE_LIST) -> None: """Initialize the BaseConfig. 
diff --git a/neural_compressor/torch/algorithms/weight_only/autoround.py b/neural_compressor/torch/algorithms/weight_only/autoround.py index 860358f9224..edaac306491 100644 --- a/neural_compressor/torch/algorithms/weight_only/autoround.py +++ b/neural_compressor/torch/algorithms/weight_only/autoround.py @@ -370,21 +370,18 @@ def convert(self, model: torch.nn.Module, *args, **kwargs): model = rounder.model model.autoround_config = rounder.layer_config + self.accelerator.empty_cache() dump_model_op_stats(rounder.layer_config) if self.export_format in ["auto_round", "llm_compressor"]: # the directly returned model is QuantLinear, which is used for packing. try: - del model - self.accelerator.empty_cache() logger.info(f"Quantization is done, reloading model from saved directory({self.output_dir})...") import transformers # pylint: disable=E0401 model = transformers.AutoModelForCausalLM.from_pretrained(self.output_dir) except: pass - else: - self.accelerator.empty_cache() return model From 54f87bba682a1085b9b8f1566567527390dd1946 Mon Sep 17 00:00:00 2001 From: "He, Xin3" Date: Fri, 28 Nov 2025 03:06:42 -0500 Subject: [PATCH 10/10] fix CI Signed-off-by: He, Xin3 --- neural_compressor/torch/quantization/config.py | 1 + 1 file changed, 1 insertion(+) diff --git a/neural_compressor/torch/quantization/config.py b/neural_compressor/torch/quantization/config.py index 5a86ddd4fb0..be6682c737c 100644 --- a/neural_compressor/torch/quantization/config.py +++ b/neural_compressor/torch/quantization/config.py @@ -1794,6 +1794,7 @@ def __init__( self.observer = observer self.mod_dict = mod_dict self._json_file = None + self.measure_exclude = measure_exclude self.fake_quant = str(fake_quant) self.use_qdq = str(use_qdq) self.scale_format = scale_format