diff --git a/examples/.config/model_params_pytorch_3x.json b/examples/.config/model_params_pytorch_3x.json
index efee3a4085f..0d75e7b3902 100644
--- a/examples/.config/model_params_pytorch_3x.json
+++ b/examples/.config/model_params_pytorch_3x.json
@@ -143,6 +143,14 @@
       "main_script": "run_clm_no_trainer.py",
       "batch_size": 1
     },
+    "phi3_vlm_128k_autoround_int4":{
+      "model_src_dir": "multimodal-modeling/quantization/auto_round",
+      "dataset_location": "",
+      "input_model": "",
+      "main_script": "mllm.py",
+      "batch_size": 8,
+      "iters": 50
+    },
     "gpt_j_ipex":{
       "model_src_dir": "nlp/huggingface_models/language-modeling/quantization/static_quant/ipex",
       "dataset_location": "",
diff --git a/examples/3.x_api/pytorch/multimodal-modeling/quantization/auto_round/mllm.py b/examples/3.x_api/pytorch/multimodal-modeling/quantization/auto_round/mllm.py
index 5881b45c7b0..84f4976bb31 100644
--- a/examples/3.x_api/pytorch/multimodal-modeling/quantization/auto_round/mllm.py
+++ b/examples/3.x_api/pytorch/multimodal-modeling/quantization/auto_round/mllm.py
@@ -1,4 +1,4 @@
-# Copyright (c) 2024 Intel Corporation
+# Copyright (c) 2025 Intel Corporation
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
@@ -22,35 +22,7 @@
 os.environ["CUBLAS_WORKSPACE_CONFIG"] = ":4096:8"
 torch.use_deterministic_algorithms(True, warn_only=True)

-from transformers import AutoModelForCausalLM, AutoTokenizer, AutoConfig, AutoProcessor
-
-from neural_compressor.torch.utils.utility import (get_multimodal_block_names,
-                                                   get_layer_names_in_block,
-                                                   detect_device,
-                                                   find_matching_blocks,
-                                                   to_device,
-                                                   to_dtype
-                                                   )
-from neural_compressor.torch.quantization import (AutoRoundConfig,
-                                                  prepare,
-                                                  convert,
-                                                  load)
-
-def set_nontext_module_config(model, to_quant_block_names, quant_config):
-    all_block_list = get_multimodal_block_names(model, quant_vision=True)
-    all_block_set = set(tuple(block) for block in all_block_list)
-    quant_block_set = set(tuple(block) for block in to_quant_block_names)
-    set_to_full_prec = list(all_block_set - quant_block_set)
-    set_to_full_prec = get_layer_names_in_block(model, to_quant_block_names=set_to_full_prec)
-    for name in set_to_full_prec:
-        quant_config.set_local(name, AutoRoundConfig(dtype="fp32"))
-
-    # skip layers not in blocks
-    quant_config.set_local("model.vision_embed_tokens.img_projection*", AutoRoundConfig(dtype="fp32"))
-    quant_config.set_local("transformer.visual.attn_pool.*_proj", AutoRoundConfig(dtype="fp32"))
-    quant_config.set_local("model.mm_projector*", AutoRoundConfig(dtype="fp32"))
-    quant_config.set_local("multi_modal_projector", AutoRoundConfig(dtype="fp32"))
-    quant_config.set_local("visual.merger", AutoRoundConfig(dtype="fp32"))
+from neural_compressor.transformers import AutoModelForCausalLM, AutoRoundConfig


 @torch.no_grad()
@@ -116,7 +88,7 @@ def __init__(self, *args, **kwargs):
         self.add_argument("--low_gpu_mem_usage", action='store_true',
                           help="offload intermediate features to cpu")

-        self.add_argument("--export_format", default="auto_round:gptq", type=str,
+        self.add_argument("--export_format", default="itrex", type=str,
                           help="the format to save the model"
                           )

@@ -250,320 +222,88 @@ def tune(args):
         devices = args.device.replace(" ", "").split(',')
         use_auto_mapping = True

-    device_str = detect_device(devices[0])
-
-    torch_dtype = "auto"
-    if "hpu" in device_str:
-        torch_dtype = torch.bfloat16
-
-    # load_model
-    processor, image_processor = None, None
-    if "llava" in model_name:
-        from llava.model.builder import load_pretrained_model  # pylint: disable=E0401
-        tokenizer, model, image_processor, _ = load_pretrained_model(
-            model_name, model_base=None, model_name=model_name,
-            torch_dtype=torch_dtype)
-        model_type = "llava"
-    else:
-        config = AutoConfig.from_pretrained(model_name, trust_remote_code=not args.disable_trust_remote_code)
-        tokenizer = AutoTokenizer.from_pretrained(model_name)
-        processor = AutoProcessor.from_pretrained(model_name, trust_remote_code=not args.disable_trust_remote_code)
-        model_type = config.model_type
-        if "qwen2_vl" in model_type:
-            from transformers import Qwen2VLForConditionalGeneration
-            cls = Qwen2VLForConditionalGeneration
-        elif "mllama" in model_type:
-            from transformers import MllamaForConditionalGeneration
-            cls = MllamaForConditionalGeneration
-        else:
-            cls = AutoModelForCausalLM
-
-        kargs = {}
-        if "phi3_v" in model_type:
-            kargs['attn_implementation'] = 'eager'
-        model = cls.from_pretrained(
-            model_name, trust_remote_code=not args.disable_trust_remote_code, torch_dtype=torch_dtype,
-            device_map="auto" if use_auto_mapping else None, **kargs)
-
-    if "cogvlm2" in model_name:
-        model.config.model_type = "cogvlm2"
-
-    from neural_compressor.torch.algorithms.weight_only.autoround import get_mllm_dataloader
-
-    model = model.eval()
-
-    if args.model_dtype != None:
-        try:
-            if args.model_dtype == "float16" or args.model_dtype == "fp16":
-                model = model.to(torch.float16)
-            elif args.model_dtype == "bfloat16" or args.model_dtype == "bfp16" or args.model_dtype == "bf16":
-                model = model.to(torch.bfloat16)
-            elif args.model_dtype == "float32" or args.model_dtype == "fp32":
-                model = model.to(torch.float32)
-        except:
-            raise ("please use more device to fit the device or just use one device")
-            exit()
-
-    all_blocks = get_multimodal_block_names(model, args.quant_nontext_module)
-    to_quant_block_names = find_matching_blocks(model, all_blocks, args.to_quant_block_names)
-
-    # TODO check dataset?
-    dataloader, template, truncation, batch_size, gradient_accumulate_steps, seqlen, nsamples = get_mllm_dataloader(
-        model=model,
-        tokenizer=tokenizer,
-        template=None,
-        dataset=args.dataset,
-        extra_data_dir=args.extra_data_dir,
-        seqlen=args.seqlen,
-        batch_size=args.batch_size,
-        split=None,
-        apply_template=None,
-        truncation=args.truncation,
-        seed=args.seed,
-        nsamples=args.nsamples,
-        gradient_accumulate_steps=args.gradient_accumulate_steps,
-        quant_nontext_module=args.quant_nontext_module,
-        processor=processor,
-        image_processor=image_processor,
-    )
-    quant_config = AutoRoundConfig(
-        is_mllm=True,
+    woq_config = AutoRoundConfig(
+        is_vlm=True,
         bits=args.bits,
-        use_sym=not args.asym,
+        sym=not args.asym,
         group_size=args.group_size,
-        nsamples=nsamples,
-        batch_size=batch_size,
+        nsamples=args.nsamples,
+        batch_size=args.batch_size,
         iters=args.iters,
-        seqlen=seqlen,
+        seqlen=args.seqlen,
         quant_nontext_module=args.quant_nontext_module,
-        truncation=truncation,
-        gradient_accumulate_steps=gradient_accumulate_steps,
+        truncation=args.truncation,
+        gradient_accumulate_steps=args.gradient_accumulate_steps,
         nblocks=args.nblocks,
         lr=args.lr,
         minmax_lr=args.minmax_lr,
         enable_quanted_input=not args.disable_quanted_input,
-        seed=args.seed,
         scale_dtype=args.scale_dtype,
         enable_minmax_tuning=not args.disable_minmax_tuning,
         act_bits=args.act_bits,
-        to_quant_block_names=to_quant_block_names,
         export_format=args.export_format
     )
-
-    # set_nontext_module_config(model, to_quant_block_names, quant_config)
-
-    format = args.export_format
-    if args.fp_layers != "":
-        fp_layers = args.fp_layers.replace(" ", "").split(",")
-        for n, m in model.named_modules():
-            if not isinstance(m, (torch.nn.Linear, transformers.modeling_utils.Conv1D)):
-                continue
-            for fp_layer in fp_layers:
-                if fp_layer in n:
-                    quant_config.set_local(n, AutoRoundConfig(dtype="fp32"))
-                    print(
-                        f"{n} will not be quantized.")
-
-    for n, m in model.named_modules():
-        if isinstance(m, torch.nn.Linear) or isinstance(m, transformers.modeling_utils.Conv1D):
-            if m.weight.shape[0] % 32 != 0 or m.weight.shape[1] % 32 != 0:
-                quant_config.set_local(n, AutoRoundConfig(dtype="fp32"))
-                print(
-                    f"{n} will not be quantized due to its shape not being divisible by 32,"
-                    " resulting in an exporting issue to autogptq")
-
-    lm_head_layer_name = "lm_head"
-    for n, _ in model.named_modules():
-        lm_head_layer_name = n
-
-    if args.quant_lm_head:
-        config = AutoConfig.from_pretrained(model_name, trust_remote_code=not args.disable_trust_remote_code)
-        if config.tie_word_embeddings and hasattr(model, "_tied_weights_keys"):
-            tied_keys = model._tied_weights_keys
-            for item in tied_keys:
-                if lm_head_layer_name in item:  ##TODO extend to encoder-decoder layer, seq classification model
-                    args.quant_lm_head = False
-                    print(
-                        f"warning, disable quant_lm_head as quantizing lm_head with tied weights has not been "
-                        f"supported currently")
-                    break
-
-    if not args.quant_lm_head:
-        quant_config.set_local(lm_head_layer_name, AutoRoundConfig(dtype="fp32"))
-    else:
-        if "auto_round" not in format:
-            raise ValueError(
-                f"{format} is not supported for lm-head quantization, please change to {auto_round_formats}")
-
-    if args.quant_lm_head and args.low_gpu_mem_usage:
-        print(f"warning, low_gpu_mem_usage=False is strongly recommended if the whole model could be loaded to "
-              f"gpu")
-
-    if "--truncation" not in sys.argv:
-        args.truncation = None
-
-    user_model = prepare(model=model, quant_config=quant_config)
-    run_fn(user_model, dataloader)
-    user_model = convert(user_model)
+
+    model = AutoModelForCausalLM.from_pretrained(model_name, quantization_config=woq_config, trust_remote_code=True, attn_implementation='eager')
     model.eval()
-    if args.device != "cpu":
-        torch.cuda.empty_cache()
-
-    from neural_compressor.torch.utils import (SaveLoadFormat,)
-    kargs = {}
-    if "phi3_v" in model_type:
-        kargs['safe_serialization'] = 'False'
-    user_model.save(args.output_dir, format=SaveLoadFormat.HUGGINGFACE, **kargs)
-    if tokenizer is not None:
-        tokenizer.save_pretrained(args.output_dir)
-    if processor is not None and hasattr(processor, 'chat_template'):  # Avoiding phi-3.5-vision save errors
-        processor.save_pretrained(args.output_dir)
-
-
-
-def setup_mllm_eval_parser():
-    parser = argparse.ArgumentParser()
-    parser.add_argument("--model", "--model_name", "--model_name_or_path",
-                        help="model name or path")
-    parser.add_argument("--tasks", type=str,
-                        default="MMBench_DEV_EN_V11,ScienceQA_VAL,TextVQA_VAL,POPE",
-                        help="eval tasks for VLMEvalKit.")
-    # Args that only apply to Video Dataset
-    parser.add_argument("--nframe", type=int, default=8,
-                        help="the number of frames to sample from a video,"
-                             " only applicable to the evaluation of video benchmarks.")
-    parser.add_argument("--pack", action='store_true',
-                        help="a video may associate with multiple questions, if pack==True,"
-                             " will ask all questions for a video in a single")
-    parser.add_argument("--fps", type=float, default=-1,
-                        help="set the fps for a video.")
-    # Work Dir
-    # Infer + Eval or Infer Only
-    parser.add_argument("--mode", type=str, default='all', choices=['all', 'infer'],
-                        help="when mode set to 'all', will perform both inference and evaluation;"
-                             " when set to 'infer' will only perform the inference.")
-    parser.add_argument('--eval_data_dir', type=str, default=None,
-                        help='path for VLMEvalKit to store the eval data. Default will store in ~/LMUData')
-    # API Kwargs, Apply to API VLMs and Judge API LLMs
-    parser.add_argument('--retry', type=int, default=None, help='retry numbers for API VLMs')
-    # Explicitly Set the Judge Model
-    parser.add_argument('--judge', type=str, default=None,
-                        help="whether is a judge model.")
-    # Logging Utils
-    parser.add_argument('--verbose', action='store_true',
-                        help="whether to display verbose information.")
-    # Configuration for Resume
-    # Ignore: will not rerun failed VLM inference
-    parser.add_argument('--ignore', action='store_true',
-                        help='ignore failed indices. ')
-    # Rerun: will remove all evaluation temp files
-    parser.add_argument('--rerun', action='store_true',
-                        help="if true, will remove all evaluation temp files and rerun.")
-    parser.add_argument("--output_dir", default="./eval_result", type=str,
-                        help="the directory to save quantized model")
-    args = parser.parse_args()
-    return args
-
-
-def setup_lmms_parser():
-    parser = argparse.ArgumentParser()
-    parser.add_argument("--model", "--model_name", "--model_name_or_path",
-                        help="model name or path")
-    parser.add_argument(
-        "--tasks",
-        default="pope,textvqa_val,scienceqa,mmbench_en",
-        help="To get full list of tasks, use the command lmms-eval --tasks list",
-    )
-    parser.add_argument("--output_dir", default="./eval_result", type=str,
-                        help="the directory to save quantized model")
-    parser.add_argument(
-        "--num_fewshot",
-        type=int,
-        default=None,
-        help="Number of examples in few-shot context",
-    )
-    parser.add_argument(
-        "--batch_size",
-        "-b",
-        type=str,
-        default=1,
-        metavar="auto|auto:N|N",
-        help="Acceptable values are 'auto', 'auto:N' or N, where N is an integer. Default 1.",
-    )
-    parser.add_argument(
-        "--max_batch_size",
-        type=int,
-        default=None,
-        metavar="N",
-        help="Maximal batch size to try with --batch_size auto.",
-    )
-    parser.add_argument(
-        "--device",
-        type=str,
-        default=None,
-        help="Device to use (e.g. cuda, cuda:0, cpu)",
-    )
-    parser.add_argument(
-        "--limit",
-        type=float,
-        default=None,
-        help="Limit the number of examples per task. "
-             "If <1, limit is a percentage of the total"
-             " number of examples.",
-    )
-    args = parser.parse_args()
-    return args
-
-
-def mllm_eval(args):
-    if isinstance(args.tasks, str):
-        args.tasks = args.tasks.replace(' ', '').split(',')
-    from neural_compressor.torch.algorithms.weight_only.autoround import mllm_eval
-    mllm_eval(
-        args.model,
-        work_dir=args.output_dir,
-        data_store_dir=args.eval_data_dir,
-        dataset=args.tasks,
-        pack=args.pack,
-        fps=args.fps,
-        nframe=args.nframe,
-        rerun=args.rerun,
-        judge=args.judge,
-        verbose=args.verbose,
-        mode=args.mode,
-        ignore=args.ignore
-    )
-
-def lmms_eval(args):
-    from neural_compressor.torch.algorithms.weight_only.autoround import lmms_eval
-    results = lmms_eval(
-        model=args.model,
-        tasks=args.tasks,
-        output_dir=args.output_dir,
-        num_fewshot=args.num_fewshot,
-        limit=args.limit,
-        batch_size=args.batch_size,
-        max_batch_size=args.max_batch_size,
-        device=args.device,
-        use_cache=None,
-        apply_chat_template=False,
-    )
-    return results
-
+    # disable safetensors serialization to avoid phi3-v save errors
+    model.save_pretrained(args.output_dir, safe_serialization=False)

 if __name__ == '__main__':
     if "--quantize" in sys.argv:
         args = setup_parser()
         tune(args)
-    elif "--accuracy" in sys.argv:
-        sys.argv.remove("--accuracy")
-        from neural_compressor.torch.quantization import load
-        if "--lmms" in sys.argv:
-            sys.argv.remove("--lmms")
-            args = setup_lmms_parser()
-            lmms_eval(args)
-        else:
-            if "--mllm_eval" in sys.argv:
-                sys.argv.remove("--mllm_eval")
-                args = setup_mllm_eval_parser()
-                mllm_eval(args)
+    elif "--inference" in sys.argv:
+        sys.argv.remove("--inference")
+        from transformers import AutoProcessor
+        import requests
+        from PIL import Image
+
+        args = setup_parser()
+        model_name = args.model
+        if model_name[-1] == "/":
+            model_name = model_name[:-1]
+
+        # Preparation for inference
+        model = AutoModelForCausalLM.from_pretrained(
+            args.output_dir,
+            trust_remote_code=True,
+            attn_implementation='eager',
+            torch_dtype=torch.float16,
+        )
+        processor = AutoProcessor.from_pretrained(model_name, trust_remote_code=True)
+        image_url = "https://qianwen-res.oss-cn-beijing.aliyuncs.com/Qwen-VL/assets/demo.jpeg"
+
+        content = "Describe this image."
+
+        messages = [
+            {"role": "user",
+             "content": "<|image_1|>\n"+content},
+        ]
+
+        prompt = processor.tokenizer.apply_chat_template(
+            messages,
+            tokenize=False,
+            add_generation_prompt=True
+        )
+        image_inputs = Image.open(requests.get(image_url, stream=True).raw)
+        inputs = processor(prompt, image_inputs, return_tensors="pt").to(model.device)
+        generation_args = {
+            "max_new_tokens": 50,
+            "temperature": 0.0,
+            "do_sample": False,
+        }
+
+        generate_ids = model.generate(**inputs,
+                                      eos_token_id=processor.tokenizer.eos_token_id,
+                                      **generation_args
+                                      )
+
+        # remove input tokens
+        generate_ids = generate_ids[:, inputs['input_ids'].shape[1]:]
+        response = processor.batch_decode(generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0]
+
+        print(response)
+
diff --git a/examples/3.x_api/pytorch/multimodal-modeling/quantization/auto_round/requirements.txt b/examples/3.x_api/pytorch/multimodal-modeling/quantization/auto_round/requirements.txt
new file mode 100644
index 00000000000..733f9388bfc
--- /dev/null
+++ b/examples/3.x_api/pytorch/multimodal-modeling/quantization/auto_round/requirements.txt
@@ -0,0 +1,18 @@
+transformers==4.47.0
+torch
+tiktoken
+transformers_stream_generator
+peft
+sentencepiece
+einops
+accelerate
+datasets
+protobuf
+auto-gptq
+openpyxl
+wandb
+py-cpuinfo
+Pillow
+torchvision
+setuptools
+auto-round
diff --git a/examples/3.x_api/pytorch/multimodal-modeling/quantization/auto_round/run_benchmark.sh b/examples/3.x_api/pytorch/multimodal-modeling/quantization/auto_round/run_benchmark.sh
new file mode 100644
index 00000000000..24e52d179ca
--- /dev/null
+++ b/examples/3.x_api/pytorch/multimodal-modeling/quantization/auto_round/run_benchmark.sh
@@ -0,0 +1,86 @@
+#!/bin/bash
+set -x
+
+function main {
+
+  init_params "$@"
+  run_benchmark
+
+}
+
+# init params
+function init_params {
+  iters=50
+  batch_size=8
+  tuned_checkpoint=saved_results
+  echo ${max_eval_samples}
+  for var in "$@"
+  do
+    case $var in
+      --topology=*)
+          topology=$(echo $var |cut -f2 -d=)
+      ;;
+      --dataset_location=*)
+          dataset_location=$(echo $var |cut -f2 -d=)
+      ;;
+      --input_model=*)
+          input_model=$(echo $var |cut -f2 -d=)
+      ;;
+      --mode=*)
+          mode=$(echo $var |cut -f2 -d=)
+      ;;
+      --batch_size=*)
+          batch_size=$(echo $var |cut -f2 -d=)
+      ;;
+      --iters=*)
+          iters=$(echo ${var} |cut -f2 -d=)
+      ;;
+      --int8=*)
+          int8=$(echo ${var} |cut -f2 -d=)
+      ;;
+      --config=*)
+          tuned_checkpoint=$(echo $var |cut -f2 -d=)
+      ;;
+      *)
+          echo "Error: No such parameter: ${var}"
+          exit 1
+      ;;
+    esac
+  done
+
+}
+
+
+# run_benchmark
+function run_benchmark {
+    extra_cmd=''
+
+    if [[ ${mode} == "accuracy" ]]; then
+        mode_cmd=" --accuracy "
+    elif [[ ${mode} == "performance" ]]; then
+        mode_cmd=" --inference --iters "${iters}
+    else
+        echo "Error: No such mode: ${mode}"
+        exit 1
+    fi
+
+    if [[ ${int8} == "true" ]]; then
+        extra_cmd=$extra_cmd" --load"
+    fi
+    echo $extra_cmd
+
+    if [ "${topology}" = "phi3_vlm_128k_autoround_int4" ]; then
+        model_name_or_path="microsoft/Phi-3-vision-128k-instruct"
+    fi
+
+    if [[ ${mode} == "performance" ]]; then
+        python -u mllm.py \
+            --model ${model_name_or_path} \
+            --output_dir ${tuned_checkpoint} \
+            --batch_size ${batch_size} \
+            ${extra_cmd} ${mode_cmd}
+    fi
+
+}
+
+main "$@"
diff --git a/examples/3.x_api/pytorch/multimodal-modeling/quantization/auto_round/run_quant.sh b/examples/3.x_api/pytorch/multimodal-modeling/quantization/auto_round/run_quant.sh
new file mode 100644
index 00000000000..e24e544ed75
--- /dev/null
+++ b/examples/3.x_api/pytorch/multimodal-modeling/quantization/auto_round/run_quant.sh
@@ -0,0 +1,61 @@
+#!/bin/bash
+set -x
+
+function main {
+
+  init_params "$@"
+  run_tuning
+
+}
+
+# init params
+function init_params {
+  for var in "$@"
+  do
+    case $var in
+      --topology=*)
+          topology=$(echo $var |cut -f2 -d=)
+      ;;
+      --iters=*)
+          iters=$(echo $var |cut -f2 -d=)
+      ;;
+      --dataset_location=*)
+          dataset_location=$(echo $var |cut -f2 -d=)
+      ;;
+      --input_model=*)
+          input_model=$(echo $var |cut -f2 -d=)
+      ;;
+      --output_model=*)
+          tuned_checkpoint=$(echo $var |cut -f2 -d=)
+      ;;
+      *)
+          echo "Error: No such parameter: ${var}"
+          exit 1
+      ;;
+    esac
+  done
+
+}
+
+# run_tuning
+function run_tuning {
+    extra_cmd=''
+    batch_size=8
+    DATASET_NAME="NeelNanda/pile-10k"
+    tuned_checkpoint=${tuned_checkpoint:-"saved_results"}  # keep --output_model if it was provided
+
+    if [ "${topology}" = "phi3_vlm_128k_autoround_int4" ]; then
+        model_name_or_path="microsoft/Phi-3-vision-128k-instruct"
+    fi
+
+    python -u mllm.py \
+        --model ${model_name_or_path} \
+        --dataset ${DATASET_NAME} \
+        --quantize \
+        --iters ${iters} \
+        --output_dir ${tuned_checkpoint} \
+        --batch_size ${batch_size} \
+        ${extra_cmd}
+}
+
+main "$@"
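
For reference, an illustrative invocation of the two new scripts, assuming they are run from examples/3.x_api/pytorch/multimodal-modeling/quantization/auto_round; the topology name, iteration count, and batch size mirror the config entry added above, and saved_results is the scripts' default output directory:

    # quantize Phi-3-vision with AutoRound INT4 (mllm.py --quantize), then run the generation check (mllm.py --inference)
    bash run_quant.sh --topology=phi3_vlm_128k_autoround_int4 --iters=50
    bash run_benchmark.sh --topology=phi3_vlm_128k_autoround_int4 --mode=performance --batch_size=8 --config=saved_results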