From e2e6e5d77c8093a60b34ff289d139adbafeb803d Mon Sep 17 00:00:00 2001 From: changwangss Date: Sun, 17 Dec 2023 14:54:03 -0800 Subject: [PATCH 01/10] fix extension issue Signed-off-by: changwangss --- .../pytorch/code-generation/quantization/README.md | 12 ++++++------ .../code-generation/quantization/requirements.txt | 1 + .../code-generation/quantization/run_generation.py | 2 ++ .../text-generation/quantization/requirements.txt | 6 +++++- .../text-generation/quantization/run_benchmark.sh | 2 ++ .../text-generation/quantization/run_tuning.sh | 2 ++ 6 files changed, 18 insertions(+), 7 deletions(-) diff --git a/examples/huggingface/pytorch/code-generation/quantization/README.md b/examples/huggingface/pytorch/code-generation/quantization/README.md index 353cf5134c4..19e27bab7a4 100644 --- a/examples/huggingface/pytorch/code-generation/quantization/README.md +++ b/examples/huggingface/pytorch/code-generation/quantization/README.md @@ -83,7 +83,7 @@ python run_generation.py \ --allow_code_execution \ --temperature 0.2 \ --do_sample \ - --tasks "humaneval" \ + --tasks "humaneval" # mixedprecision python run_generation.py \ --model bigcode/starcoder \ @@ -94,7 +94,7 @@ python run_generation.py \ --allow_code_execution \ --temperature 0.2 \ --do_sample \ - --tasks "humaneval" \ + --tasks "humaneval" # smoothquant # [alternative] --int8 is used for int8 only, --int8_bf16_mixed is used for int8 mixed bfloat16 precision. python run_generation.py \ @@ -108,7 +108,7 @@ python run_generation.py \ --allow_code_execution \ --temperature 0.2 \ --do_sample \ - --tasks "humaneval" \ + --tasks "humaneval" # weightonlyquant python run_generation.py \ --model bigcode/starcoder \ @@ -120,7 +120,7 @@ python run_generation.py \ --allow_code_execution \ --temperature 0.2 \ --do_sample \ - --tasks "humaneval" \ + --tasks "humaneval" # load_in_4bit python run_generation.py \ --model bigcode/starcoder \ @@ -131,7 +131,7 @@ python run_generation.py \ --allow_code_execution \ --temperature 0.2 \ --do_sample \ - --tasks "humaneval" \ + --tasks "humaneval" # load_in_8bit python run_generation.py \ --model bigcode/starcoder \ @@ -142,7 +142,7 @@ python run_generation.py \ --allow_code_execution \ --temperature 0.2 \ --do_sample \ - --tasks "humaneval" \ + --tasks "humaneval" ``` >Note: diff --git a/examples/huggingface/pytorch/code-generation/quantization/requirements.txt b/examples/huggingface/pytorch/code-generation/quantization/requirements.txt index b1c2ab59734..9697a3cd8c9 100644 --- a/examples/huggingface/pytorch/code-generation/quantization/requirements.txt +++ b/examples/huggingface/pytorch/code-generation/quantization/requirements.txt @@ -7,6 +7,7 @@ sentencepiece != 0.1.92 torch==2.1.0+cpu peft==0.6.2 transformers >= 4.35.0 +tiktoken #code_gen neural-compressor intel_extension_for_pytorch git+https://github.com/huggingface/optimum.git@927e94739447b13f7eefe085c8d3662654b6a11c diff --git a/examples/huggingface/pytorch/code-generation/quantization/run_generation.py b/examples/huggingface/pytorch/code-generation/quantization/run_generation.py index c6d3f46a0d4..0df4e3bdbfd 100644 --- a/examples/huggingface/pytorch/code-generation/quantization/run_generation.py +++ b/examples/huggingface/pytorch/code-generation/quantization/run_generation.py @@ -137,7 +137,9 @@ args.model, truncation_side="left", padding_side="right", + trust_remote_code=args.trust_remote_code ) + config = AutoConfig.from_pretrained( args.model, torchscript=True diff --git a/examples/huggingface/pytorch/text-generation/quantization/requirements.txt 
b/examples/huggingface/pytorch/text-generation/quantization/requirements.txt index 0539924e38e..32b37974a47 100644 --- a/examples/huggingface/pytorch/text-generation/quantization/requirements.txt +++ b/examples/huggingface/pytorch/text-generation/quantization/requirements.txt @@ -5,8 +5,12 @@ protobuf sentencepiece != 0.1.92 --extra-index-url https://download.pytorch.org/whl/cpu torch==2.1.0+cpu -transformers==4.34.1 +transformers==4.35.2 intel_extension_for_pytorch +bitsandbytes #baichuan +transformers_stream_generator +tiktoken #qwen +einops #qwen git+https://github.com/intel/neural-compressor.git git+https://github.com/huggingface/optimum-intel.git@f95dea1ae8966dee4d75d622e7b2468c514ba02d git+https://github.com/huggingface/optimum.git@927e94739447b13f7eefe085c8d3662654b6a11c diff --git a/examples/huggingface/pytorch/text-generation/quantization/run_benchmark.sh b/examples/huggingface/pytorch/text-generation/quantization/run_benchmark.sh index 1fc6e9bbb09..d91cf857332 100644 --- a/examples/huggingface/pytorch/text-generation/quantization/run_benchmark.sh +++ b/examples/huggingface/pytorch/text-generation/quantization/run_benchmark.sh @@ -146,8 +146,10 @@ function run_benchmark { model_name_or_path="Intel/neural-chat-7b-v3" elif [ "${topology}" = "phi_1b" ]; then model_name_or_path="susnato/phi-1_dev" + pip install transformers==4.36.1 elif [ "${topology}" = "phi_1_5b" ]; then model_name_or_path="susnato/phi-1_5_dev" + pip install transformers==4.36.1 fi if [[ ${int8} == "true" ]]; then diff --git a/examples/huggingface/pytorch/text-generation/quantization/run_tuning.sh b/examples/huggingface/pytorch/text-generation/quantization/run_tuning.sh index e11e657ac07..a4c36f00c55 100644 --- a/examples/huggingface/pytorch/text-generation/quantization/run_tuning.sh +++ b/examples/huggingface/pytorch/text-generation/quantization/run_tuning.sh @@ -193,12 +193,14 @@ function run_tuning { extra_cmd=$extra_cmd" --sq --alpha ${alpha}" extra_cmd=$extra_cmd" --output_dir ${tuned_checkpoint}" extra_cmd=$extra_cmd" --trust_remote_code True" + pip install transformers==4.36.1 elif [ "${topology}" = "phi_1_5b" ]; then alpha=0.5 model_name_or_path="susnato/phi-1_5_dev" extra_cmd=$extra_cmd" --sq --alpha ${alpha}" extra_cmd=$extra_cmd" --output_dir ${tuned_checkpoint}" extra_cmd=$extra_cmd" --trust_remote_code True" + pip install transformers==4.36.1 fi if [ ${script} = "run_generation.py" ];then From 4d2fee9b67065e29c72368f7022229f4c3c19c4a Mon Sep 17 00:00:00 2001 From: changwangss Date: Tue, 19 Dec 2023 02:28:24 -0800 Subject: [PATCH 02/10] fix qwen,falcon,baichuan Signed-off-by: changwangss --- .../quantization/run_generation.py | 12 ++++++------ .../quantization/run_benchmark.sh | 2 +- .../quantization/run_generation.py | 16 ++++++++-------- .../text-generation/quantization/run_tuning.sh | 2 +- .../llm/evaluation/lm_eval/models/huggingface.py | 3 ++- .../transformers/modeling/modeling_auto.py | 5 +++-- .../transformers/utils/utility.py | 5 +++-- 7 files changed, 24 insertions(+), 21 deletions(-) diff --git a/examples/huggingface/pytorch/code-generation/quantization/run_generation.py b/examples/huggingface/pytorch/code-generation/quantization/run_generation.py index 0df4e3bdbfd..3e082f0b146 100644 --- a/examples/huggingface/pytorch/code-generation/quantization/run_generation.py +++ b/examples/huggingface/pytorch/code-generation/quantization/run_generation.py @@ -28,7 +28,7 @@ "--model", nargs="?", default="bigcode/starcoderbase", const="bigcode/starcoderbase" ) parser.add_argument("--trust_remote_code",
default=False) -parser.add_argument("--revision", default="main", type=str) +parser.add_argument("--_commit_hash", default="main", type=str) parser.add_argument("--dataset", nargs="?", default="mbpp", const="mbpp") parser.add_argument("--dtype", type=str, default="int8") parser.add_argument( @@ -151,7 +151,7 @@ else False, # torchscript will force `return_dict=False` to avoid jit errors use_cache=True, # to use kv cache. trust_remote_code=args.trust_remote_code, - revision=args.revision, + _commit_hash=args._commit_hash, ) if not tokenizer.eos_token: if tokenizer.bos_token: @@ -208,7 +208,7 @@ args.model, quantization_config=quantization_config, trust_remote_code=args.trust_remote_code, - revision=args.revision, + _commit_hash=args._commit_hash, use_llm_runtime=False, ) elif args.load_in_4bit or args.load_in_8bit: @@ -217,7 +217,7 @@ args.model, load_in_4bit=args.load_in_4bit, load_in_8bit=args.load_in_8bit, - revision=args.revision, + _commit_hash=args._commit_hash, use_llm_runtime=False, ) elif not args.int8 and not args.int8_bf16_mixed: @@ -225,7 +225,7 @@ args.model, config=config, trust_remote_code=args.trust_remote_code, - revision=args.revision, + _commit_hash=args._commit_hash, use_llm_runtime=False, ) @@ -250,7 +250,7 @@ args.output_dir, file_name="best_model.pt", trust_remote_code=args.trust_remote_code, - revision=args.revision, + _commit_hash=args._commit_hash, ) if args.benchmark: diff --git a/examples/huggingface/pytorch/text-generation/quantization/run_benchmark.sh b/examples/huggingface/pytorch/text-generation/quantization/run_benchmark.sh index d91cf857332..23359b93c03 100644 --- a/examples/huggingface/pytorch/text-generation/quantization/run_benchmark.sh +++ b/examples/huggingface/pytorch/text-generation/quantization/run_benchmark.sh @@ -129,7 +129,7 @@ function run_benchmark { elif [ "${topology}" = "baichuan_13b" ]; then model_name_or_path="baichuan-inc/Baichuan-13B-Base" extra_cmd=$extra_cmd" --trust_remote_code True" - extra_cmd=$extra_cmd" --revision 14d5b0e204542744900f6fb52422c6d633bdcb00" + extra_cmd=$extra_cmd" --_commit_hash 14d5b0e204542744900f6fb52422c6d633bdcb00" pip install transformers==4.33 elif [ "${topology}" = "baichuan2_7b" ]; then model_name_or_path="baichuan-inc/Baichuan2-7B-Base" diff --git a/examples/huggingface/pytorch/text-generation/quantization/run_generation.py b/examples/huggingface/pytorch/text-generation/quantization/run_generation.py index 3b5150694d9..ed70804bc65 100644 --- a/examples/huggingface/pytorch/text-generation/quantization/run_generation.py +++ b/examples/huggingface/pytorch/text-generation/quantization/run_generation.py @@ -109,7 +109,7 @@ # ============AutoModel parameters============== parser.add_argument("--load_in_4bit", type=bool, default=False) parser.add_argument("--load_in_8bit", type=bool, default=False) -parser.add_argument("--revision", default="main", type=str) +parser.add_argument("--_commit_hash", default="main", type=str) parser.add_argument("--trust_remote_code", default=False) # ======================================= args = parser.parse_args() @@ -139,7 +139,7 @@ else False, # torchscript will force `return_dict=False` to avoid jit errors use_cache=True, # to use kv cache. 
trust_remote_code=args.trust_remote_code, - revision=args.revision, + _commit_hash=args._commit_hash, ) # chatglm @@ -225,7 +225,7 @@ args.model, quantization_config=quantization_config, trust_remote_code=args.trust_remote_code, - revision=args.revision, + _commit_hash=args._commit_hash, use_llm_runtime=False, ) elif args.load_in_4bit or args.load_in_8bit: @@ -234,7 +234,7 @@ args.model, load_in_4bit=args.load_in_4bit, load_in_8bit=args.load_in_8bit, - revision=args.revision, + _commit_hash=args._commit_hash, use_llm_runtime=False, ) elif (not args.int8 and not args.int8_bf16_mixed) or args.restore: @@ -242,7 +242,7 @@ user_model = AutoModelForCausalLM.from_pretrained( args.peft_model_id, trust_remote_code=args.trust_remote_code, - revision=args.revision, + _commit_hash=args._commit_hash, use_llm_runtime=False, ) else: @@ -250,7 +250,7 @@ args.model, config=config, trust_remote_code=args.trust_remote_code, - revision=args.revision, + _commit_hash=args._commit_hash, use_llm_runtime=False, ) @@ -359,8 +359,8 @@ + ",tokenizer=" + args.model + ",dtype=float32" - + ",revision=" - + args.revision + + ",_commit_hash=" + + args._commit_hash + ",trust_remote_code=" + str(args.trust_remote_code), user_model=user_model, diff --git a/examples/huggingface/pytorch/text-generation/quantization/run_tuning.sh b/examples/huggingface/pytorch/text-generation/quantization/run_tuning.sh index a4c36f00c55..c11194a95f7 100644 --- a/examples/huggingface/pytorch/text-generation/quantization/run_tuning.sh +++ b/examples/huggingface/pytorch/text-generation/quantization/run_tuning.sh @@ -159,7 +159,7 @@ function run_tuning { extra_cmd=$extra_cmd" --sq --alpha ${alpha}" extra_cmd=$extra_cmd" --output_dir ${tuned_checkpoint}" extra_cmd=$extra_cmd" --trust_remote_code True" - extra_cmd=$extra_cmd" --revision 14d5b0e204542744900f6fb52422c6d633bdcb00" + extra_cmd=$extra_cmd" --_commit_hash 14d5b0e204542744900f6fb52422c6d633bdcb00" pip install transformers==4.33 elif [ "${topology}" = "baichuan2_7b" ]; then alpha=0.85 diff --git a/intel_extension_for_transformers/llm/evaluation/lm_eval/models/huggingface.py b/intel_extension_for_transformers/llm/evaluation/lm_eval/models/huggingface.py index 119114e92a7..32a6da62f0e 100644 --- a/intel_extension_for_transformers/llm/evaluation/lm_eval/models/huggingface.py +++ b/intel_extension_for_transformers/llm/evaluation/lm_eval/models/huggingface.py @@ -115,7 +115,8 @@ def __init__( bnb_4bit_compute_dtype: Optional[Union[str, torch.dtype]] = None, bnb_4bit_use_double_quant: Optional[bool] = False, init_empty_weights: Optional[bool] = False, - model_format: Optional[str] = "torch" + model_format: Optional[str] = "torch", + _commit_hash: Optional[str] = None ): """Initializes a HuggingFace `AutoModel` and `AutoTokenizer` for evaluation. 
Args: diff --git a/intel_extension_for_transformers/transformers/modeling/modeling_auto.py b/intel_extension_for_transformers/transformers/modeling/modeling_auto.py index bea47fe6957..9aea240b6a9 100644 --- a/intel_extension_for_transformers/transformers/modeling/modeling_auto.py +++ b/intel_extension_for_transformers/transformers/modeling/modeling_auto.py @@ -429,12 +429,13 @@ def calib_func(model): "position_ids": inputs["position_ids"], "past_key_values": inputs["past_key_values"], } - elif model_type == "falcon": + elif model_type == "falcon" or model_type=="qwen": + example_inputs = inputs input_bs, input_len = inputs["input_ids"].shape outputs = model(inputs["input_ids"]) example_inputs["past_key_values"] = outputs[1] example_inputs["attention_mask"] = torch.ones( - input_bs, input_len + input_bs, input_len + 1 ) example_inputs["position_ids"] = ( inputs["position_ids"][:, -1:] + 1 diff --git a/intel_extension_for_transformers/transformers/utils/utility.py b/intel_extension_for_transformers/transformers/utils/utility.py index 569e1276ad1..b7791196b9a 100644 --- a/intel_extension_for_transformers/transformers/utils/utility.py +++ b/intel_extension_for_transformers/transformers/utils/utility.py @@ -196,7 +196,8 @@ def generate_dummy_past_key_values_for_opt_llm(config, input_bs, num_beams=1): "llama", "mistral", "chatglm", - "falcon" + "falcon", + "qwen" } def get_example_inputs(model_config, batch_size=1, tokenizer=None, num_beams=4): @@ -271,4 +272,4 @@ def recover_model_from_json(user_model, json_file_path, trust_remote_code=False) # pylint: disable=E0611 from neural_compressor.utils.pytorch import recover_model_from_json as inc_recover_model_from_json user_model = inc_recover_model_from_json(user_model, json_file_path, example_inputs) - return user_model \ No newline at end of file + return user_model From 4e174d255d219bd5ccd4509394fff3fd951e0876 Mon Sep 17 00:00:00 2001 From: changwangss Date: Tue, 19 Dec 2023 03:45:05 -0800 Subject: [PATCH 03/10] fix qwen Signed-off-by: changwangss --- .../pytorch/text-generation/quantization/run_benchmark.sh | 1 + .../pytorch/text-generation/quantization/run_tuning.sh | 3 ++- intel_extension_for_transformers/transformers/utils/utility.py | 1 - 3 files changed, 3 insertions(+), 2 deletions(-) diff --git a/examples/huggingface/pytorch/text-generation/quantization/run_benchmark.sh b/examples/huggingface/pytorch/text-generation/quantization/run_benchmark.sh index 23359b93c03..1b500831866 100644 --- a/examples/huggingface/pytorch/text-generation/quantization/run_benchmark.sh +++ b/examples/huggingface/pytorch/text-generation/quantization/run_benchmark.sh @@ -142,6 +142,7 @@ function run_benchmark { elif [ "${topology}" = "qwen_7b" ]; then model_name_or_path="Qwen/Qwen-7B" extra_cmd=$extra_cmd" --trust_remote_code True" + pip install transformers==4.33 elif [ "${topology}" = "mistral_7b" ]; then model_name_or_path="Intel/neural-chat-7b-v3" elif [ "${topology}" = "phi_1b" ]; then diff --git a/examples/huggingface/pytorch/text-generation/quantization/run_tuning.sh b/examples/huggingface/pytorch/text-generation/quantization/run_tuning.sh index c11194a95f7..8ab1a70c15f 100644 --- a/examples/huggingface/pytorch/text-generation/quantization/run_tuning.sh +++ b/examples/huggingface/pytorch/text-generation/quantization/run_tuning.sh @@ -145,7 +145,7 @@ function run_tuning { model_name_or_path="tiiuae/falcon-7b-instruct" extra_cmd=$extra_cmd" --sq --alpha ${alpha}" extra_cmd=$extra_cmd" --output_dir ${tuned_checkpoint}" - pip install transformers==4.33 + pip 
install transformers==4.33 elif [ "${topology}" = "baichuan_7b" ]; then alpha=0.85 model_name_or_path="baichuan-inc/Baichuan-7B" @@ -181,6 +181,7 @@ function run_tuning { extra_cmd=$extra_cmd" --sq --alpha ${alpha}" extra_cmd=$extra_cmd" --output_dir ${tuned_checkpoint}" extra_cmd=$extra_cmd" --trust_remote_code True" + pip install transformers==4.33 elif [ "${topology}" = "mistral_7b" ]; then alpha=0.8 model_name_or_path="Intel/neural-chat-7b-v3" diff --git a/intel_extension_for_transformers/transformers/utils/utility.py b/intel_extension_for_transformers/transformers/utils/utility.py index b7791196b9a..b9dc00251a4 100644 --- a/intel_extension_for_transformers/transformers/utils/utility.py +++ b/intel_extension_for_transformers/transformers/utils/utility.py @@ -196,7 +196,6 @@ def generate_dummy_past_key_values_for_opt_llm(config, input_bs, num_beams=1): "llama", "mistral", "chatglm", - "falcon", "qwen" } From 7f7fc988d7073f4a379594652902c6d608c83c41 Mon Sep 17 00:00:00 2001 From: changwangss Date: Tue, 19 Dec 2023 12:56:01 -0800 Subject: [PATCH 04/10] fix falcon and offline validate with transformers 4.33 Signed-off-by: changwangss --- .../text-generation/quantization/run_benchmark.sh | 2 +- .../pytorch/text-generation/quantization/run_tuning.sh | 2 +- .../llm/evaluation/models.py | 5 +---- .../transformers/modeling/modeling_auto.py | 9 ++++----- 4 files changed, 7 insertions(+), 11 deletions(-) diff --git a/examples/huggingface/pytorch/text-generation/quantization/run_benchmark.sh b/examples/huggingface/pytorch/text-generation/quantization/run_benchmark.sh index 1b500831866..139e372c09c 100644 --- a/examples/huggingface/pytorch/text-generation/quantization/run_benchmark.sh +++ b/examples/huggingface/pytorch/text-generation/quantization/run_benchmark.sh @@ -142,7 +142,7 @@ function run_benchmark { elif [ "${topology}" = "qwen_7b" ]; then model_name_or_path="Qwen/Qwen-7B" extra_cmd=$extra_cmd" --trust_remote_code True" - pip install transformers==4.33 + pip install transformers==4.33.3 elif [ "${topology}" = "mistral_7b" ]; then model_name_or_path="Intel/neural-chat-7b-v3" elif [ "${topology}" = "phi_1b" ]; then diff --git a/examples/huggingface/pytorch/text-generation/quantization/run_tuning.sh b/examples/huggingface/pytorch/text-generation/quantization/run_tuning.sh index 8ab1a70c15f..332b99c7917 100644 --- a/examples/huggingface/pytorch/text-generation/quantization/run_tuning.sh +++ b/examples/huggingface/pytorch/text-generation/quantization/run_tuning.sh @@ -145,7 +145,7 @@ function run_tuning { model_name_or_path="tiiuae/falcon-7b-instruct" extra_cmd=$extra_cmd" --sq --alpha ${alpha}" extra_cmd=$extra_cmd" --output_dir ${tuned_checkpoint}" - pip install transformers==4.33 + pip install transformers==4.33.3 elif [ "${topology}" = "baichuan_7b" ]; then alpha=0.85 model_name_or_path="baichuan-inc/Baichuan-7B" diff --git a/intel_extension_for_transformers/llm/evaluation/models.py b/intel_extension_for_transformers/llm/evaluation/models.py index 1cb03b7108b..02fdc723538 100644 --- a/intel_extension_for_transformers/llm/evaluation/models.py +++ b/intel_extension_for_transformers/llm/evaluation/models.py @@ -166,9 +166,7 @@ def forward( input_bs, input_len = input_ids.shape if self.use_cache and past_key_values is None: if model_type in IPEX_OPT_LLM_SUPPORTED: - if (model_type == "falcon" and transformers.__version__ > "4.33") or ( - model_type == "llama" and transformers.__version__ >= "4.36" - ): + if model_type == "llama" and transformers.__version__ >= "4.36": past_key_values = 
generate_dummy_past_key_values( config=self.config, input_bs=input_bs ) @@ -195,7 +193,6 @@ def forward( inputs["position_ids"] = position_ids else: inputs["position_ids"] = torch.arange(input_len).repeat(input_bs, 1) - outputs = self.model(**inputs) if isinstance(outputs, (list, tuple)): diff --git a/intel_extension_for_transformers/transformers/modeling/modeling_auto.py b/intel_extension_for_transformers/transformers/modeling/modeling_auto.py index 9aea240b6a9..4fb630a6eeb 100644 --- a/intel_extension_for_transformers/transformers/modeling/modeling_auto.py +++ b/intel_extension_for_transformers/transformers/modeling/modeling_auto.py @@ -228,11 +228,10 @@ def from_pretrained(cls, pretrained_model_name_or_path, *model_args, **kwargs): model = model.float() model.eval() model_type = model.config.model_type.replace("_", "-") - if "falcon" in model_type and transformers.__version__ > "4.33": - ipex.nn.utils._model_convert.replace_customized_linear_with_linear( - model.eval() + if "falcon" in model_type: + logger.warning( + "Please use transformers 4.33.3 if you would like to apply smoothquant to Falcon." ) - quantization_config.ipex_opt_llm = False if "llama" in model_type and transformers.__version__ >= "4.36.0": quantization_config.ipex_opt_llm = False logger.info("Applying SmoothQuant.") @@ -429,7 +428,7 @@ def calib_func(model): "position_ids": inputs["position_ids"], "past_key_values": inputs["past_key_values"], } - elif model_type == "falcon" or model_type=="qwen": + elif model_type == "falcon" or model_type == "qwen": example_inputs = inputs input_bs, input_len = inputs["input_ids"].shape outputs = model(inputs["input_ids"]) From b93474762c0e60eb7307330ba39ba38e81f20552 Mon Sep 17 00:00:00 2001 From: changwangss Date: Wed, 20 Dec 2023 15:14:43 -0800 Subject: [PATCH 05/10] fix bloom generate and qwen version Signed-off-by: changwangss --- .../quantization/run_benchmark.sh | 2 +- .../quantization/run_generation.py | 1 - .../quantization/run_tuning.sh | 2 +- .../transformers/modeling/modeling_auto.py | 17 +---- .../transformers/utils/utility.py | 74 ++++++++++++++++++- 5 files changed, 77 insertions(+), 19 deletions(-) diff --git a/examples/huggingface/pytorch/text-generation/quantization/run_benchmark.sh b/examples/huggingface/pytorch/text-generation/quantization/run_benchmark.sh index 139e372c09c..df6a6dc4982 100644 --- a/examples/huggingface/pytorch/text-generation/quantization/run_benchmark.sh +++ b/examples/huggingface/pytorch/text-generation/quantization/run_benchmark.sh @@ -142,7 +142,7 @@ function run_benchmark { elif [ "${topology}" = "qwen_7b" ]; then model_name_or_path="Qwen/Qwen-7B" extra_cmd=$extra_cmd" --trust_remote_code True" - pip install transformers==4.33.3 + pip install transformers==4.35.2 elif [ "${topology}" = "mistral_7b" ]; then model_name_or_path="Intel/neural-chat-7b-v3" elif [ "${topology}" = "phi_1b" ]; then diff --git a/examples/huggingface/pytorch/text-generation/quantization/run_generation.py b/examples/huggingface/pytorch/text-generation/quantization/run_generation.py index 30a6d85a25d..03c8878ee1f 100644 --- a/examples/huggingface/pytorch/text-generation/quantization/run_generation.py +++ b/examples/huggingface/pytorch/text-generation/quantization/run_generation.py @@ -266,7 +266,6 @@ load_in_4bit=args.load_in_4bit, load_in_8bit=args.load_in_8bit, _commit_hash=args._commit_hash, - use_llm_runtime=False, use_llm_runtime=args.use_llm_runtime, ) elif (not args.int8 and not args.int8_bf16_mixed) or args.restore: diff --git 
a/examples/huggingface/pytorch/text-generation/quantization/run_tuning.sh b/examples/huggingface/pytorch/text-generation/quantization/run_tuning.sh index 332b99c7917..32791d1c867 100644 --- a/examples/huggingface/pytorch/text-generation/quantization/run_tuning.sh +++ b/examples/huggingface/pytorch/text-generation/quantization/run_tuning.sh @@ -181,7 +181,7 @@ function run_tuning { extra_cmd=$extra_cmd" --sq --alpha ${alpha}" extra_cmd=$extra_cmd" --output_dir ${tuned_checkpoint}" extra_cmd=$extra_cmd" --trust_remote_code True" - pip install transformers==4.33 + pip install transformers==4.35.2 elif [ "${topology}" = "mistral_7b" ]; then alpha=0.8 model_name_or_path="Intel/neural-chat-7b-v3" diff --git a/intel_extension_for_transformers/transformers/modeling/modeling_auto.py b/intel_extension_for_transformers/transformers/modeling/modeling_auto.py index 2c442742079..d23d404aa8c 100644 --- a/intel_extension_for_transformers/transformers/modeling/modeling_auto.py +++ b/intel_extension_for_transformers/transformers/modeling/modeling_auto.py @@ -333,7 +333,10 @@ def collate_batch(batch): ) last_ind.append(input_ids.shape[0] - 1) - attention_mask = torch.ones(len(input_ids)) + if model_type in ["bloom", "qwen", "baichuan"]: + attention_mask = torch.ones(len(input_ids) +1) + else: + attention_mask = torch.ones(len(input_ids)) position_ids = torch.arange(len(input_ids)) input_ids_padded.append(input_ids) attention_mask_padded.append(attention_mask) @@ -449,18 +452,6 @@ def calib_func(model): "position_ids": inputs["position_ids"], "past_key_values": inputs["past_key_values"], } - elif model_type == "falcon" or model_type == "qwen": - example_inputs = inputs - input_bs, input_len = inputs["input_ids"].shape - outputs = model(inputs["input_ids"]) - example_inputs["past_key_values"] = outputs[1] - example_inputs["attention_mask"] = torch.ones( - input_bs, input_len + 1 - ) - example_inputs["position_ids"] = ( - inputs["position_ids"][:, -1:] + 1 - ) - example_inputs["input_ids"] = inputs["input_ids"][:, -1:] else: example_inputs = inputs else: diff --git a/intel_extension_for_transformers/transformers/utils/utility.py b/intel_extension_for_transformers/transformers/utils/utility.py index b9dc00251a4..dda1c4657fc 100644 --- a/intel_extension_for_transformers/transformers/utils/utility.py +++ b/intel_extension_for_transformers/transformers/utils/utility.py @@ -90,6 +90,76 @@ def generate_dummy_past_key_values(config, input_bs): """ from optimum.utils import NormalizedConfigManager + normalized_config = NormalizedConfigManager.get_normalized_config_class( + config.model_type + )(config) + nb_pkv = 2 + num_layers = normalized_config.num_layers + num_attention_heads = normalized_config.num_attention_heads + hidden_size = normalized_config.hidden_size + d_k = hidden_size // num_attention_heads + num_key_value_heads = num_attention_heads + if hasattr(normalized_config, "num_key_value_heads"): + num_key_value_heads = normalized_config.num_key_value_heads + if hasattr(normalized_config, "multi_query_group_num"): + num_key_value_heads = normalized_config.multi_query_group_num + + if config.model_type == "bloom": + shape_key = (input_bs * num_attention_heads, d_k, 1) + shape_value = (input_bs * num_attention_heads, 1, d_k) + key = torch.ones(size=shape_key) + value = torch.ones(size=shape_value) + past_key_values = tuple( + tuple(key if idx % 2 == 0 else value for idx in range(nb_pkv)) + for _ in range(num_layers) + ) + return past_key_values + elif config.model_type == "gpt_bigcode": + new_shape = [input_bs, 
0, d_k * 2] + dummy_tensor = torch.zeros(size=new_shape) + past_key_values = tuple([dummy_tensor] * num_layers) + return past_key_values + elif config.model_type == "qwen": + new_shape = [input_bs, 1, num_key_value_heads, d_k] + past_key_values = [ + ( + torch.ones(size=new_shape).contiguous(), + torch.ones(size=new_shape).contiguous(), + ) + for _ in range(num_layers) + ] + return tuple(past_key_values) + elif config.model_type == "baichuan": + new_shape = [input_bs, num_key_value_heads, 1, d_k] + past_key_values = [ + ( + torch.ones(size=new_shape).contiguous(), + torch.ones(size=new_shape).contiguous(), + ) + for _ in range(num_layers) + ] + return tuple(past_key_values) + elif config.model_type == "chatglm": + new_shape = [0, input_bs, num_key_value_heads, d_k] + elif config.model_type == "falcon": + new_shape = [input_bs, 1, 0, d_k] + else: + new_shape = [input_bs, num_key_value_heads, 0, d_k] + past_key_values = [ + ( + torch.zeros(size=new_shape).contiguous(), + torch.zeros(size=new_shape).contiguous(), + ) + for _ in range(num_layers) + ] + return tuple(past_key_values) + +def generate_dummy_past_key_values_for_inference(config, input_bs): + """ + Generate the dummy past_key_values. + """ + from optimum.utils import NormalizedConfigManager + normalized_config = NormalizedConfigManager.get_normalized_config_class( config.model_type )(config) @@ -136,7 +206,6 @@ def generate_dummy_past_key_values(config, input_bs): ] return tuple(past_key_values) - def generate_dummy_past_key_values_for_opt_llm(config, input_bs, num_beams=1): """ Generate the dummy past_key_values. @@ -195,8 +264,7 @@ def generate_dummy_past_key_values_for_opt_llm(config, input_bs, num_beams=1): "imagegpt", "llama", "mistral", - "chatglm", - "qwen" + "chatglm" } def get_example_inputs(model_config, batch_size=1, tokenizer=None, num_beams=4): From 84ea27542b0db2b0d3a67e872e5544b31ced104f Mon Sep 17 00:00:00 2001 From: VincyZhang Date: Thu, 21 Dec 2023 14:09:48 +0800 Subject: [PATCH 06/10] Update requirements.txt Signed-off-by: VincyZhang --- .../pytorch/text-generation/quantization/requirements.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples/huggingface/pytorch/text-generation/quantization/requirements.txt b/examples/huggingface/pytorch/text-generation/quantization/requirements.txt index e9775a77bb6..5e8c7aad0d8 100644 --- a/examples/huggingface/pytorch/text-generation/quantization/requirements.txt +++ b/examples/huggingface/pytorch/text-generation/quantization/requirements.txt @@ -5,7 +5,7 @@ protobuf sentencepiece != 0.1.92 --extra-index-url https://download.pytorch.org/whl/cpu torch==2.1.1+cpu -transformers==4.35.2 +transformers intel_extension_for_pytorch bitsandbytes #baichuan transformers_stream_generator From 7188e90f30b951e70e6d8765fe87d8d04885d0fa Mon Sep 17 00:00:00 2001 From: changwangss Date: Wed, 20 Dec 2023 16:03:56 -0800 Subject: [PATCH 07/10] fix baichuan Signed-off-by: changwangss --- .../transformers/modeling/modeling_auto.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/intel_extension_for_transformers/transformers/modeling/modeling_auto.py b/intel_extension_for_transformers/transformers/modeling/modeling_auto.py index d23d404aa8c..d91052144a0 100644 --- a/intel_extension_for_transformers/transformers/modeling/modeling_auto.py +++ b/intel_extension_for_transformers/transformers/modeling/modeling_auto.py @@ -333,8 +333,9 @@ def collate_batch(batch): ) last_ind.append(input_ids.shape[0] - 1) - if model_type in ["bloom", "qwen", "baichuan"]: + if 
model_type in ["bloom", "qwen"]: attention_mask = torch.ones(len(input_ids) +1) + attention_mask[0] = 0 else: attention_mask = torch.ones(len(input_ids)) position_ids = torch.arange(len(input_ids)) From 030384f73f0894c1ced6ef73af7cb37d37f70032 Mon Sep 17 00:00:00 2001 From: changwangss Date: Wed, 20 Dec 2023 16:06:58 -0800 Subject: [PATCH 08/10] fix baichuan Signed-off-by: changwangss --- .../llm/evaluation/models.py | 4 ++-- .../transformers/utils/utility.py | 10 ---------- 2 files changed, 2 insertions(+), 12 deletions(-) diff --git a/intel_extension_for_transformers/llm/evaluation/models.py b/intel_extension_for_transformers/llm/evaluation/models.py index 02fdc723538..c8db3ab99f1 100644 --- a/intel_extension_for_transformers/llm/evaluation/models.py +++ b/intel_extension_for_transformers/llm/evaluation/models.py @@ -22,7 +22,7 @@ from transformers.modeling_outputs import CausalLMOutputWithPast from optimum.intel.generation.modeling import TSModelForCausalLM from intel_extension_for_transformers.transformers.utils.utility import ( - generate_dummy_past_key_values, + generate_dummy_past_key_values_for_inference, generate_dummy_past_key_values_for_opt_llm, MODEL_TYPES_REQUIRING_POSITION_IDS, IPEX_OPT_LLM_SUPPORTED, @@ -175,7 +175,7 @@ def forward( config=self.config, input_bs=input_bs, num_beams=1 ) else: - past_key_values = generate_dummy_past_key_values( + past_key_values = generate_dummy_past_key_values_for_inference( config=self.config, input_bs=input_bs ) inputs["past_key_values"] = past_key_values diff --git a/intel_extension_for_transformers/transformers/utils/utility.py b/intel_extension_for_transformers/transformers/utils/utility.py index dda1c4657fc..4ee0664df49 100644 --- a/intel_extension_for_transformers/transformers/utils/utility.py +++ b/intel_extension_for_transformers/transformers/utils/utility.py @@ -129,16 +129,6 @@ def generate_dummy_past_key_values(config, input_bs): for _ in range(num_layers) ] return tuple(past_key_values) - elif config.model_type == "baichuan": - new_shape = [input_bs, num_key_value_heads, 1, d_k] - past_key_values = [ - ( - torch.ones(size=new_shape).contiguous(), - torch.ones(size=new_shape).contiguous(), - ) - for _ in range(num_layers) - ] - return tuple(past_key_values) elif config.model_type == "chatglm": new_shape = [0, input_bs, num_key_value_heads, d_k] elif config.model_type == "falcon": From 33a80da2b25e7692f8edfd4d40c680dba7412954 Mon Sep 17 00:00:00 2001 From: changwangss Date: Wed, 20 Dec 2023 17:40:34 -0800 Subject: [PATCH 09/10] fix name Signed-off-by: changwangss --- intel_extension_for_transformers/llm/evaluation/models.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/intel_extension_for_transformers/llm/evaluation/models.py b/intel_extension_for_transformers/llm/evaluation/models.py index c8db3ab99f1..8ab22d640fa 100644 --- a/intel_extension_for_transformers/llm/evaluation/models.py +++ b/intel_extension_for_transformers/llm/evaluation/models.py @@ -167,7 +167,7 @@ def forward( if self.use_cache and past_key_values is None: if model_type in IPEX_OPT_LLM_SUPPORTED: if model_type == "llama" and transformers.__version__ >= "4.36": - past_key_values = generate_dummy_past_key_values( + past_key_values = generate_dummy_past_key_values_for_inference( config=self.config, input_bs=input_bs ) else: From 726d9520f960a5e263086f7291061aca59f1ec3c Mon Sep 17 00:00:00 2001 From: changwangss Date: Wed, 20 Dec 2023 18:02:44 -0800 Subject: [PATCH 10/10] add qwen commit Signed-off-by: changwangss --- 
.../pytorch/text-generation/quantization/run_benchmark.sh | 1 + .../pytorch/text-generation/quantization/run_tuning.sh | 1 + 2 files changed, 2 insertions(+) diff --git a/examples/huggingface/pytorch/text-generation/quantization/run_benchmark.sh b/examples/huggingface/pytorch/text-generation/quantization/run_benchmark.sh index df6a6dc4982..1c371a29bda 100644 --- a/examples/huggingface/pytorch/text-generation/quantization/run_benchmark.sh +++ b/examples/huggingface/pytorch/text-generation/quantization/run_benchmark.sh @@ -142,6 +142,7 @@ function run_benchmark { elif [ "${topology}" = "qwen_7b" ]; then model_name_or_path="Qwen/Qwen-7B" extra_cmd=$extra_cmd" --trust_remote_code True" + extra_cmd=$extra_cmd" --_commit_hash f7bc352f27bb1c02ee371a4576942a7d96c8bb97" pip install transformers==4.35.2 elif [ "${topology}" = "mistral_7b" ]; then model_name_or_path="Intel/neural-chat-7b-v3" diff --git a/examples/huggingface/pytorch/text-generation/quantization/run_tuning.sh b/examples/huggingface/pytorch/text-generation/quantization/run_tuning.sh index 32791d1c867..d8ab7e588aa 100644 --- a/examples/huggingface/pytorch/text-generation/quantization/run_tuning.sh +++ b/examples/huggingface/pytorch/text-generation/quantization/run_tuning.sh @@ -181,6 +181,7 @@ function run_tuning { extra_cmd=$extra_cmd" --sq --alpha ${alpha}" extra_cmd=$extra_cmd" --output_dir ${tuned_checkpoint}" extra_cmd=$extra_cmd" --trust_remote_code True" + extra_cmd=$extra_cmd" --_commit_hash f7bc352f27bb1c02ee371a4576942a7d96c8bb97" pip install transformers==4.35.2 elif [ "${topology}" = "mistral_7b" ]; then alpha=0.8
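
For reviewers, a minimal self-contained sketch of the per-architecture dummy KV-cache layouts that generate_dummy_past_key_values() builds after this series (bloom / gpt_bigcode / qwen / chatglm / falcon / default). KVConfig below is a hypothetical stand-in for the NormalizedConfigManager lookup used in utility.py, so the snippet illustrates the intended shapes rather than reproducing the patched code verbatim:

from dataclasses import dataclass
import torch

@dataclass
class KVConfig:
    # Hypothetical stand-in for the normalized HF config consumed by
    # generate_dummy_past_key_values() in utility.py.
    model_type: str
    num_layers: int = 2
    num_attention_heads: int = 4
    hidden_size: int = 64
    num_key_value_heads: int = 4

def dummy_past_key_values(config: KVConfig, input_bs: int):
    # Shape conventions as read from the patched utility.py:
    # bloom fuses batch and heads, gpt_bigcode packs K and V in one tensor,
    # qwen gets a length-1 cache of ones, chatglm puts the sequence dim first,
    # falcon keeps a single shared KV head, everything else is llama-style.
    d_k = config.hidden_size // config.num_attention_heads
    heads = config.num_key_value_heads
    layers = config.num_layers
    if config.model_type == "bloom":
        key = torch.ones(input_bs * config.num_attention_heads, d_k, 1)
        value = torch.ones(input_bs * config.num_attention_heads, 1, d_k)
        return tuple((key, value) for _ in range(layers))
    if config.model_type == "gpt_bigcode":
        return tuple(torch.zeros(input_bs, 0, d_k * 2) for _ in range(layers))
    if config.model_type == "qwen":
        shape, fill = (input_bs, 1, heads, d_k), torch.ones
    elif config.model_type == "chatglm":
        shape, fill = (0, input_bs, heads, d_k), torch.zeros
    elif config.model_type == "falcon":
        shape, fill = (input_bs, 1, 0, d_k), torch.zeros
    else:
        shape, fill = (input_bs, heads, 0, d_k), torch.zeros
    return tuple((fill(shape), fill(shape)) for _ in range(layers))

if __name__ == "__main__":
    for mt in ("bloom", "gpt_bigcode", "qwen", "chatglm", "falcon", "baichuan"):
        layer0 = dummy_past_key_values(KVConfig(model_type=mt), input_bs=1)[0]
        key = layer0[0] if isinstance(layer0, tuple) else layer0
        print(f"{mt:12s} key cache shape: {tuple(key.shape)}")

Printing the shapes per model type makes it easy to confirm that, after PATCH 08, baichuan falls through to the llama-style default layout, while qwen and bloom keep the one-token caches that the +1 attention-mask padding in modeling_auto.py expects.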