
Commit

[LLM] add chatglm and codellama extension test (#837)
changwangss committed Dec 1, 2023
1 parent 7344717 commit 130b594
Showing 10 changed files with 165 additions and 20 deletions.
108 changes: 108 additions & 0 deletions examples/.config/pytorch_optimize.json
@@ -26,6 +26,33 @@
}
}
},
"codellama_7b": {
"working_dir": "huggingface/pytorch/code-generation/quantization",
"tune":{
"cmd": "bash run_tuning.sh",
"params": {
"topology": "codellama_7b",
"task": "",
"approach": "static",
"backend": "ipex",
"output_model": "saved_results"
}
},
"benchmark": {
"cmd": "bash run_benchmark.sh",
"params": {
"topology": "codellama_7b",
"task": "",
"approach": "static",
"backend": "ipex",
"mode": "accuracy",
"batch_size": "8",
"iters": "100",
"int8": "false",
"config": "saved_results"
}
}
},
"gpt_neo_clm_static": {
"working_dir": "huggingface/pytorch/language-modeling/quantization",
"tune":{
@@ -1711,6 +1738,87 @@
}
}
},
"chatglm3_6b_gen_ipex_static": {
"working_dir": "huggingface/pytorch/text-generation/quantization",
"tune": {
"cmd": "bash run_tuning.sh",
"params": {
"topology": "chatglm3_6b",
"task": "generation",
"approach": "static",
"backend": "ipex",
"output_model": "saved_results"
}
},
"benchmark": {
"cmd": "bash run_benchmark.sh",
"params": {
"topology": "chatglm3_6b",
"task": "generation",
"approach": "static",
"backend": "ipex",
"mode": "benchmark",
"batch_size": "112",
"iters": "100",
"int8": "false",
"config": "saved_results"
}
}
},
"chatglm2_6b_gen_ipex_static": {
"working_dir": "huggingface/pytorch/text-generation/quantization",
"tune": {
"cmd": "bash run_tuning.sh",
"params": {
"topology": "chatglm2_6b",
"task": "generation",
"approach": "static",
"backend": "ipex",
"output_model": "saved_results"
}
},
"benchmark": {
"cmd": "bash run_benchmark.sh",
"params": {
"topology": "chatglm2_6b",
"task": "generation",
"approach": "static",
"backend": "ipex",
"mode": "benchmark",
"batch_size": "112",
"iters": "100",
"int8": "false",
"config": "saved_results"
}
}
},
"chatglm_6b_gen_ipex_static": {
"working_dir": "huggingface/pytorch/text-generation/quantization",
"tune": {
"cmd": "bash run_tuning.sh",
"params": {
"topology": "chatglm_6b",
"task": "generation",
"approach": "static",
"backend": "ipex",
"output_model": "saved_results"
}
},
"benchmark": {
"cmd": "bash run_benchmark.sh",
"params": {
"topology": "chatglm_6b",
"task": "generation",
"approach": "static",
"backend": "ipex",
"mode": "benchmark",
"batch_size": "112",
"iters": "100",
"int8": "false",
"config": "saved_results"
}
}
},
"flan-t5-large_gen_ipex_static": {
"working_dir": "huggingface/pytorch/text2text-generation",
"tune": {
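For context, the new `codellama_7b` entry above is consumed by the driver scripts in its `working_dir`. An invocation that mirrors the entry's `params` might look like the sketch below; the `--key=value` flag syntax is an assumption about how `run_tuning.sh` and `run_benchmark.sh` parse their arguments, not something shown in this diff.

```shell
# Sketch only: flag names mirror the JSON "params" keys; verify the scripts' argument parsing.
cd examples/huggingface/pytorch/code-generation/quantization
bash run_tuning.sh --topology=codellama_7b --approach=static --backend=ipex \
    --output_model=saved_results
bash run_benchmark.sh --topology=codellama_7b --approach=static --backend=ipex \
    --mode=accuracy --batch_size=8 --iters=100 --int8=false --config=saved_results
```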
@@ -4,7 +4,7 @@ We provide the inference benchmarking script `run_generation.py` for Starcoder a

# Prerequisite​
## 1. Environment​
Recommend python 3.7 or higher version is recommended. The dependent packages are listed in requirements, please install them as follows,
Python 3.10 is recommended due to a limitation of the [code evaluation library](https://github.com/bigcode-project/bigcode-evaluation-harness). The dependent packages are listed in requirements; please install them as follows:

```shell
git clone https://github.com/intel/intel-extension-for-transformers.git
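# A minimal sketch of the rest of the setup implied by the Python 3.10 note above;
# conda and the environment name "code-gen" are assumptions, not part of this diff.
conda create -n code-gen python=3.10 -y
conda activate code-gen
cd intel-extension-for-transformers/examples/huggingface/pytorch/code-generation/quantization
pip install -r requirements.txt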
@@ -13,7 +13,7 @@ function init_params {
iters=100
batch_size=1
tuned_checkpoint=saved_results
lm_eval_tasks="openai_humaneval"
lm_eval_tasks="humaneval"
script="run_generation.py"
for var in "$@"
do
@@ -67,10 +67,10 @@ function run_benchmark {

if [[ ${mode} == "accuracy" ]]; then
mode_cmd=" --accuracy "
extra_cmd=$extra_cmd" --tasks ${lm_eval_tasks} --allow_code_execution --n_samples 20 --batch_size 20 --do_sample"
batch_size=112
extra_cmd=$extra_cmd" --tasks ${lm_eval_tasks} --allow_code_execution --n_samples 20 --batch_size 20 --do_sample"
elif [[ ${mode} == "benchmark" ]]; then
mode_cmd=" --benchmark "
extra_cmd=$extra_cmd" --batch_size ${batch_size}"
else
echo "Error: No such mode: ${mode}"
exit 1
@@ -82,6 +82,11 @@ function run_benchmark {
if [ "${backend}" = "ipex" ]; then
extra_cmd=$extra_cmd" --ipex"
fi
elif [ "${topology}" = "codellama_7b" ]; then
model_name_or_path="codellama/CodeLlama-7b-hf"
if [ "${backend}" = "ipex" ]; then
extra_cmd=$extra_cmd" --ipex"
fi
fi


@@ -95,7 +100,6 @@ function run_benchmark {
python -u ./${script} \
--model ${model_name_or_path} \
--output_dir ${tuned_checkpoint} \
--batch_size ${batch_size} \
${mode_cmd} \
${extra_cmd}
else
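Put together, the accuracy branch above composes roughly the following command for `topology=codellama_7b` with the `ipex` backend, with values expanded from the defaults visible in this hunk; treat it as a sketch rather than captured output.

```shell
# Approximate expansion of the accuracy path for codellama_7b + ipex.
python -u ./run_generation.py \
    --model codellama/CodeLlama-7b-hf \
    --output_dir saved_results \
    --accuracy \
    --tasks humaneval --allow_code_execution --n_samples 20 --batch_size 20 --do_sample \
    --ipex
```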
@@ -153,7 +153,6 @@
tokenizer.pad_token = tokenizer.eos_token


calib_dataset = args.dataset
op_type_dict = {
"add": {"weight": {"dtype": ["fp32"]}, "activation": {"dtype": ["fp32"]}},
}
@@ -174,7 +173,7 @@
recipes=recipes,
op_type_dict=op_type_dict, # default is {}
excluded_precisions=excluded_precisions, # default is []
calib_dataset=calib_dataset,
calib_dataset=args.dataset,
calib_iters=args.calib_iters,
)
elif args.woq:
@@ -12,7 +12,7 @@ function main {
function init_params {
topology="gpt"
tuned_checkpoint="saved_results"
DATASET_NAME="openai_humaneval"
DATASET_NAME="mbpp"
model_name_or_path="bigcode/starcoder"
extra_cmd=""
batch_size=8
@@ -61,15 +61,19 @@ function run_tuning {
extra_cmd=$extra_cmd" --ipex"
alpha=0.5
fi
elif [ "${topology}" = "codellama_7b" ]; then
model_name_or_path="codellama/CodeLlama-7b-hf"
if [ "${backend}" = "ipex" ]; then
extra_cmd=$extra_cmd" --ipex"
alpha=0.5
fi
fi

if [ ${script} = "run_generation.py" ];then
python ./${script} \
--model ${model_name_or_path} \
--output_dir ${tuned_checkpoint} \
--dataset ${DATASET_NAME} \
--calib_split "test" \
--quantize \
--sq \
--alpha ${alpha} \
${extra_cmd}
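With the defaults shown in this hunk, the tuning path composes roughly the following command for `topology=codellama_7b` with the `ipex` backend; again a sketch assembled from the visible defaults, not captured CI output.

```shell
# Approximate expansion of the tuning path for codellama_7b + ipex.
python ./run_generation.py \
    --model codellama/CodeLlama-7b-hf \
    --output_dir saved_results \
    --dataset mbpp \
    --quantize \
    --sq \
    --alpha 0.5 \
    --ipex
```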
@@ -5,7 +5,7 @@ We provide the inference benchmarking script `run_generation.py` for large langu

# Prerequisite​
## 1. Create Environment​
Pytorch and Intel-extension-for-pytorch version 2.1 are required, the dependent packages are listed in requirements, we recommend create environment as the following steps.
PyTorch and Intel Extension for PyTorch version 2.1 are required, and Python 3.9 or higher is needed due to a limitation of the [text evaluation library](https://github.com/EleutherAI/lm-evaluation-harness/tree/master). The dependent packages are listed in requirements; we recommend creating the environment with the following steps.

```bash
pip install -r requirements.txt
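# Sketch of the remaining setup implied by the note above; the exact IPEX install
# command and version pin are assumptions — check requirements.txt and the IPEX docs.
python -c "import sys; assert sys.version_info >= (3, 9), 'Python 3.9+ required'"
pip install intel-extension-for-pytorch==2.1.0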
@@ -109,9 +109,18 @@ function run_benchmark {
model_name_or_path="/tf_dataset2/models/pytorch/dolly_v2_3b"
elif [ "${topology}" = "mpt_7b_chat" ]; then
model_name_or_path="mosaicml/mpt-7b-chat"
elif [ "${topology}" = "chatglm3_6b" ]; then
model_name_or_path="THUDM/chatglm3-6b"
extra_cmd=$extra_cmd" --trust_remote_code True"
elif [ "${topology}" = "chatglm2_6b" ]; then
model_name_or_path="THUDM/chatglm2-6b"
extra_cmd=$extra_cmd" --trust_remote_code True"
elif [ "${topology}" = "chatglm_6b" ]; then
model_name_or_path="THUDM/chatglm-6b"
extra_cmd=$extra_cmd" --trust_remote_code True"
pip install transformers==4.33
fi


if [[ ${int8} == "true" ]]; then
if [ "${topology}" = "gpt_j_woq_rtn" ]; then
extra_cmd=$extra_cmd" --woq"
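For the new chatglm topologies, the corresponding benchmark entry in `pytorch_optimize.json` above maps to an invocation along the lines of the sketch below; the `--key=value` syntax is an assumption about the driver's argument parsing. Note that the script itself appends `--trust_remote_code True` (and, for `chatglm_6b`, pins `transformers==4.33`) before running.

```shell
# Hypothetical driver invocation for the chatglm3_6b benchmark entry; adjust flags
# to match run_benchmark.sh's actual parsing.
bash run_benchmark.sh --topology=chatglm3_6b --backend=ipex --mode=benchmark \
    --batch_size=112 --iters=100 --int8=false --config=saved_results
```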
@@ -155,10 +155,6 @@
op_type_dict = {".*": {"activation": {"algorithm": "minmax"}}}
else:
op_type_dict = {}
if re.search("dolly", args.model):
ipex_opt_llm = False
else:
ipex_opt_llm = None
excluded_precisions = [] if args.int8_bf16_mixed else ["bf16"]
recipes = {
"smooth_quant": True,
@@ -172,7 +168,6 @@
op_type_dict=op_type_dict, # default is {}
excluded_precisions=excluded_precisions, # default is []
num_beams=generate_kwargs["num_beams"],
ipex_opt_llm=ipex_opt_llm,
)
elif args.woq:
quantization_config = WeightOnlyQuantConfig(
@@ -121,6 +121,25 @@ function run_tuning {
model_name_or_path="mosaicml/mpt-7b-chat"
extra_cmd=$extra_cmd" --sq --alpha ${alpha}"
extra_cmd=$extra_cmd" --output_dir ${tuned_checkpoint}"
elif [ "${topology}" = "chatglm3_6b" ]; then
alpha=0.75
model_name_or_path="THUDM/chatglm3-6b"
extra_cmd=$extra_cmd" --sq --alpha ${alpha}"
extra_cmd=$extra_cmd" --output_dir ${tuned_checkpoint}"
extra_cmd=$extra_cmd" --trust_remote_code True"
elif [ "${topology}" = "chatglm2_6b" ]; then
alpha=0.75
model_name_or_path="THUDM/chatglm2-6b"
extra_cmd=$extra_cmd" --sq --alpha ${alpha}"
extra_cmd=$extra_cmd" --output_dir ${tuned_checkpoint}"
extra_cmd=$extra_cmd" --trust_remote_code True"
elif [ "${topology}" = "chatglm_6b" ]; then
alpha=0.75
model_name_or_path="THUDM/chatglm-6b"
extra_cmd=$extra_cmd" --sq --alpha ${alpha}"
extra_cmd=$extra_cmd" --output_dir ${tuned_checkpoint}"
extra_cmd=$extra_cmd" --trust_remote_code True"
pip install transformers==4.33
fi

if [ ${script} = "run_generation.py" ];then
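Combining the new `chatglm3_6b` branch above with the `run_generation.py` template used by these tuning scripts, the composed command would look roughly like the sketch below; the base command and the `--quantize` flag are assumed from the code-generation tuning script shown earlier, since this hunk only shows the per-topology `extra_cmd` pieces.

```shell
# Rough expansion for topology=chatglm3_6b; flags outside extra_cmd are assumptions.
python ./run_generation.py \
    --model THUDM/chatglm3-6b \
    --quantize \
    --sq --alpha 0.75 \
    --output_dir saved_results \
    --trust_remote_code True
```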
13 changes: 10 additions & 3 deletions intel_extension_for_transformers/llm/evaluation/models.py
@@ -54,7 +54,10 @@ def prepare_inputs_for_generation(self, input_ids, past_key_values=None, **kwarg
past_key_values = past_key_values or kwargs.get("past", None)

if self.use_cache and past_key_values is not None:
if not re.search("THUDM/chatglm-6b", self.config.auto_map["AutoConfig"]):
if not (
self.config.model_type == "chatglm"
and re.search("THUDM/chatglm-6b", self.config.auto_map["AutoConfig"])
):
input_ids = input_ids[:, -1:]

# `past_key_values` may be in the stardard format (e.g. in contrastive search),
@@ -73,7 +76,9 @@ def prepare_inputs_for_generation(self, input_ids, past_key_values=None, **kwarg
if past_key_values:
position_ids = position_ids[:, -1].unsqueeze(-1)

if re.search("THUDM/chatglm-6b", self.config.auto_map["AutoConfig"]):
if self.config.model_type == "chatglm" and re.search(
"THUDM/chatglm-6b", self.config.auto_map["AutoConfig"]
):
MASK, gMASK = self.config.mask_token_id, self.config.gmask_token_id
seqs = input_ids.tolist()
mask_positions, use_gmasks = [], []
@@ -160,7 +165,9 @@ def forward(
inputs["past_key_values"] = past_key_values
if attention_mask is None:
inputs["attention_mask"] = torch.ones_like(input_ids)
if re.search("THUDM/chatglm-6b", self.config.auto_map["AutoConfig"]):
if model_type == "chatglm" and re.search(
"THUDM/chatglm-6b", self.config.auto_map["AutoConfig"]
):
if position_ids is None:
position_ids = self.prepare_inputs_for_generation(input_ids)[
"position_ids"
