Support load WOQ model from HF model hub (#1439)
Co-authored-by: Wenxin Zhang <wenxin.zhang@intel.com>
Co-authored-by: changwangss <chang1.wang@intel.com>
3 people committed Apr 2, 2024
1 parent 598064a commit 1065d81
Showing 3 changed files with 59 additions and 77 deletions.
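
This commit renames the weight-only-quantization (WOQ) topologies to drop the int4 infix, turns weight_dtype into an explicit parameter defaulting to int4_clip, tags locally quantized benchmark runs with --woq_loading, and lets run_benchmark.sh pull already-quantized checkpoints such as TheBloke/Llama-2-7B-Chat-GPTQ or Intel/Mistral-7B-v0.1-int4-inc straight from the Hugging Face hub. A minimal sketch of the new hub path, assuming the elided argument parsers map each "params" key to a --key=value flag (as the visible --approach and --weight_dtype cases suggest) and accept --model_source the same way:

    # Benchmark a pre-quantized GPTQ checkpoint fetched from the HF hub.
    # model_source=huggingface makes run_benchmark.sh pick
    # TheBloke/Llama-2-7B-Chat-GPTQ instead of the local /tf_dataset2 copy.
    cd examples/huggingface/pytorch/text-generation/quantization
    bash run_benchmark.sh \
        --topology=llama2_7b_gptq \
        --task=generation \
        --mode=benchmark \
        --model_source=huggingface \
        --batch_size=112 \
        --iters=100
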
64 changes: 31 additions & 33 deletions examples/.config/pytorch_optimize.json
@@ -1389,15 +1389,15 @@
       "tune":{
         "cmd": "bash run_tuning.sh",
         "params": {
-          "topology": "llama2_7b_int4_gptq",
+          "topology": "llama2_7b_gptq",
           "task": "generation",
           "output_model": "saved_results"
         }
       },
       "benchmark": {
         "cmd": "bash run_benchmark.sh",
         "params": {
-          "topology": "llama2_7b_int4_gptq",
+          "topology": "llama2_7b_gptq",
           "task": "generation",
           "mode": "benchmark",
           "batch_size": "112",
@@ -1581,7 +1581,7 @@
       "tune":{
         "cmd": "bash run_tuning.sh",
         "params": {
-          "topology": "mistral_7b_int4_autoround",
+          "topology": "mistral_7b_autoround",
           "task": "generation",
           "backend": "neuralspeed",
           "output_model": "saved_results"
@@ -1590,14 +1590,15 @@
       "benchmark": {
         "cmd": "bash run_benchmark.sh",
         "params": {
-          "topology": "mistral_7b_int4_autoround",
+          "topology": "mistral_7b_autoround",
           "task": "generation",
           "backend": "neuralspeed",
           "mode": "benchmark",
           "batch_size": "112",
           "iters": "100",
           "int8": "false",
-          "config": "saved_results"
+          "config": "saved_results",
+          "weight_dtype": "int4_clip"
         }
       }
     },
@@ -1606,16 +1607,17 @@
       "tune":{
         "cmd": "bash run_tuning.sh",
         "params": {
-          "topology": "mistral_7b_int4_gptq",
+          "topology": "mistral_7b_gptq",
           "task": "generation",
           "backend": "neuralspeed",
-          "output_model": "saved_results"
+          "output_model": "saved_results",
+          "weight_dtype": "int4_clip"
         }
       },
       "benchmark": {
         "cmd": "bash run_benchmark.sh",
         "params": {
-          "topology": "mistral_7b_int4_gptq",
+          "topology": "mistral_7b_gptq",
           "task": "generation",
           "mode": "benchmark",
           "backend": "neuralspeed",
@@ -1632,16 +1634,17 @@
       "tune":{
         "cmd": "bash run_tuning.sh",
         "params": {
-          "topology": "mistral_7b_int4_rtn",
+          "topology": "mistral_7b_rtn",
           "task": "generation",
           "backend": "neuralspeed",
-          "output_model": "saved_results"
+          "output_model": "saved_results",
+          "weight_dtype": "int4_clip"
         }
       },
       "benchmark": {
         "cmd": "bash run_benchmark.sh",
         "params": {
-          "topology": "mistral_7b_int4_rtn",
+          "topology": "mistral_7b_rtn",
           "task": "generation",
           "backend": "neuralspeed",
           "mode": "benchmark",
@@ -1657,15 +1660,16 @@
       "tune":{
         "cmd": "bash run_tuning.sh",
         "params": {
-          "topology": "mistral_7b_int4_autoround",
+          "topology": "mistral_7b_autoround",
           "task": "generation",
-          "output_model": "saved_results"
+          "output_model": "saved_results",
+          "weight_dtype": "int4_clip"
         }
       },
       "benchmark": {
         "cmd": "bash run_benchmark.sh",
         "params": {
-          "topology": "mistral_7b_int4_autoround",
+          "topology": "mistral_7b_autoround",
           "task": "generation",
           "mode": "benchmark",
           "batch_size": "112",
@@ -1680,15 +1684,16 @@
       "tune":{
         "cmd": "bash run_tuning.sh",
         "params": {
-          "topology": "mistral_7b_int4_gptq",
+          "topology": "mistral_7b_gptq",
           "task": "generation",
-          "output_model": "saved_results"
+          "output_model": "saved_results",
+          "weight_dtype": "int4_clip"
         }
       },
       "benchmark": {
         "cmd": "bash run_benchmark.sh",
         "params": {
-          "topology": "mistral_7b_int4_gptq",
+          "topology": "mistral_7b_gptq",
           "task": "generation",
           "mode": "benchmark",
           "batch_size": "112",
@@ -1704,15 +1709,16 @@
       "tune":{
         "cmd": "bash run_tuning.sh",
         "params": {
-          "topology": "mistral_7b_int4_rtn",
+          "topology": "mistral_7b_rtn",
           "task": "generation",
-          "output_model": "saved_results"
+          "output_model": "saved_results",
+          "weight_dtype": "int4_clip"
         }
       },
       "benchmark": {
         "cmd": "bash run_benchmark.sh",
         "params": {
-          "topology": "mistral_7b_int4_rtn",
+          "topology": "mistral_7b_rtn",
           "task": "generation",
           "mode": "benchmark",
           "batch_size": "112",
@@ -1724,19 +1730,11 @@
     },
     "mistral_7b_autoround_neuralspeed_hf": {
       "working_dir": "huggingface/pytorch/text-generation/quantization",
-      "tune":{
-        "cmd": "bash run_tuning.sh",
-        "params": {
-          "topology": "mistral_7b_int4_autoround",
-          "task": "generation",
-          "backend": "neuralspeed",
-          "output_model": "saved_results"
-        }
-      },
+      "tune":{},
       "benchmark": {
         "cmd": "bash run_benchmark.sh",
         "params": {
-          "topology": "mistral_7b_int4_autoround",
+          "topology": "mistral_7b_autoround",
           "task": "generation",
           "backend": "neuralspeed",
           "mode": "benchmark",
@@ -1754,7 +1752,7 @@
       "benchmark": {
         "cmd": "bash run_benchmark.sh",
         "params": {
-          "topology": "mistral_7b_int4_gptq",
+          "topology": "mistral_7b_gptq",
           "task": "generation",
           "mode": "benchmark",
           "backend": "neuralspeed",
@@ -1772,7 +1770,7 @@
       "benchmark": {
         "cmd": "bash run_benchmark.sh",
         "params": {
-          "topology": "mistral_7b_int4_autoround",
+          "topology": "mistral_7b_autoround",
           "task": "generation",
           "mode": "benchmark",
           "batch_size": "112",
@@ -1789,7 +1787,7 @@
       "benchmark": {
         "cmd": "bash run_benchmark.sh",
         "params": {
-          "topology": "mistral_7b_int4_gptq",
+          "topology": "mistral_7b_gptq",
           "task": "generation",
           "mode": "benchmark",
           "batch_size": "112",
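
Two details worth noting in the config above: weight_dtype now travels with the other params, so the tuning script no longer hard-codes int4_clip; and the new mistral_7b_autoround_neuralspeed_hf entry has an empty "tune":{} block, since the int4 weights come ready-made from the hub there is nothing to quantize before benchmarking. A sketch of what that benchmark-only entry presumably expands to (the tail of the entry is cut off in this view, so passing the hub selection as --model_source is an assumption):

    # Hub-only flow: skip tuning, benchmark the published int4 AutoRound
    # checkpoint (Intel/Mistral-7B-v0.1-int4-inc) through Neural Speed.
    bash run_benchmark.sh \
        --topology=mistral_7b_autoround \
        --task=generation \
        --backend=neuralspeed \
        --mode=benchmark \
        --model_source=huggingface
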
examples/huggingface/pytorch/text-generation/quantization/run_benchmark.sh
@@ -106,7 +106,7 @@ function run_benchmark {
model_name_or_path="bigscience/bloomz-3b"
elif [ "${topology}" = "llama_7b" ]; then
model_name_or_path="meta-llama/Llama-2-7b-chat-hf"
elif [ "${topology}" = "llama2_7b_int4_gptq" ]; then
elif [ "${topology}" = "llama2_7b_gptq" ]; then
model_name_or_path="meta-llama/Llama-2-7b-hf"
elif [ "${topology}" = "llama_13b" ]; then
model_name_or_path="meta-llama/Llama-2-13b-chat-hf"
@@ -157,11 +157,11 @@ function run_benchmark {
elif [ "${topology}" = "phi_1_5b" ]; then
model_name_or_path="susnato/phi-1_5_dev"
pip install transformers==4.36.1
elif [ "${topology}" = "llama2_7b_int4_gptq" ] && [ "$model_source" != "huggingface" ]; then
elif [ "${topology}" = "llama2_7b_gptq" ] && [ "$model_source" != "huggingface" ]; then
model_name_or_path="/tf_dataset2/models/nlp_toolkit/llama-2-7b-chat/Llama-2-7b-chat-hf"
elif [ "${topology}" = "mistral_7b_int4_autoround" ] && [ "$model_source" != "huggingface" ]; then
elif [ "${topology}" = "mistral_7b_autoround" ] && [ "$model_source" != "huggingface" ]; then
model_name_or_path="/tf_dataset2/models/pytorch/Mistral-7B-v0.1"
elif [ "${topology}" = "mistral_7b_int4_rtn" ] && [ "$model_source" != "huggingface" ]; then
elif [ "${topology}" = "mistral_7b_rtn" ] && [ "$model_source" != "huggingface" ]; then
model_name_or_path="/tf_dataset2/models/pytorch/Mistral-7B-v0.1"
fi

@@ -176,57 +176,45 @@ function run_benchmark {
extra_cmd=$extra_cmd" --load_in_8bit "
elif [ "${topology}" = "gpt_j_mp" ]; then
extra_cmd=$extra_cmd" --mixed_precision"
elif [ "${topology}" = "llama2_7b_int4_gptq" ]; then
elif [ "${topology}" = "llama2_7b_gptq" ]; then
if [[ "$model_source" == "huggingface" ]]; then
model_name_or_path="TheBloke/Llama-2-7B-Chat-GPTQ"
else
model_name_or_path="/tf_dataset2/models/nlp_toolkit/llama-2-7b-chat/Llama-2-7b-chat-hf"
extra_cmd=$extra_cmd" --trust_remote_code"
extra_cmd=$extra_cmd" --woq_loading"
fi
if [[ $backend == "neuralspeed" ]]; then
extra_cmd=$extra_cmd" --use_neural_speed"
fi
elif [ "${topology}" = "mistral_7b_int4_autoround" ]; then
elif [ "${topology}" = "mistral_7b_autoround" ]; then
if [[ "$model_source" == "huggingface" ]]; then
model_name_or_path="Intel/Mistral-7B-v0.1-int4-inc"
else
model_name_or_path="/tf_dataset2/models/pytorch/Mistral-7B-v0.1"
extra_cmd=$extra_cmd" --trust_remote_code"
extra_cmd=$extra_cmd" --woq_loading"
fi
if [[ $backend == "neuralspeed" ]]; then
extra_cmd=$extra_cmd" --use_neural_speed"
fi

elif [ "${topology}" = "mistral_7b_int4_rtn" ]; then
fi
elif [ "${topology}" = "mistral_7b_rtn" ]; then
if [[ "$model_source" == "huggingface" ]]; then
model_name_or_path="mistralai/Mistral-7B-v0.1"
else
model_name_or_path="/tf_dataset2/models/pytorch/Mistral-7B-v0.1"
extra_cmd=$extra_cmd" --trust_remote_code"
extra_cmd=$extra_cmd" --woq_loading"
fi
if [[ $backend == "neuralspeed" ]]; then
extra_cmd=$extra_cmd" --use_neural_speed"
fi

elif [ "${topology}" = "mistral_7b_int4_gptq" ]; then
fi
elif [ "${topology}" = "mistral_7b_gptq" ]; then
if [[ "$model_source" == "huggingface" ]]; then
model_name_or_path="TheBloke/Mistral-7B-Instruct-v0.1-GPTQ"
else
model_name_or_path="/tf_dataset2/models/pytorch/Mistral-7B-v0.1"
extra_cmd=$extra_cmd" --trust_remote_code"
extra_cmd=$extra_cmd" --woq_loading"
fi
if [[ $backend == "neuralspeed" ]]; then
extra_cmd=$extra_cmd" --use_neural_speed"
fi
else
extra_cmd=$extra_cmd" --int8"
fi
fi

if [[ $backend == "neuralspeed" ]]; then
extra_cmd=$extra_cmd" --use_neural_speed"
fi
echo $extra_cmd

if [ "${script}" == "run_generation.py" ];then
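
The restructuring leaves run_benchmark.sh with two WOQ paths: with model_source=huggingface it benchmarks a pre-quantized hub checkpoint directly; otherwise it points at a local model directory and appends --woq_loading so run_generation.py reloads the WOQ checkpoint saved by an earlier tuning run. The duplicated per-topology Neural Speed checks also collapse into the single trailing one. A sketch of the local path, assuming saved_results holds the output of a previous run_tuning.sh call and that flags map from the config params as above:

    # Local flow: reload the WOQ model quantized earlier into saved_results.
    # run_benchmark.sh itself appends --trust_remote_code and --woq_loading.
    bash run_benchmark.sh \
        --topology=mistral_7b_rtn \
        --task=generation \
        --mode=benchmark \
        --config=saved_results \
        --batch_size=112 \
        --iters=100 \
        --int8=false
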
examples/huggingface/pytorch/text-generation/quantization/run_tuning.sh
@@ -19,6 +19,7 @@ function init_params {
approach="PostTrainingStatic"
script="run_generation.py"
alpha=0.5
weight_dtype="int4_clip"
for var in "$@"
do
case $var in
@@ -40,8 +41,8 @@ function init_params {
         --approach=*)
             approach=$(echo $var |cut -f2 -d=)
         ;;
-        --backend=*)
-            backend=$(echo $var |cut -f2 -d=)
+        --weight_dtype=*)
+            weight_dtype=$(echo $var |cut -f2 -d=)
         ;;
         *)
             echo "Error: No such parameter: ${var}"
@@ -203,39 +204,34 @@ function run_tuning {
extra_cmd=$extra_cmd" --output_dir ${tuned_checkpoint}"
extra_cmd=$extra_cmd" --trust_remote_code"
pip install transformers==4.36.1
elif [ "${topology}" = "llama2_7b_int4_gptq" ]; then
elif [ "${topology}" = "llama2_7b_gptq" ]; then
model_name_or_path="meta-llama/Llama-2-7b-hf"
extra_cmd=$extra_cmd" --woq --bits 4 --weight_dtype int4_clip --compute_dtype fp32 --scheme asym "
extra_cmd=$extra_cmd" --woq --bits 4 --compute_dtype fp32 --scheme asym "
extra_cmd=$extra_cmd" --woq_algo "GPTQ" --desc_act --blocksize 128 --max_input_length 2048 "
extra_cmd=$extra_cmd" --output_dir ${tuned_checkpoint}"
extra_cmd=$extra_cmd" --trust_remote_code"
elif [ "${topology}" = "mistral_7b_int4_autoround" ]; then
extra_cmd=$extra_cmd" --weight_dtype ${weight_dtype}"
elif [ "${topology}" = "mistral_7b_autoround" ]; then
model_name_or_path="/tf_dataset2/models/pytorch/Mistral-7B-v0.1"
extra_cmd=$extra_cmd" --woq --weight_dtype int4_clip --bits 4 --compute_dtype fp32 --scheme asym "
extra_cmd=$extra_cmd" --woq --bits 4 --compute_dtype fp32 --scheme asym "
extra_cmd=$extra_cmd" --woq_algo "AutoRound" --desc_act --blocksize 128 --max_input_length 2048 "
extra_cmd=$extra_cmd" --output_dir ${tuned_checkpoint}"
extra_cmd=$extra_cmd" --trust_remote_code"
if [[ $backend == "neuralspeed" ]]; then
extra_cmd=$extra_cmd" --use_neural_speed"
fi
elif [ "${topology}" = "mistral_7b_int4_rtn" ]; then
extra_cmd=$extra_cmd" --weight_dtype ${weight_dtype}"
elif [ "${topology}" = "mistral_7b_rtn" ]; then
model_name_or_path="/tf_dataset2/models/pytorch/Mistral-7B-v0.1"
extra_cmd=$extra_cmd" --woq --weight_dtype int4_clip --bits 4 -compute_dtype fp32 --scheme asym "
extra_cmd=$extra_cmd" --woq --bits 4 -compute_dtype fp32 --scheme asym "
extra_cmd=$extra_cmd" --woq_algo "Rtn" --desc_act --blocksize 128 --max_input_length 2048 "
extra_cmd=$extra_cmd" --output_dir ${tuned_checkpoint}"
extra_cmd=$extra_cmd" --trust_remote_code"
if [[ $backend == "neuralspeed" ]]; then
extra_cmd=$extra_cmd" --use_neural_speed"
fi
elif [ "${topology}" = "mistral_7b_int4_gptq" ]; then
extra_cmd=$extra_cmd" --weight_dtype ${weight_dtype}"
elif [ "${topology}" = "mistral_7b_gptq" ]; then
model_name_or_path="/tf_dataset2/models/pytorch/Mistral-7B-v0.1"
extra_cmd=$extra_cmd" --woq --weight_dtype int4_clip --bits 4 --compute_dtype fp32 --scheme asym "
extra_cmd=$extra_cmd" --woq --bits 4 --compute_dtype fp32 --scheme asym "
extra_cmd=$extra_cmd" --woq_algo "GPTQ" --desc_act --blocksize 128 --max_input_length 2048 "
extra_cmd=$extra_cmd" --output_dir ${tuned_checkpoint}"
extra_cmd=$extra_cmd" --trust_remote_code"
if [[ $backend == "neuralspeed" ]]; then
extra_cmd=$extra_cmd" --use_neural_speed"
fi
extra_cmd=$extra_cmd" --weight_dtype ${weight_dtype}"
fi

if [ ${script} = "run_generation.py" ];then
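
With the hard-coded dtype gone, run_tuning.sh reads the new --weight_dtype flag (defaulting to int4_clip in init_params), so a topology can be quantized to a different supported weight dtype without editing the script. A sketch, assuming --topology, --task and --output_model are handled by the elided part of the parser in the same style as the --weight_dtype case shown above:

    # Quantize Llama-2-7b with GPTQ into saved_results; --weight_dtype may be
    # omitted to take the int4_clip default, or overridden per run.
    bash run_tuning.sh \
        --topology=llama2_7b_gptq \
        --task=generation \
        --output_model=saved_results \
        --weight_dtype=int4_clip
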
