[NeuralChat] Support LLM runtime ggml int4 (#1098)

* Support LLM runtime ggml int4

Signed-off-by: lvliang-intel <liang1.lv@intel.com>
intel · Jan 1, 2024 · 29bbd80 · 29bbd80
1 parent e6ecb21
commit 29bbd80
Show file tree

Hide file tree

Showing 2 changed files with 3 additions and 2 deletions.
diff --git a/intel_extension_for_transformers/neural_chat/models/model_utils.py b/intel_extension_for_transformers/neural_chat/models/model_utils.py
@@ -911,7 +911,7 @@ def generate_output():
                                 max_new_tokens=max_new_tokens,
                                 ctx_size=max_new_tokens,
                                 ignore_prompt=True,
-                                interactive=True,
+                                interactive=False if "magicoder" in model_name.lower() else True,
                                 do_sample=do_sample,
                                 num_beams=num_beams,
                                 n_keep=2 if "chatglm" in model_name.lower() else 1

diff --git a/intel_extension_for_transformers/neural_chat/server/neuralchat_server.py b/intel_extension_for_transformers/neural_chat/server/neuralchat_server.py
@@ -161,6 +161,7 @@ def init(self, config):
             compute_dtype = yaml_config.get("compute_dtype", {})
             weight_dtype = yaml_config.get("weight_dtype", {})
             use_cached_bin = yaml_config.get("use_cached_bin", {})
+            use_ggml = yaml_config.get("use_ggml", False)
             mix_precision_dtype = yaml_config.get("mix_precision_dtype", {})
             load_in_4bit = yaml_config.get("load_in_4bit", {})
             bnb_4bit_quant_type = yaml_config.get("bnb_4bit_quant_type", {})
@@ -172,7 +173,7 @@ def init(self, config):
             from intel_extension_for_transformers.transformers import WeightOnlyQuantConfig, MixedPrecisionConfig
             if optimization_type == "weight_only":
                 optimization_config = WeightOnlyQuantConfig(compute_dtype=compute_dtype, weight_dtype=weight_dtype,
-                                                            use_cache=use_cached_bin)
+                                                            use_ggml=use_ggml, use_cache=use_cached_bin)
             elif optimization_type == "mix_precision":
                 optimization_config = MixedPrecisionConfig(dtype=mix_precision_dtype)
             elif optimization_type == "bits_and_bytes":