[LLM Runtime] Support load_in_nbit in llm runtime (#688)
* support load_in_nbit in llm runtime

Signed-off-by: zhenwei-intel <zhenwei.liu@intel.com>
zhenwei-intel committed Nov 15, 2023
1 parent cd40423 commit 4423f70
Showing 3 changed files with 22 additions and 15 deletions.
README.md: 10 changes (4 additions & 6 deletions)
@@ -61,32 +61,30 @@ Below is the sample code to enable weight-only INT4/INT8 inference. See more [ex
### INT4 Inference
```diff
 from transformers import AutoTokenizer, TextStreamer
-from intel_extension_for_transformers.transformers import AutoModelForCausalLM, WeightOnlyQuantConfig
+from intel_extension_for_transformers.transformers import AutoModelForCausalLM
 model_name = "Intel/neural-chat-7b-v1-1" # Hugging Face model_id or local model
-config = WeightOnlyQuantConfig(compute_dtype="int8", weight_dtype="int4")
 prompt = "Once upon a time, there existed a little girl,"

 tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
 inputs = tokenizer(prompt, return_tensors="pt").input_ids
 streamer = TextStreamer(tokenizer)

-model = AutoModelForCausalLM.from_pretrained(model_name, quantization_config=config)
+model = AutoModelForCausalLM.from_pretrained(model_name, load_in_4bit=True)
 outputs = model.generate(inputs, streamer=streamer, max_new_tokens=300)
```
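
Side-by-side reference for the shorthand this commit introduces: a minimal sketch, assuming `use_llm_runtime` is left at its default so the LLM runtime path is taken (per the `from_pretrained` change further down, `load_in_4bit=True` then selects the same config the old snippet built by hand).

```python
from intel_extension_for_transformers.transformers import AutoModelForCausalLM, WeightOnlyQuantConfig

model_name = "Intel/neural-chat-7b-v1-1"  # Hugging Face model_id or local model

# Shorthand added by this commit: with the LLM runtime enabled, load_in_4bit=True
# maps to WeightOnlyQuantConfig(compute_dtype="int8", weight_dtype="int4").
model = AutoModelForCausalLM.from_pretrained(model_name, load_in_4bit=True)

# Explicit form from the previous README, still supported:
config = WeightOnlyQuantConfig(compute_dtype="int8", weight_dtype="int4")
model = AutoModelForCausalLM.from_pretrained(model_name, quantization_config=config)
```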

### INT8 Inference
```diff
 from transformers import AutoTokenizer, TextStreamer
-from intel_extension_for_transformers.transformers import AutoModelForCausalLM, WeightOnlyQuantConfig
+from intel_extension_for_transformers.transformers import AutoModelForCausalLM
 model_name = "Intel/neural-chat-7b-v1-1" # Hugging Face model_id or local model
-config = WeightOnlyQuantConfig(compute_dtype="bf16", weight_dtype="int8")
 prompt = "Once upon a time, there existed a little girl,"

 tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
 inputs = tokenizer(prompt, return_tensors="pt").input_ids
 streamer = TextStreamer(tokenizer)

-model = AutoModelForCausalLM.from_pretrained(model_name, quantization_config=config)
+model = AutoModelForCausalLM.from_pretrained(model_name, load_in_8bit=True)
 outputs = model.generate(inputs, streamer=streamer, max_new_tokens=300)
```
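
The 8-bit shorthand follows the same pattern: a minimal sketch, again assuming the LLM runtime path is taken by default, showing that `load_in_8bit=True` maps to the bf16/int8 configuration the old snippet spelled out.

```python
from intel_extension_for_transformers.transformers import AutoModelForCausalLM, WeightOnlyQuantConfig

model_name = "Intel/neural-chat-7b-v1-1"  # Hugging Face model_id or local model

# Shorthand added by this commit: with the LLM runtime enabled, load_in_8bit=True
# maps to WeightOnlyQuantConfig(compute_dtype="bf16", weight_dtype="int8").
model = AutoModelForCausalLM.from_pretrained(model_name, load_in_8bit=True)

# Explicit form from the previous README, still supported:
config = WeightOnlyQuantConfig(compute_dtype="bf16", weight_dtype="int8")
model = AutoModelForCausalLM.from_pretrained(model_name, quantization_config=config)
```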

intel_extension_for_transformers/llm/runtime/graph/README.md: 5 changes (2 additions & 3 deletions)
@@ -64,16 +64,15 @@ pip install intel-extension-for-transformers
You can use the Python API to run a Hugging Face model with just a few lines. Here is the sample code:
```diff
 from transformers import AutoTokenizer, TextStreamer
-from intel_extension_for_transformers.transformers import AutoModelForCausalLM, WeightOnlyQuantConfig
+from intel_extension_for_transformers.transformers import AutoModelForCausalLM
 model_name = "Intel/neural-chat-7b-v1-1" # Hugging Face model_id or local model
-config = WeightOnlyQuantConfig(compute_dtype="int8", weight_dtype="int4")
 prompt = "Once upon a time, there existed a little girl,"

 tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
 inputs = tokenizer(prompt, return_tensors="pt").input_ids
 streamer = TextStreamer(tokenizer)

-model = AutoModelForCausalLM.from_pretrained(model_name, quantization_config=config)
+model = AutoModelForCausalLM.from_pretrained(model_name, load_in_4bit=True)
 outputs = model.generate(inputs, streamer=streamer, max_new_tokens=300)
```
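
For copy-paste use, the example reads as follows after this change (the diff's context lines plus the added lines only):

```python
from transformers import AutoTokenizer, TextStreamer
from intel_extension_for_transformers.transformers import AutoModelForCausalLM

model_name = "Intel/neural-chat-7b-v1-1"  # Hugging Face model_id or local model
prompt = "Once upon a time, there existed a little girl,"

tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
inputs = tokenizer(prompt, return_tensors="pt").input_ids
streamer = TextStreamer(tokenizer)

# With the LLM runtime, load_in_4bit=True enables weight-only INT4 inference.
model = AutoModelForCausalLM.from_pretrained(model_name, load_in_4bit=True)
outputs = model.generate(inputs, streamer=streamer, max_new_tokens=300)
```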

Changes to the `from_pretrained` implementation:
@@ -101,9 +101,14 @@ def from_pretrained(cls, pretrained_model_name_or_path, *model_args, **kwargs):
         torch_dtype = kwargs.pop("torch_dtype", torch.float32)
         if load_in_4bit:
             if quantization_config is None:
-                quantization_config = WeightOnlyQuantConfig(
-                    compute_dtype=torch_dtype, weight_dtype="nf4"
-                )
+                if use_llm_runtime:
+                    quantization_config = WeightOnlyQuantConfig(
+                        compute_dtype="int8", weight_dtype="int4"
+                    )
+                else:
+                    quantization_config = WeightOnlyQuantConfig(
+                        compute_dtype=torch_dtype, weight_dtype="nf4"
+                    )
             else:
                 assert (
                     "4" in quantization_config.weight_dtype
@@ -112,9 +117,14 @@ def from_pretrained(cls, pretrained_model_name_or_path, *model_args, **kwargs):
                     f"'fp4_e2m1' or 'fp4_e2m1_bnb' and compute_dtype should be {torch_dtype}."
         elif load_in_8bit:
             if quantization_config is None:
-                quantization_config = WeightOnlyQuantConfig(
-                    compute_dtype=torch_dtype, weight_dtype="int8"
-                )
+                if use_llm_runtime:
+                    quantization_config = WeightOnlyQuantConfig(
+                        compute_dtype="bf16", weight_dtype="int8"
+                    )
+                else:
+                    quantization_config = WeightOnlyQuantConfig(
+                        compute_dtype=torch_dtype, weight_dtype="int8"
+                    )
             else:
                 assert (
                     quantization_config.weight_dtype == "int8"
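
The effect of the two hunks can be summarized as a small helper: a hypothetical paraphrase for illustration, not the actual `from_pretrained` source, with names mirroring the diff above.

```python
import torch

from intel_extension_for_transformers.transformers import WeightOnlyQuantConfig


def pick_weight_only_config(load_in_4bit=False, load_in_8bit=False,
                            use_llm_runtime=True, torch_dtype=torch.float32):
    """Hypothetical helper mirroring the branch added in this commit."""
    if load_in_4bit:
        if use_llm_runtime:
            # LLM runtime path introduced here: int8 compute, int4 weights.
            return WeightOnlyQuantConfig(compute_dtype="int8", weight_dtype="int4")
        # Previous default, kept for the non-runtime path.
        return WeightOnlyQuantConfig(compute_dtype=torch_dtype, weight_dtype="nf4")
    if load_in_8bit:
        if use_llm_runtime:
            # LLM runtime path introduced here: bf16 compute, int8 weights.
            return WeightOnlyQuantConfig(compute_dtype="bf16", weight_dtype="int8")
        # Previous default, kept for the non-runtime path.
        return WeightOnlyQuantConfig(compute_dtype=torch_dtype, weight_dtype="int8")
    return None
```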
