Commit

fix
Signed-off-by: Mengni Wang <mengni.wang@intel.com>
mengniwang95 committed May 13, 2024
1 parent 687e1e5 commit 7d5e551
Showing 6 changed files with 41 additions and 39 deletions.
11 changes: 6 additions & 5 deletions README.md
@@ -65,11 +65,12 @@ best_model = quant.model
### Static Quantization

```python
-from neural_compressor_ort.quantization import quantize, StaticQuantConfig
-from neural_compressor_ort.quantization.calibrate import CalibrationDataReader
+from neural_compressor_ort import config
+from neural_compressor_ort.quantization import quantize
+from neural_compressor_ort.quantization import calibrate


-class DataReader(CalibrationDataReader):
+class DataReader(calibrate.CalibrationDataReader):
    def __init__(self):
        self.encoded_list = []
        # append data into self.encoded_list
@@ -84,8 +85,8 @@ class DataReader(CalibrationDataReader):


data_reader = DataReader()
-config = StaticQuantConfig(calibration_data_reader=data_reader)
-quantize(model, output_model_path, config)
+qconfig = config.StaticQuantConfig(calibration_data_reader=data_reader)
+quantize(model, output_model_path, qconfig)
```
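
A fuller `DataReader` along these lines simply iterates over a prepared list of feed dictionaries. A minimal sketch, assuming a single model input named `input` (the input name, shape, and random data are illustrative only):

```python
import numpy as np

from neural_compressor_ort.quantization import calibrate


class RandomDataReader(calibrate.CalibrationDataReader):
    def __init__(self, num_samples=8):
        # illustrative calibration samples: one feed dict per sample, keyed by input name
        self.encoded_list = [
            {"input": np.random.rand(1, 3, 224, 224).astype(np.float32)}
            for _ in range(num_samples)
        ]
        self.iter_next = iter(self.encoded_list)

    def get_next(self):
        # return the next feed dict, or None once calibration data is exhausted
        return next(self.iter_next, None)

    def rewind(self):
        self.iter_next = iter(self.encoded_list)
```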

## Documentation
31 changes: 16 additions & 15 deletions docs/quantization.md
@@ -123,28 +123,29 @@ User could execute:
This means users can leverage ONNX Neural Compressor to directly generate a fully quantized model without accuracy-aware tuning. It is the user's responsibility to ensure the accuracy of the quantized model meets expectations. ONNX Neural Compressor supports `Post Training Static Quantization` and `Post Training Dynamic Quantization`.

``` python
-from neural_compressor_ort.quantization import StaticQuantConfig, DynamicQuantConfig, quantize
-from neural_compressor_ort.quantization.calibrate import CalibrationDataReader
+from neural_compressor_ort import config
+from neural_compressor_ort.quantization import quantize
+from neural_compressor_ort.quantization import calibrate


-class DataReader(CalibrationDataReader):
+class DataReader(calibrate.CalibrationDataReader):
    def get_next(self): ...

    def rewind(self): ...


calibration_data_reader = DataReader() # only needed by StaticQuantConfig
-config = StaticQuantConfig(calibration_data_reader) # or config = DynamicQuantConfig()
-quantize(model, q_model_path, config)
+qconfig = config.StaticQuantConfig(calibration_data_reader) # or qconfig = DynamicQuantConfig()
+quantize(model, q_model_path, qconfig)
```
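
For the dynamic path mentioned in the comment above, no calibration data reader is needed. A minimal sketch, assuming `DynamicQuantConfig` is exposed from the same `config` module as `StaticQuantConfig`:

``` python
from neural_compressor_ort import config
from neural_compressor_ort.quantization import quantize

qconfig = config.DynamicQuantConfig()
quantize(model, q_model_path, qconfig)
```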

2. With Accuracy Aware Tuning

This means users can leverage the advanced features of ONNX Neural Compressor to tune out a quantized model with the best accuracy and good performance. Users should provide an `eval_fn`.

``` python
-from neural_compressor_ort.base_tuning import Evaluator, TuningConfig
-from neural_compressor_ort.quantization import (
+from neural_compressor_ort.quantization import calibrate
+from neural_compressor_ort.quantization import tuning
    CalibrationDataReader,
    GPTQConfig,
    RTNConfig,
@@ -153,7 +154,7 @@ from neural_compressor_ort.quantization import (
)


-class DataReader(CalibrationDataReader):
+class DataReader(calibrate.CalibrationDataReader):
    def get_next(self): ...

    def rewind(self): ...
@@ -162,11 +163,11 @@ class DataReader(CalibrationDataReader):
data_reader = DataReader()

# TuningConfig can accept:
-# 1) a set of candidate configs like TuningConfig(config_set=[RTNConfig(weight_bits=4), GPTQConfig(weight_bits=4)])
-# 2) one config with a set of candidate parameters like TuningConfig(config_set=[GPTQConfig(weight_group_size=[32, 64])])
-# 3) our pre-defined config set like TuningConfig(config_set=get_woq_tuning_config())
-custom_tune_config = TuningConfig(config_set=[RTNConfig(weight_bits=4), GPTQConfig(weight_bits=4)])
-best_model = autotune(
+# 1) a set of candidate configs like tuning.TuningConfig(config_set=[config.RTNConfig(weight_bits=4), config.GPTQConfig(weight_bits=4)])
+# 2) one config with a set of candidate parameters like tuning.TuningConfig(config_set=[config.GPTQConfig(weight_group_size=[32, 64])])
+# 3) our pre-defined config set like tuning.TuningConfig(config_set=config.get_woq_tuning_config())
+custom_tune_config = tuning.TuningConfig(config_set=[config.RTNConfig(weight_bits=4), config.GPTQConfig(weight_bits=4)])
+best_model = tuning.autotune(
    model_input=model,
    tune_config=custom_tune_config,
    eval_fn=eval_fn,
@@ -178,8 +179,8 @@ best_model = autotune(
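
The tuning example above passes an `eval_fn` that is not defined in the snippet. A minimal sketch of one, assuming a user-supplied `evaluate_accuracy` helper and `validation_dataset` (both hypothetical) that score the candidate model with onnxruntime, higher being better:

``` python
import onnxruntime as ort


def eval_fn(model) -> float:
    # "model" is the candidate quantized model produced during tuning;
    # it may arrive as a path or as an onnx.ModelProto, so handle both.
    model_bytes = model.SerializeToString() if hasattr(model, "SerializeToString") else model
    session = ort.InferenceSession(model_bytes, providers=["CPUExecutionProvider"])
    return evaluate_accuracy(session, validation_dataset)  # hypothetical helper and data
```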
ONNX Neural Compressor supports specifying quantization rules by operator name. Users can use the `set_local` API of configs for this purpose, as in the code below:

```python
-fp32_config = GPTQConfig(weight_dtype="fp32")
-quant_config = GPTQConfig(
+fp32_config = config.GPTQConfig(weight_dtype="fp32")
+quant_config = config.GPTQConfig(
    weight_bits=4,
    weight_dtype="int",
    weight_sym=False,
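
A sketch of how a pair of configs like the two above is typically combined via `set_local`, with a hypothetical operator name:

```python
# keep one sensitive MatMul in fp32 while the rest of the model is quantized
quant_config.set_local("/h.4/mlp/fc_out/MatMul", fp32_config)
```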
12 changes: 6 additions & 6 deletions docs/smooth_quant.md
@@ -322,9 +322,9 @@ There are two ways to apply smooth quantization: 1) using a fixed `alpha` for th
To set a fixed alpha for the entire model, users can follow this example:
```python
-from neural_compressor_ort.quantization import StaticQuantConfig
+from neural_compressor_ort import config
-config = StaticQuantConfig(
+qconfig = config.StaticQuantConfig(
    data_reader, extra_options={"SmoothQuant": True, "SmoothQuantAlpha": 0.5, "SmoothQuantFolding": True}
)
```
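As in the other examples in this document, the resulting config is then passed to `quantize`; a one-line sketch using the same `model` and `output_model_path` names:
```python
quantize(model, output_model_path, qconfig)
```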
@@ -347,10 +347,10 @@ Here is an example:
from neural_compressor_ort import config
from neural_compressor_ort.quantization import tuning
-config = tuning.TuningConfig(config_set=[config.SmoothQuantConfig(alpha=np.arange(0.1, 0.5, 0.05).tolist())])
+qconfig = tuning.TuningConfig(config_set=[config.SmoothQuantConfig(alpha=np.arange(0.1, 0.5, 0.05).tolist())])
best_model = tuning.autotune(
    model_input=model,
-    tune_config=config,
+    tune_config=qconfig,
    eval_fn=eval_fn,
    calibration_data_reader=data_reader,
)
@@ -363,7 +363,7 @@ Here is an example:
from neural_compressor_ort import config
from neural_compressor_ort.quantization import quantize
-config = config.StaticQuantConfig(
+qconfig = config.StaticQuantConfig(
    data_reader,
    extra_options={
        "SmoothQuant": True,
@@ -377,7 +377,7 @@ config = config.StaticQuantConfig(
},
},
)
-quantize(model, output_model_path, config)
+quantize(model, output_model_path, qconfig)
```
## Reference
@@ -112,7 +112,7 @@

# load model
tokenizer = transformers.LlamaTokenizer.from_pretrained(args.tokenizer)
-config = transformers.LlamaConfig.from_pretrained(args.model_path)
+model_config = transformers.LlamaConfig.from_pretrained(args.model_path)


def tokenize_function(examples):
@@ -181,7 +181,7 @@ def benchmark(model):

model = optimum_ort.ORTModelForCausalLM(
    session, # pylint: disable=E1121
-    config,
+    model_config,
    use_cache=True if use_cache else False,
    use_io_binding=True if use_cache else False,
)
@@ -259,8 +259,8 @@ def __init__(self,
).numpy().astype("int64")
if use_cache:
    # Create dummy past_key_values for decoder
-    num_attention_heads = config.num_key_value_heads
-    embed_size_per_head = config.hidden_size // config.num_attention_heads
+    num_attention_heads = model_config.num_key_value_heads
+    embed_size_per_head = model_config.hidden_size // model_config.num_attention_heads
    shape = (self.batch_size, num_attention_heads, 0,
             embed_size_per_head)
    key_or_value = np.zeros(shape, dtype=np.float32)
@@ -339,8 +339,8 @@ def __init__(self,
ort_input["position_ids"] = position_ids.numpy()
if use_cache:
    # create dummy past_key_values for decoder first generation step
-    num_attention_heads = config.num_key_value_heads
-    embed_size_per_head = config.hidden_size // config.num_attention_heads
+    num_attention_heads = model_config.num_key_value_heads
+    embed_size_per_head = model_config.hidden_size // model_config.num_attention_heads
    shape = (self.batch_size, num_attention_heads, 0,
             embed_size_per_head)
    key_or_value = np.zeros(shape, dtype=np.float32)
@@ -457,5 +457,5 @@ def rewind(self):
os.path.join(args.output_model, model_name),
save_as_external_data=True,
)
-config.to_json_file(os.path.join(args.output_model, "config.json"),
+model_config.to_json_file(os.path.join(args.output_model, "config.json"),
use_diff=False)
4 changes: 2 additions & 2 deletions neural_compressor_ort/config.py
@@ -354,7 +354,7 @@ def register_supported_configs(cls):

    @classmethod
    def validate(self, user_config: BaseConfig):
-        # TODO(Yi) validate the user config
+        # TODO validate the user config
        pass

    def __add__(self, other: BaseConfig) -> BaseConfig:
@@ -391,7 +391,7 @@ def expand(self) -> List[BaseConfig]:
"global": { "weight_bits": 6}
}
case 2
-# TODO (Yi) to support the expansion of config with `local`
+# TODO to support the expansion of config with `local`
{
"global": {
"weight_bits": [4, 6]
8 changes: 4 additions & 4 deletions neural_compressor_ort/quantization/matmul_nbits_quantizer.py
@@ -151,16 +151,16 @@ def _generate_nc_config(self):
        return nc_config

    def int4_quant_algo(self):
-        config = self._generate_nc_config()
+        qconfig = self._generate_nc_config()

        logger.info(f"start to quantize model with {self.algorithm} algorithm...")
        model = self.model_path or self.model
        if self.algorithm == "RTN":
-            self.model = algos.rtn_quantize_entry(model, config)
+            self.model = algos.rtn_quantize_entry(model, qconfig)
        elif self.algorithm == "GPTQ":
-            self.model = algos.gptq_quantize_entry(model, config, self.algo_config.calibration_data_reader)
+            self.model = algos.gptq_quantize_entry(model, qconfig, self.algo_config.calibration_data_reader)
        elif self.algorithm == "AWQ":
-            self.model = algos.awq_quantize_entry(model, config, self.algo_config.calibration_data_reader)
+            self.model = algos.awq_quantize_entry(model, qconfig, self.algo_config.calibration_data_reader)
        logger.info(f"complete quantization of model with {self.algorithm} algorithm.")

    def process(self):
