Commit

fix
Signed-off-by: Mengni Wang <mengni.wang@intel.com>
mengniwang95 committed May 13, 2024
1 parent 687e1e5 commit 7d5e551
Showing 6 changed files with 41 additions and 39 deletions.
11 changes: 6 additions & 5 deletions README.md
@@ -65,11 +65,12 @@ best_model = quant.model
### Static Quantization

```python
-from neural_compressor_ort.quantization import quantize, StaticQuantConfig
-from neural_compressor_ort.quantization.calibrate import CalibrationDataReader
+from neural_compressor_ort import config
+from neural_compressor_ort.quantization import quantize
+from neural_compressor_ort.quantization import calibrate


-class DataReader(CalibrationDataReader):
+class DataReader(calibrate.CalibrationDataReader):
    def __init__(self):
        self.encoded_list = []
        # append data into self.encoded_list
@@ -84,8 +85,8 @@ class DataReader(CalibrationDataReader):


data_reader = DataReader()
-config = StaticQuantConfig(calibration_data_reader=data_reader)
-quantize(model, output_model_path, config)
+qconfig = config.StaticQuantConfig(calibration_data_reader=data_reader)
+quantize(model, output_model_path, qconfig)
```
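
A fuller `DataReader` along these lines simply iterates over a prepared list of feed dictionaries. A minimal sketch, assuming a single model input named `input` (the input name, shape, and random data are illustrative only):

```python
import numpy as np

from neural_compressor_ort.quantization import calibrate


class RandomDataReader(calibrate.CalibrationDataReader):
    def __init__(self, num_samples=8):
        # illustrative calibration samples: one feed dict per sample, keyed by input name
        self.encoded_list = [
            {"input": np.random.rand(1, 3, 224, 224).astype(np.float32)}
            for _ in range(num_samples)
        ]
        self.iter_next = iter(self.encoded_list)

    def get_next(self):
        # return the next feed dict, or None once calibration data is exhausted
        return next(self.iter_next, None)

    def rewind(self):
        self.iter_next = iter(self.encoded_list)
```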

## Documentation
31 changes: 16 additions & 15 deletions docs/quantization.md
@@ -123,28 +123,29 @@ User could execute:
This means users can leverage ONNX Neural Compressor to directly generate a fully quantized model without accuracy-aware tuning. It is the user's responsibility to ensure the accuracy of the quantized model meets expectations. ONNX Neural Compressor supports `Post Training Static Quantization` and `Post Training Dynamic Quantization`.

``` python
-from neural_compressor_ort.quantization import StaticQuantConfig, DynamicQuantConfig, quantize
-from neural_compressor_ort.quantization.calibrate import CalibrationDataReader
+from neural_compressor_ort import config
+from neural_compressor_ort.quantization import quantize
+from neural_compressor_ort.quantization import calibrate


-class DataReader(CalibrationDataReader):
+class DataReader(calibrate.CalibrationDataReader):
    def get_next(self): ...

    def rewind(self): ...


calibration_data_reader = DataReader() # only needed by StaticQuantConfig
-config = StaticQuantConfig(calibration_data_reader) # or config = DynamicQuantConfig()
-quantize(model, q_model_path, config)
+qconfig = config.StaticQuantConfig(calibration_data_reader) # or qconfig = DynamicQuantConfig()
+quantize(model, q_model_path, qconfig)
```
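
For the dynamic path mentioned in the comment above, no calibration data reader is needed. A minimal sketch, assuming `DynamicQuantConfig` is exposed from the same `config` module as `StaticQuantConfig`:

``` python
from neural_compressor_ort import config
from neural_compressor_ort.quantization import quantize

qconfig = config.DynamicQuantConfig()
quantize(model, q_model_path, qconfig)
```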

2. With Accuracy Aware Tuning

This means users can leverage the advanced features of ONNX Neural Compressor to tune out a quantized model with the best accuracy and good performance. Users should provide an `eval_fn`.

``` python
-from neural_compressor_ort.base_tuning import Evaluator, TuningConfig
-from neural_compressor_ort.quantization import (
+from neural_compressor_ort.quantization import calibrate
+from neural_compressor_ort.quantization import tuning
    CalibrationDataReader,
    GPTQConfig,
    RTNConfig,
@@ -153,7 +154,7 @@ from neural_compressor_ort.quantization import (
)


-class DataReader(CalibrationDataReader):
+class DataReader(calibrate.CalibrationDataReader):
    def get_next(self): ...

    def rewind(self): ...
@@ -162,11 +163,11 @@ class DataReader(CalibrationDataReader):
data_reader = DataReader()

# TuningConfig can accept:
-# 1) a set of candidate configs like TuningConfig(config_set=[RTNConfig(weight_bits=4), GPTQConfig(weight_bits=4)])
-# 2) one config with a set of candidate parameters like TuningConfig(config_set=[GPTQConfig(weight_group_size=[32, 64])])
-# 3) our pre-defined config set like TuningConfig(config_set=get_woq_tuning_config())
-custom_tune_config = TuningConfig(config_set=[RTNConfig(weight_bits=4), GPTQConfig(weight_bits=4)])
-best_model = autotune(
+# 1) a set of candidate configs like tuning.TuningConfig(config_set=[config.RTNConfig(weight_bits=4), config.GPTQConfig(weight_bits=4)])
+# 2) one config with a set of candidate parameters like tuning.TuningConfig(config_set=[config.GPTQConfig(weight_group_size=[32, 64])])
+# 3) our pre-defined config set like tuning.TuningConfig(config_set=config.get_woq_tuning_config())
+custom_tune_config = tuning.TuningConfig(config_set=[config.RTNConfig(weight_bits=4), config.GPTQConfig(weight_bits=4)])
+best_model = tuning.autotune(
    model_input=model,
    tune_config=custom_tune_config,
    eval_fn=eval_fn,
@@ -178,8 +179,8 @@ best_model = autotune(
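
The tuning example above passes an `eval_fn` that is not defined in the snippet. A minimal sketch of one, assuming a user-supplied `evaluate_accuracy` helper and `validation_dataset` (both hypothetical) that score the candidate model with onnxruntime, higher being better:

``` python
import onnxruntime as ort


def eval_fn(model) -> float:
    # "model" is the candidate quantized model produced during tuning;
    # it may arrive as a path or as an onnx.ModelProto, so handle both.
    model_bytes = model.SerializeToString() if hasattr(model, "SerializeToString") else model
    session = ort.InferenceSession(model_bytes, providers=["CPUExecutionProvider"])
    return evaluate_accuracy(session, validation_dataset)  # hypothetical helper and data
```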
ONNX Neural Compressor supports specifying quantization rules by operator name. Users can use the `set_local` API of configs for this purpose, as in the code below:

```python
-fp32_config = GPTQConfig(weight_dtype="fp32")
-quant_config = GPTQConfig(
+fp32_config = config.GPTQConfig(weight_dtype="fp32")
+quant_config = config.GPTQConfig(
    weight_bits=4,
    weight_dtype="int",
    weight_sym=False,
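
A sketch of how a pair of configs like the two above is typically combined via `set_local`, with a hypothetical operator name:

```python
# keep one sensitive MatMul in fp32 while the rest of the model is quantized
quant_config.set_local("/h.4/mlp/fc_out/MatMul", fp32_config)
```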
12 changes: 6 additions & 6 deletions docs/smooth_quant.md
@@ -322,9 +322,9 @@ There are two ways to apply smooth quantization: 1) using a fixed `alpha` for th
To set a fixed alpha for the entire model, users can follow this example:
```python
-from neural_compressor_ort.quantization import StaticQuantConfig
+from neural_compressor_ort import config
-config = StaticQuantConfig(
+qconfig = config.StaticQuantConfig(
    data_reader, extra_options={"SmoothQuant": True, "SmoothQuantAlpha": 0.5, "SmoothQuantFolding": True}
)
```
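As in the other examples in this document, the resulting config is then passed to `quantize`; a one-line sketch using the same `model` and `output_model_path` names:
```python
quantize(model, output_model_path, qconfig)
```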
@@ -347,10 +347,10 @@ Here is an example:
from neural_compressor_ort import config
from neural_compressor_ort.quantization import tuning
-config = tuning.TuningConfig(config_set=[config.SmoothQuantConfig(alpha=np.arange(0.1, 0.5, 0.05).tolist())])
+qconfig = tuning.TuningConfig(config_set=[config.SmoothQuantConfig(alpha=np.arange(0.1, 0.5, 0.05).tolist())])
best_model = tuning.autotune(
    model_input=model,
-    tune_config=config,
+    tune_config=qconfig,
    eval_fn=eval_fn,
    calibration_data_reader=data_reader,
)
@@ -363,7 +363,7 @@ Here is an example:
from neural_compressor_ort import config
from neural_compressor_ort.quantization import quantize
-config = config.StaticQuantConfig(
+qconfig = config.StaticQuantConfig(
    data_reader,
    extra_options={
        "SmoothQuant": True,
@@ -377,7 +377,7 @@ config = config.StaticQuantConfig(
},
},
)
-quantize(model, output_model_path, config)
+quantize(model, output_model_path, qconfig)
```
## Reference
@@ -112,7 +112,7 @@

# load model
tokenizer = transformers.LlamaTokenizer.from_pretrained(args.tokenizer)
-config = transformers.LlamaConfig.from_pretrained(args.model_path)
+model_config = transformers.LlamaConfig.from_pretrained(args.model_path)


def tokenize_function(examples):
@@ -181,7 +181,7 @@ def benchmark(model):

model = optimum_ort.ORTModelForCausalLM(
    session, # pylint: disable=E1121
-    config,
+    model_config,
    use_cache=True if use_cache else False,
    use_io_binding=True if use_cache else False,
)
@@ -259,8 +259,8 @@ def __init__(self,
).numpy().astype("int64")
if use_cache:
    # Create dummy past_key_values for decoder
-    num_attention_heads = config.num_key_value_heads
-    embed_size_per_head = config.hidden_size // config.num_attention_heads
+    num_attention_heads = model_config.num_key_value_heads
+    embed_size_per_head = model_config.hidden_size // model_config.num_attention_heads
    shape = (self.batch_size, num_attention_heads, 0,
             embed_size_per_head)
    key_or_value = np.zeros(shape, dtype=np.float32)
@@ -339,8 +339,8 @@ def __init__(self,
ort_input["position_ids"] = position_ids.numpy()
if use_cache:
    # create dummy past_key_values for decoder first generation step
-    num_attention_heads = config.num_key_value_heads
-    embed_size_per_head = config.hidden_size // config.num_attention_heads
+    num_attention_heads = model_config.num_key_value_heads
+    embed_size_per_head = model_config.hidden_size // model_config.num_attention_heads
    shape = (self.batch_size, num_attention_heads, 0,
             embed_size_per_head)
    key_or_value = np.zeros(shape, dtype=np.float32)
@@ -457,5 +457,5 @@ def rewind(self):
os.path.join(args.output_model, model_name),
save_as_external_data=True,
)
-config.to_json_file(os.path.join(args.output_model, "config.json"),
+model_config.to_json_file(os.path.join(args.output_model, "config.json"),
use_diff=False)
4 changes: 2 additions & 2 deletions neural_compressor_ort/config.py
@@ -354,7 +354,7 @@ def register_supported_configs(cls):

    @classmethod
    def validate(self, user_config: BaseConfig):
-        # TODO(Yi) validate the user config
+        # TODO validate the user config
        pass

    def __add__(self, other: BaseConfig) -> BaseConfig:
@@ -391,7 +391,7 @@ def expand(self) -> List[BaseConfig]:
"global": { "weight_bits": 6}
}
case 2
-# TODO (Yi) to support the expansion of config with `local`
+# TODO to support the expansion of config with `local`
{
"global": {
"weight_bits": [4, 6]
8 changes: 4 additions & 4 deletions neural_compressor_ort/quantization/matmul_nbits_quantizer.py
@@ -151,16 +151,16 @@ def _generate_nc_config(self):
        return nc_config

    def int4_quant_algo(self):
-        config = self._generate_nc_config()
+        qconfig = self._generate_nc_config()

        logger.info(f"start to quantize model with {self.algorithm} algorithm...")
        model = self.model_path or self.model
        if self.algorithm == "RTN":
-            self.model = algos.rtn_quantize_entry(model, config)
+            self.model = algos.rtn_quantize_entry(model, qconfig)
        elif self.algorithm == "GPTQ":
-            self.model = algos.gptq_quantize_entry(model, config, self.algo_config.calibration_data_reader)
+            self.model = algos.gptq_quantize_entry(model, qconfig, self.algo_config.calibration_data_reader)
        elif self.algorithm == "AWQ":
-            self.model = algos.awq_quantize_entry(model, config, self.algo_config.calibration_data_reader)
+            self.model = algos.awq_quantize_entry(model, qconfig, self.algo_config.calibration_data_reader)
        logger.info(f"complete quantization of model with {self.algorithm} algorithm.")

    def process(self):
