Refactor AWQ algo to enhance memory, computation and support folding=False (#1130)

Signed-off-by: Xin He <xin3.he@intel.com>

---------

Signed-off-by: Xin He <xin3.he@intel.com>
Signed-off-by: Lv, Kaokao <kaokao.lv@intel.com>
Co-authored-by: Lv, Kaokao <kaokao.lv@intel.com>
xin3he and lkk12014402 committed Aug 16, 2023
1 parent 36d0bcb commit 641d42b
Showing 14 changed files with 998 additions and 474 deletions.
5 changes: 4 additions & 1 deletion .azure-pipelines/scripts/codeScan/pyspelling/inc_dict.txt
@@ -2659,9 +2659,11 @@ classDef
bdf
bmm
AWQ
awq
GPTQ
gptq
RTN
awq
rtn
gptq
percdamp
Frantar
@@ -2693,6 +2695,7 @@ hostname
qweight
qconfig
TEQ
teq
WeightOnlyLinear
McKinstry
Migacz
10 changes: 8 additions & 2 deletions docs/source/quantization_weight_only.md
@@ -40,12 +40,18 @@ There are many excellent works for weight only quantization to improve its accuracy
| scheme | ['asym', 'sym'] |
| algorithm | ['RTN', 'AWQ'] |

**RTN arguments**:
| rtn_args | default value | comments |
|:----------:|:-------------:|:-------------------------------------------------------------------:|
| sym_full_range | False | Whether to use -2**(bits-1) in the sym scheme; for example, the 4-bit range becomes [-8, 7] instead of [-7, 7] (see the sketch below this table) |
| return_int | False | Whether to return the compressed model with int data type |
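
A minimal sketch (plain Python arithmetic, not the library API) of the extra value `sym_full_range` unlocks:

```python
# Minimal sketch, not the library API: representable integer ranges for a
# symmetric scheme at bits=4.
bits = 4
default_range = (-(2 ** (bits - 1)) + 1, 2 ** (bits - 1) - 1)  # [-7, 7]
full_range = (-(2 ** (bits - 1)), 2 ** (bits - 1) - 1)         # [-8, 7]
print(default_range, full_range)  # (-7, 7) (-8, 7)
```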

**AWQ arguments**:
| awq_args | default value | comments |
|:----------:|:-------------:|:-------------------------------------------------------------------:|
| auto_scale | True | Whether to search for the best scales based on the activation distribution |
| mse_range | True | Whether to search for the best clip range, scanning ratios from 0.89 to 1.0 in steps of 0.01 |
| n_blocks | 5 | Split the model into n blocks for the AWQ search to avoid out-of-memory |
| folding | False | When False, a mul op may be inserted before a linear layer whose scale cannot be absorbed by the previous layer; when True, only absorbable scales are applied (a usage sketch follows this table) |
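
Putting the two tables together, a sketch of how these recipes might be passed; it assumes the `PostTrainingQuantConfig` weight-only API, and `model` and `dataloader` stand in for user objects:

```python
# A sketch, assuming the PostTrainingQuantConfig weight-only API; the
# recipes keys mirror the rtn_args/awq_args tables above.
from neural_compressor import PostTrainingQuantConfig, quantization

conf = PostTrainingQuantConfig(
    approach="weight_only",
    recipes={
        "rtn_args": {"sym_full_range": False, "return_int": False},
        "awq_args": {"auto_scale": True, "mse_range": True,
                     "n_blocks": 5, "folding": False},
    },
)
q_model = quantization.fit(model, conf, calib_dataloader=dataloader)
```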


**Note**: `group_size=-1` indicates per-channel quantization, with one group per output channel. `group_size=[1-N]` splits the input-channel elements of each output channel into groups of `group_size`.
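
A toy sketch of that grouping (illustrative shapes only, not the library implementation):

```python
# Toy sketch of group-wise scales, not the library implementation.
import torch

out_ch, in_ch, group_size = 8, 256, 128
weight = torch.randn(out_ch, in_ch)

# group_size=128: each output channel's 256 input elements form 2 groups,
# each with its own max-abs scale.
groups = weight.reshape(out_ch, in_ch // group_size, group_size)
print(groups.abs().amax(dim=-1).shape)  # torch.Size([8, 2])

# group_size=-1: per-channel quantization, one scale per output channel.
print(weight.abs().amax(dim=-1).shape)  # torch.Size([8])
```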
88 changes: 26 additions & 62 deletions neural_compressor/adaptor/pytorch.py
@@ -4313,6 +4313,8 @@ def quantize(self, tune_cfg, model, dataloader, calib_func=None):
else:
algorithm = config['weight']['algorithm']
all_algo.add(algorithm)
if len(all_algo):
logger.info(f"All algorithms to do: {all_algo}")
if 'GPTQ' in all_algo:
q_model._model, gptq_config = self.gptq_quantize(
q_model._model, tune_cfg, dataloader
@@ -4322,7 +4324,7 @@ def quantize(self, tune_cfg, model, dataloader, calib_func=None):
q_model._model = self.teq_quantize(q_model._model, tune_cfg, dataloader, calib_func)
if 'AWQ' in all_algo: # includes RTN in AWQ
q_model._model = self.awq_quantize(q_model._model, tune_cfg, dataloader, calib_func)
elif 'RTN' in all_algo:
if 'RTN' in all_algo:
q_model._model = self.rtn_quantize(q_model._model, tune_cfg)

q_model.q_config = copy.deepcopy(self.tune_cfg)
@@ -4331,7 +4333,7 @@ def quantize(self, tune_cfg, model, dataloader, calib_func=None):
return q_model
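
# Standalone illustration (not part of this diff) of why the RTN branch
# above changed from `elif` to `if`: algorithms are configured per op, so
# AWQ and RTN can both appear in all_algo, and ops configured with RTN
# still need their own pass after AWQ handles the ops it covers.
all_algo = {'AWQ', 'RTN'}
if 'AWQ' in all_algo:
    print('awq_quantize: ops configured with AWQ (RTN applied inside)')
if 'RTN' in all_algo:  # was `elif`, which skipped RTN whenever AWQ ran
    print('rtn_quantize: remaining ops configured with RTN')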

def rtn_quantize(self, model, tune_cfg):
logger.debug("quantizing with the round-to-nearest algorithm")
logger.info("quantizing with the round-to-nearest algorithm")
if 'rtn_args' in self.recipes:
sym_full_range = self.recipes['rtn_args'].get('sym_full_range', False)
else:
@@ -4357,7 +4359,7 @@ def rtn_quantize(self, model, tune_cfg):
return model

def gptq_quantize(self, model, tune_cfg, dataloader):
logger.debug("quantizing with the GPTQ algorithm")
logger.info("quantizing with the GPTQ algorithm")
from .torch_utils.weight_only import gptq_quantize
# convert tune_cfg to gptq_quantize's weight config
"""please refer to weight_config which can be analyzed by user-define API function weight_only.gptq_quantize
@@ -4403,7 +4405,7 @@ def gptq_quantize(self, model, tune_cfg, dataloader):
return model, quantization_perm

def teq_quantize(self, model, tune_cfg, dataloader, calib_func):
logger.debug("quantizing with the TEQ algorithm")
logger.info("quantizing with the TEQ algorithm")
from .torch_utils.weight_only import teq_quantize
# get example inputs if not provided.
if self.example_inputs is None: # pragma: no cover
@@ -4490,90 +4492,52 @@ def teq_quantize(self, model, tune_cfg, dataloader, calib_func):
return model

def awq_quantize(self, model, tune_cfg, dataloader, calib_func):
logger.debug("quantizing with the AWQ algorithm")
logger.info("quantizing with the AWQ algorithm")
from .torch_utils.weight_only import awq_quantize
# get example inputs if not provided.
if self.example_inputs is None:
if dataloader is None:
assert False, "Please provide dataloader or example_inputs for AWQ algorithm."
try:
for idx, (input, label) in enumerate(dataloader):
self.example_inputs = input
break
except:
for idx, input in enumerate(dataloader):
self.example_inputs = input
break
from neural_compressor.adaptor.torch_utils.util import get_example_input
assert dataloader is not None, "dataloader or example_inputs is required."
self.example_inputs = get_example_input(dataloader)

# get modules that can be absorbed.
from .torch_utils.smooth_quant import GraphTrace
tg = GraphTrace()
supported_layers = ['Linear']
absorb_to_layer, _ = tg.get_absorb_to_layer(model, self.example_inputs, supported_layers)
if absorb_to_layer is None or absorb_to_layer == {}:
logger.warning('No absorb layer is detected, skip AWQ algorithm')
return model

# got flipped dict from absorb_to_layer dict
flipped_dict = {}
for k, v in absorb_to_layer.items():
for m in v:
flipped_dict[m] = {'absorb_layer': k}
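
# Illustrative shapes with hypothetical layer names (sketch, not part of
# this diff): absorb_to_layer maps a preceding layer to the Linear layers
# whose AWQ scales it can absorb, and flipped_dict inverts that view.
example_absorb_to_layer = {
    'model.layers.0.input_layernorm': [
        'model.layers.0.self_attn.q_proj',
        'model.layers.0.self_attn.k_proj',
    ],
}
example_flipped = {m: {'absorb_layer': k}
                   for k, v in example_absorb_to_layer.items() for m in v}
# example_flipped['model.layers.0.self_attn.q_proj']
#   -> {'absorb_layer': 'model.layers.0.input_layernorm'}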

# check tune_cfg to skip layers without AWQ config
# build weight_config
weight_config = {}
skipped_op_name_set = set()
for key, config in tune_cfg['op'].items():
op_name, op_type = key
if config['weight']['dtype'] == 'fp32':
if op_name in flipped_dict:
absorb_to_layer.pop(flipped_dict[op_name]['absorb_layer'])
continue
weight_config[op_name] = {
'bits': -1, # skip quantization
'group_size': 128,
'scheme': 'asym',
'algorithm': 'RTN',
}
else:
weight_config[op_name] = {}
weight_config[op_name]['bits'] = config['weight']['bits']
weight_config[op_name]['group_size'] = config['weight']['group_size']
weight_config[op_name]['scheme'] = config['weight']['scheme']
if op_name in flipped_dict:
algorithm = config['weight']['algorithm']
if algorithm != 'AWQ':
absorb_to_layer.pop(weight_config[op_name]['absorb_layer'])
else:
skipped_op_name_set.add(op_name)
if skipped_op_name_set:
logger.info("{} is skipped by AWQ algorithm".format(skipped_op_name_set))

# collect AWQ config from tune_cfg for quantization.
if len(absorb_to_layer) == 0:
logger.warning('No absorb layer needs AWQ algorithm, skip it')
else:
logger.debug("**absorb layer**: **absorbed layers**")
for k, v in absorb_to_layer.items():
logger.debug(f"{k}: {v}")
logger.info("Absorbed layers with the same absorb layer use the same config")
weight_config[op_name] = config['weight']
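
# Illustrative result of the loop above, with hypothetical op names
# (sketch, not part of this diff): fp32 ops get bits=-1 so awq_quantize
# leaves them untouched, while quantized ops carry their per-op config.
example_weight_config = {
    'lm_head': {  # weight dtype fp32 -> bits=-1 skips quantization
        'bits': -1, 'group_size': 128, 'scheme': 'asym', 'algorithm': 'RTN',
    },
    'model.layers.0.self_attn.q_proj': {
        'bits': 4, 'group_size': 32, 'scheme': 'asym', 'algorithm': 'AWQ',
    },
}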

if 'awq_args' in self.recipes:
auto_scale = self.recipes['awq_args'].get('auto_scale', True)
mse_range = self.recipes['awq_args'].get('mse_range', True)
n_blocks = self.recipes['awq_args'].get('n_blocks', 5)
folding = self.recipes['awq_args'].get('folding', False)
else:
auto_scale, mse_range = True, True
auto_scale, mse_range, folding = True, True, False
if 'rtn_args' in self.recipes:
sym_full_range = self.recipes['rtn_args'].get('sym_full_range', False)
return_int = self.recipes['rtn_args'].get('return_int', False)
else:
sym_full_range=False
sym_full_range, return_int = False, False
calib_sampling_size = tune_cfg.get('calib_sampling_size', 1)
model = awq_quantize(
model,
bits=-1, # no quantize for op not in weight_config
example_inputs=self.example_inputs,
weight_config=weight_config,
absorb_dict=absorb_to_layer,
dataloader=dataloader,
n_samples=calib_sampling_size,
auto_scale=auto_scale,
mse_range=mse_range,
calib_func=calib_func,
n_blocks=n_blocks,
return_int=False,
folding=folding,
return_int=return_int,
sym_full_range=sym_full_range,
)
return model
