change use_optimum_format=True and add bias (#1431)
Signed-off-by: Xin He <xin3.he@intel.com>
xin3he committed Dec 6, 2023
1 parent 87b3b18 commit 0a06448
Showing 9 changed files with 129 additions and 113 deletions.
15 changes: 8 additions & 7 deletions docs/source/quantization_weight_only.md
@@ -93,18 +93,19 @@ To support low memory inference, Neural Compressor implemented WeightOnlyLinear,
**Export arguments**
| export args | default value | comments |
|:----------:|:-------------:|:-------------------------------------------------------------------:|
| qweight_config_path | None | If need to export model with fp32_model and json file, set the path of qconfig.json |
| use_optimum_format | True | Whether to use the popular format used in [Optimum](https://github.com/huggingface/optimum/blob/e0927976d06d163ed09fe5bd80d013e1cfa0c463/docs/source/llm_quantization/usage_guides/quantization.mdx#L5) |
| sym_full_range | False | Whether to leverage the full compression range under symmetric quantization |
| compression_dtype | torch.int32 | Data type for compressed dtype, select from [torch.int8\|16\|32\|64] |
| compression_dim | 1 | 0 means output channel while 1 means input channel |
| scale_dtype | torch.float32 | Data type for scale and bias |
| use_hf_format | False | Whether to use the popular format present on HuggingFace hub |
| compression_dtype | torch.int32 | Data type for compressed dtype, select from [torch.int8\|16\|32\|64]. It's torch.int32 when use_optimum_format=True |
| compression_dim | 1 | 0 means output channel while 1 means input channel. It's 1 for weight and 0 for zero-point when use_optimum_format=True |
| scale_dtype | torch.float32 | Data type for scale and bias. It's torch.float16 when use_optimum_format=True |
| qweight_config_path | None | Set the path of qconfig.json if you want to export the model from an fp32 model and a JSON config file |
| gptq_config_path | None | Set the path of gptq_config.json if you want to export a GPTQ-quantized model from an fp32 model and a JSON config file |

**Note:** HuggingFace format is quite special, the main differences are as follows:
**Note:** The format used in Optimum is accepted by HuggingFace Transformers, which makes quantized models easy to load and use. However, this format is rather special; the main differences are as follows:

> 1: Compression Dimension: weight = 1, zero = 0 and both are transposed.
> 2: Zero Point: zero_point -= 1 before compression. zero_point is always required even for sym.
> 3: Group Index: Use the same number for a group instead of recording channel order.

### **User Code Example**
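A minimal sketch of the export flow described above, with use_optimum_format left at its new default of True; the quantization config values, the toy model, and the save path are illustrative assumptions rather than code from this commit:

import torch
from neural_compressor import PostTrainingQuantConfig, quantization

fp32_model = torch.nn.Sequential(torch.nn.Linear(64, 64))  # stand-in for a real model

# 4-bit RTN weight-only quantization; the op_type_dict values here are illustrative.
conf = PostTrainingQuantConfig(
    approach="weight_only",
    op_type_dict={".*": {"weight": {"bits": 4, "group_size": 32, "scheme": "asym", "algorithm": "RTN"}}},
)
q_model = quantization.fit(fp32_model, conf)

# Export packed WeightOnlyLinear modules in the Optimum-compatible layout (the new default).
compressed_model = q_model.export_compressed_model(use_optimum_format=True)
torch.save(compressed_model.state_dict(), "quantized_weight.pt")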
4 changes: 3 additions & 1 deletion neural_compressor/adaptor/pytorch.py
@@ -4582,10 +4582,12 @@ def rtn_quantize(self, model, tune_cfg):
enable_full_range = self.recipes["rtn_args"].get("enable_full_range", False)
enable_mse_search = self.recipes["rtn_args"].get("enable_mse_search", False)
group_dim = self.recipes["rtn_args"].get("group_dim", 1)
return_int = self.recipes["rtn_args"].get("return_int", False)
else: # pragma: no cover
enable_full_range = False
enable_mse_search = False
group_dim = 1
return_int = False
from .torch_utils.util import fetch_module, set_module
from .torch_utils.weight_only import rtn_quantize

@@ -4623,7 +4625,7 @@ def rtn_quantize(self, model, tune_cfg):
num_bits,
group_size,
scheme,
return_int=False,
return_int=return_int,
data_type=dtype,
enable_full_range=enable_full_range,
enable_mse_search=enable_mse_search,
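The return_int flag above is read from the rtn_args recipe alongside the existing knobs; a hedged sketch of where such a recipe might be declared (the exact recipes layout passed to PostTrainingQuantConfig is an assumption based on the keys read in this hunk):

from neural_compressor import PostTrainingQuantConfig

# Keys mirror the self.recipes["rtn_args"] lookups above; the values shown are the defaults.
conf = PostTrainingQuantConfig(
    approach="weight_only",
    recipes={
        "rtn_args": {
            "enable_full_range": False,  # leverage the full range under symmetric quantization
            "enable_mse_search": False,  # search for better scales/clipping via MSE
            "group_dim": 1,              # quantization group runs along the input channel
            "return_int": False,         # True keeps packed WeightOnlyLinear modules after tuning
        }
    },
)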
80 changes: 41 additions & 39 deletions neural_compressor/adaptor/torch_utils/model_wrapper.py
@@ -217,10 +217,10 @@ def __init__(
compression_dim=1,
g_idx=False,
device="cpu",
use_hf_format=False,
use_optimum_format=True,
):
super().__init__()
self.use_hf_format = use_hf_format
self.use_optimum_format = use_optimum_format
self.dtype = dtype
if "int" not in self.dtype: # for nf4, fp4
from neural_compressor.adaptor.torch_utils.weight_only import FLOAT_MAPPING, INT_MAPPING
@@ -245,13 +245,13 @@ def __init__(
dtype_bits_mapping = {torch.int8: 8, torch.int16: 16, torch.int32: 32, torch.int64: 64}
self.compress_bits = dtype_bits_mapping[compression_dtype]
self.n_pack = self.compress_bits // self.bits
self.compressed_dtype = compression_dtype
self.float_type = scale_dtype
# K is input channel, N is output channel
assert compression_dim in [0, 1], (
"Only support 0 or 1 as compression dimension, " + "0 is output channel, 1 is input channel."
)
if self.use_hf_format:
if self.use_optimum_format:
self.float_type = torch.float16
self.compressed_dtype = torch.int32
self.register_buffer(
"scales",
torch.zeros(
@@ -276,7 +276,10 @@ def __init__(
).to(device),
)
self.qzeros = self.qzeros.T
self.register_buffer("bias", torch.zeros(self.out_features, dtype=self.float_type).to(device))
else:
self.compressed_dtype = compression_dtype
self.float_type = scale_dtype
self.register_buffer(
"scales",
torch.zeros(
@@ -316,18 +319,18 @@ def __init__(
dtype=self.compressed_dtype,
).to(device),
)
if bias:
self.register_buffer("bias", torch.zeros(self.out_features, dtype=self.float_type).to(device))
else:
self.bias = None
if g_idx:
self.register_buffer("g_idx", torch.zeros(in_features, dtype=torch.int32).to(device))
else:
self.g_idx = None
if bias:
self.register_buffer("bias", torch.zeros(self.out_features, dtype=self.float_type).to(device))
else:
self.bias = None
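For orientation, the packing ratio set up in this constructor is plain integer division of the container width by the weight width; a tiny sketch with assumed values:

import torch

bits = 4                          # e.g. int4 weights
compression_dtype = torch.int32   # the container enforced when use_optimum_format=True
dtype_bits_mapping = {torch.int8: 8, torch.int16: 16, torch.int32: 32, torch.int64: 64}
compress_bits = dtype_bits_mapping[compression_dtype]
n_pack = compress_bits // bits    # 32 // 4 = 8 quantized values per packed word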

def pack(self, int_weight, scale, zp, bias, g_idx=None):
int_weight = int_weight.to(self.device)
if self.use_hf_format and zp is None:
if self.use_optimum_format and zp is None:
# to avoid overflow
int_weight = int_weight.type(torch.int32)
shift_bias = 2 ** (self.bits - 1)
@@ -339,13 +342,13 @@ def pack(self, int_weight, scale, zp, bias, g_idx=None):
if g_idx is not None:
assert hasattr(self, "g_idx"), "g_idx is not set when initializing."
self.g_idx = g_idx.type(torch.int32).to(self.device)
if self.use_hf_format:
if self.use_optimum_format:
invperm = torch.argsort(self.g_idx)
self.g_idx = invperm // self.groupsize
self.g_idx = self.g_idx.type(torch.int32).to(self.device)
assert scale.shape == self.scales.shape, "Scale shape is mismatched."
self.scales = scale.type(self.float_type).to(self.device)
if not self.use_hf_format and self.compression_dim == 0:
if not self.use_optimum_format and self.compression_dim == 0:
int_weight = int_weight.T
self.qweight = self.qweight.T
origin_shape = int_weight.shape
@@ -362,14 +365,14 @@ def pack(self, int_weight, scale, zp, bias, g_idx=None):
tmp[:, e] &= mask
tmp[:, e] = tmp[:, e] << (self.bits * e)
self.qweight[:, j] |= tmp[:, e]
if not self.use_hf_format and self.compression_dim == 0:
if not self.use_optimum_format and self.compression_dim == 0:
self.qweight = self.qweight.T

if zp is not None:
zp = zp.to(self.device)
if self.use_hf_format:
if self.use_optimum_format:
zp -= 1
if self.use_hf_format or self.compression_dim == 0:
if self.use_optimum_format or self.compression_dim == 0:
zp = zp.T
self.qzeros = self.qzeros.T
assert hasattr(self, "qzeros"), "zp is not set when initializing."
@@ -382,23 +385,19 @@ def pack(self, int_weight, scale, zp, bias, g_idx=None):
tmp[:, e] &= mask
tmp[:, e] = tmp[:, e] << (self.bits * e)
self.qzeros[:, j] |= tmp[:, e]
if self.use_hf_format or self.compression_dim == 0:
if self.use_optimum_format or self.compression_dim == 0:
self.qzeros = self.qzeros.T
if self.use_hf_format:
if self.use_optimum_format:
self.scales = self.scales.T
self.qweight = self.qweight.T
self.g_idx = self.g_idx
self.qzeros = self.qzeros.T
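The shift-and-OR loop above packs n_pack low-bit values into each compressed word; a self-contained sketch of the same idea for 4-bit codes in int32 (shapes and data are made up for illustration):

import torch

bits, n_pack = 4, 8                                            # 8 x int4 per int32 word
int_weight = torch.randint(0, 16, (2, 16), dtype=torch.int32)  # unsigned 4-bit codes
mask = torch.tensor(2**bits - 1, dtype=torch.int32)            # 0b1111

qweight = torch.zeros(2, 16 // n_pack, dtype=torch.int32)
for j in range(qweight.shape[1]):
    tmp = int_weight[:, j * n_pack : (j + 1) * n_pack].clone()
    for e in range(tmp.shape[1]):
        tmp[:, e] &= mask                    # keep only the low 4 bits
        tmp[:, e] = tmp[:, e] << (bits * e)  # move code e into bit positions [4e, 4e+4)
        qweight[:, j] |= tmp[:, e]           # OR it into the packed word
# The top slot may wrap to a negative int32; only the bit pattern matters.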

def recover(self):
logger.debug(f"Recovering {self} weight")
if self.use_hf_format:
# Prevent broken id links of self.scales and self.scales
self.scales = self.scales.T
self.qweight = self.qweight.T
self.g_idx = self.g_idx
self.qzeros = self.qzeros.T
device = self.scales.device
scales = self.scales.T if self.use_optimum_format else self.scales
qweight = self.qweight.T if self.use_optimum_format else self.qweight

device = scales.device
fp32_weight = torch.zeros(self.out_features, self.in_features, dtype=self.float_type).to(device)
if self.g_idx is None:
# used for recovering fp32_weight
@@ -410,8 +409,7 @@ def recover(self):
weight_dtype = torch.int8
# unpack weight
weight = torch.zeros(self.out_features, self.in_features, dtype=weight_dtype).to(device)
qweight = self.qweight
if not self.use_hf_format and self.compression_dim == 0:
if not self.use_optimum_format and self.compression_dim == 0:
weight = weight.T
qweight = qweight.T
origin_shape = weight.shape
@@ -427,7 +425,7 @@ def recover(self):
if weight_dtype == torch.uint8:
tmp &= mask # remove sign bit
weight[:, index] = tmp.type(weight_dtype)
if not self.use_hf_format and self.compression_dim == 0:
if not self.use_optimum_format and self.compression_dim == 0:
weight = weight.T
if "int" not in self.dtype:
new_weight = torch.zeros(self.out_features, self.in_features).to(device)
@@ -437,9 +435,9 @@ def recover(self):
# unpack zero_point
if hasattr(self, "qzeros"):
zp_dtype = self.compressed_dtype # to avoid overflow when weight-zp
zp = torch.zeros(self.scales.shape, dtype=zp_dtype).to(device)
qzeros = self.qzeros
if self.use_hf_format or self.compression_dim == 0:
zp = torch.zeros(scales.shape, dtype=zp_dtype).to(device)
qzeros = self.qzeros.T if self.use_optimum_format else self.qzeros
if self.use_optimum_format or self.compression_dim == 0:
zp = zp.T
qzeros = qzeros.T
origin_shape = zp.shape
@@ -454,30 +452,34 @@ def recover(self):
tmp = tmp >> self.compress_bits - self.bits
tmp &= mask
zp[:, index] = tmp.type(zp_dtype)
if self.use_hf_format or self.compression_dim == 0:
if self.use_optimum_format or self.compression_dim == 0:
zp = zp.T
if self.use_hf_format:
if self.use_optimum_format:
# zp -= 1 may cause zp == -1, after recover it becomes 2**self.bits - 1
zp += 1
zp = torch.where(zp > (2**self.bits - 1), 0, zp)
# recover fp32 weight with int_weight, scale, and zero_point
for idx in range(self.in_features):
fp32_weight[:, idx] = (weight[:, idx] - zp[:, self.g_idx[idx]]) * self.scales[:, self.g_idx[idx]]
fp32_weight[:, idx] = (weight[:, idx] - zp[:, self.g_idx[idx]]) * scales[:, self.g_idx[idx]]
else:
# recover fp32 weight with int_weight, scale
for idx in range(self.in_features):
fp32_weight[:, idx] = weight[:, idx] * self.scales[:, self.g_idx[idx]]
fp32_weight[:, idx] = weight[:, idx] * scales[:, self.g_idx[idx]]
return fp32_weight
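The recovery loop above is standard asymmetric dequantization, fp32 = (q - zp) * scale, with g_idx mapping each input channel to its quantization group; a small sketch with assumed shapes:

import torch

out_features, in_features, group_size = 8, 32, 16
weight = torch.randint(0, 16, (out_features, in_features), dtype=torch.int32)            # unpacked codes
zp = torch.randint(0, 16, (out_features, in_features // group_size), dtype=torch.int32)  # per-group zero points
scales = torch.rand(out_features, in_features // group_size)                             # per-group scales
g_idx = torch.arange(in_features) // group_size                                          # channel -> group id

fp32_weight = torch.zeros(out_features, in_features)
for idx in range(in_features):
    fp32_weight[:, idx] = (weight[:, idx] - zp[:, g_idx[idx]]) * scales[:, g_idx[idx]]
# Equivalently, vectorized: fp32_weight = (weight - zp[:, g_idx]) * scales[:, g_idx]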

def forward(self, input):
weight = self.recover()
device = self.scales.device
if weight.dtype == torch.float16 and device.type == "cpu":
weight = weight.float()
self.bias = self.bias.float() if self.bias is not None else None
if level == DEBUG:
if not hasattr(self, "weight"):
self.weight = self.recover()
self.weight = weight
input = input.type(self.weight.dtype)
logger.debug(f"Calculating {self}")
return F.linear(input, self.weight, self.bias)
else:
weight = self.recover()
input = input.type(weight.dtype)
return F.linear(input, weight, self.bias)

@@ -489,8 +491,8 @@ def extra_repr(self) -> str:
self.groupsize,
self.bias is not None,
)
if self.use_hf_format:
tmp_str += ", use_hf_format=True"
if self.use_optimum_format:
tmp_str += ", use_optimum_format=True"
return tmp_str


4 changes: 2 additions & 2 deletions neural_compressor/adaptor/torch_utils/weight_only.py
@@ -396,7 +396,7 @@ def rtn_quantize(
compression_dim = kwargs.get("compression_dim", 1)
scale_dtype = kwargs.get("scale_dtype", torch.float32)
device = kwargs.get("device", "cpu")
use_hf_format = kwargs.get("use_hf_format", False)
use_optimum_format = kwargs.get("use_optimum_format", True)
for name, m in model.named_modules():
if m.__class__.__name__ not in supported_layers:
continue
@@ -452,7 +452,7 @@ def rtn_quantize(
compression_dim=compression_dim,
scale_dtype=scale_dtype,
device=device,
use_hf_format=use_hf_format,
use_optimum_format=use_optimum_format,
)
new_module.pack(int_weight, scale, zp, m.bias)
if name == "":
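A hedged sketch of driving this helper directly; the keyword names mirror the arguments visible in this diff, but the adaptor/config path above is the supported entry point and the exact signature here is an assumption:

import torch
from neural_compressor.adaptor.torch_utils.weight_only import rtn_quantize

model = torch.nn.Sequential(torch.nn.Linear(64, 64))
# return_int=True keeps packed WeightOnlyLinear modules; use_optimum_format is forwarded via kwargs.
q_model = rtn_quantize(
    model,
    num_bits=4,
    group_size=32,
    scheme="asym",
    return_int=True,
    use_optimum_format=True,
)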
10 changes: 5 additions & 5 deletions neural_compressor/model/torch_model.py
@@ -459,7 +459,7 @@ def export_compressed_model(
scale_dtype=torch.float32,
gptq_config_path=None,
device="cpu",
use_hf_format=False,
use_optimum_format=True,
):
"""Convert Linear to WeightOnlyLinear for low memory inference.
@@ -475,7 +475,7 @@ def export_compressed_model(
Defaults to torch.float32.
gptq_config_path (str, optional): Path of gptq_config.json. Defaults to None.
device (str, optional): choose device for compression. Defaults to cpu.
use_hf_format (bool, optional): use the popular huggingface compression format.
use_optimum_format (bool, optional): use the popular huggingface compression format.
1: compression_dim: weight = 1, zeros = 0 and both are transposed.
2: zeros -= 1 before compression. Why we need it?
3: g_idx: use same number for one group instead of recording the channel order.
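A worked illustration of point 2 as implemented by pack() and recover() above, assuming 4-bit weights:

bits = 4
zp_original = 0                    # an asymmetric zero point of 0
zp_stored = zp_original - 1        # -1, which packs as the 4-bit pattern 0b1111 (i.e. 15)
zp_unpacked = 15                   # value read back from the packed field
zp_recovered = zp_unpacked + 1     # 16 > 2**bits - 1 ...
zp_recovered = 0 if zp_recovered > 2**bits - 1 else zp_recovered  # ... so it wraps back to 0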
@@ -520,7 +520,7 @@ def export_compressed_model(
compression_dim=compression_dim,
scale_dtype=scale_dtype,
device=device,
use_hf_format=use_hf_format,
use_optimum_format=use_optimum_format,
)
set_module(self.model, k, new_module)
continue
@@ -551,7 +551,7 @@ def export_compressed_model(
compression_dim=compression_dim,
scale_dtype=scale_dtype,
device=device,
use_hf_format=use_hf_format,
use_optimum_format=use_optimum_format,
)
new_module.pack(int_weight, gptq_scale, gptq_zp, m.bias, gptq_perm)
set_module(self.model, k, new_module)
Expand All @@ -578,7 +578,7 @@ def export_compressed_model(
compression_dim=compression_dim,
scale_dtype=scale_dtype,
device=device,
use_hf_format=use_hf_format,
use_optimum_format=use_optimum_format,
)
set_module(self.model, k, mod)
return self.model
