# Copyright 2023 The HuggingFace Team. All rights reserved.
# Licensed under the MIT License.

from dataclasses import dataclass
from typing import Dict, List, Literal, Optional


@dataclass
class BrevitasQuantizationConfig:
"""
QuantizationConfig is the configuration class handling all the Brevitas quantization parameters.
Args:
weights_bitwidth (`int`, defaults to `8`):
Bitwidth of the weights quantization. For example, with `weights_bitwidth=8`, each weight value is quantized on 8 bits.
activations_bitwidth (`Optional[int]`, defaults to `8`):
Bitwidth of the activations quantization.
weights_only (`bool`, defaults to `False`):
If set to `True`, only weights are to be quantized, otherwise activations are quantized as well.
weights_param_method (`str`, defaults to `stats`):
Strategy to use to estimate the quantization parameters (scale, zero-point) for the weights. Two strategies are available:
- `"stats"`: Use min-max to estimate the range to quantize on.
- `"mse"`: Use mean-square error between the unquantized weights and quantized weights to estimate the range to quantize on.
weights_symmetric (`bool`, defaults to `True`):
Whether to use symmetric quantization on the weights.
scale_precision (`str`, defaults to `"float_scale"`):
Precise the constraints on the scale. Can either be `"float_scale"` (arbitrary scales), or `"power_of_two_scale"` (scales constrainted to be a power of 2).
weights_quant_granularity (`str`, defaults to `"per_tensor"`):
The granularity of the quantization of the weights. This parameter can either be:
- `"per_tensor"`: A single scale (and possibly zero-point) is used for one weight matrix.
- `"per_channel"`: Each column (outer dimension) of the weight matrix has its own scale (and possibly zero-point).
- `"per_group"`: Each column of the weight matrix may have several scales, grouped by `weight_group_size`.
weights_group_size (`Optional[int]`, defaults to `None`):
Group size to use for the weights in case `weights_quant_granularity="per_group"`. Defaults to `128` in this case, to `None` otherwise.
quantize_zero_point (`bool`, defaults to `True`):
When set to True, the unquantized value 0.0 is exactly representable as a quantized value: the zero point. When set to False, a quantization range [a, b] is exactly reprensentable (no rounding on a and b), but the unquantized value zero is not exactly representable.
activations_param_method (`List[str]`):
Strategy to use to estimate the quantization parameters (scale, zero-point) for the activations. Two strategies are available:
- `"stats"`: Use min-max to estimate the range to quantize on.
- `"mse"`: Use mean-square error between the unquantized activations and quantized activations to estimate the range to quantize on.
is_static (`bool`, defaults to `False`):
Whether to apply static quantization or dynamic quantization.
activations_symmetric (`bool`, defaults to `False`):
Whether to use symmetric quantization on the activations.
activations_quant_granularity (`str`, defaults to `"per_tensor"`):
The granularity of the quantization of the activations. This parameter can either be `"per_tensor"`, `"per_row"` or `"per_group"`. In case static quantization is used (`is_static=True`), only `"per_tensor"` may be used.
activations_group_size (`int`, defaults to `None`):
Group size to use for the activations in case `activations_quant_granularity="per_group"`. Defaults to `64` in this case, to `None` otherwise.
activations_equalization (`Optional[str]`, defaults to `"cross_layer"`):
Whether to apply activation equalization (SmoothQuant). Possible options are:
- `None`: No activation equalization.
- `"layerwise"`: Apply SmoothQuant as described in https://arxiv.org/abs/2211.10438. The activation rescaling will be added as multiplication node, that is not fused within a preceding layer.
- `"cross_layer"`: Apply SmoothQuant, and fuse the activation rescaling within a preceding layer when possible (example: nn.LayerNorm followed by nn.Linear). This is achieved through a graph capture of the model using [torch.fx](https://pytorch.org/docs/stable/fx.html#module-torch.fx).
apply_weight_equalization (`bool`, defaults to `False`):
Applies weight equalization across layers, following https://arxiv.org/abs/1906.04721. This parameter is useful for models whose activation function is linear or piecewise-linear (like ReLU, used in OPT model), and allows to reduce the quantization error of the weights by balancing scales across layers.
apply_bias_correction (`bool`, defaults to `False`):
Applies bias correction to compensate for changes in activation bias caused by quantization.
apply_gptq (`bool`, defaults to `False`):
Whether to apply GPTQ algorithm for quantizing the weights.
gptq_act_order (`Optional[bool]`, defaults to `None`):
Whether to use activations reordering (act-order, also known as desc-act) when `apply_gptq=True`. If `apply_gptq=True`, defaults to `False`.
layers_to_exclude (`Optional[List]`, defaults to `None`):
Specify the names of the layers that should not be quantized. This should only be the last part of the layer name. If the same name is repeated across multiple layers, they will all be excluded.
If left to None, the last linear layer is automatically identified and excluded.
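
    Example (a minimal usage sketch, not taken from the original file; the values below are purely illustrative):

        # Weight-only, per-group 4-bit quantization with GPTQ.
        config = BrevitasQuantizationConfig(
            weights_bitwidth=4,
            weights_only=True,
            weights_quant_granularity="per_group",
            apply_gptq=True,
        )

        # `__post_init__` fills in the per-group default and the GPTQ act-order default.
        assert config.weights_group_size == 128
        assert config.gptq_act_order is False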
"""
    weights_bitwidth: int = 8
    activations_bitwidth: Optional[int] = 8
    weights_only: bool = False
    weights_param_method: Literal["stats", "mse"] = "stats"
    weights_symmetric: bool = True
    scale_precision: Literal["float_scale", "power_of_two_scale"] = "float_scale"
    weights_quant_granularity: Literal["per_tensor", "per_channel", "per_group"] = "per_tensor"
    weights_group_size: Optional[int] = None
    quantize_zero_point: bool = True
    activations_param_method: Optional[Literal["stats", "mse"]] = "stats"
    is_static: bool = False
    activations_symmetric: Optional[bool] = False
    activations_quant_granularity: Optional[Literal["per_tensor", "per_row", "per_group"]] = "per_tensor"
    activations_group_size: Optional[int] = None
    activations_equalization: Optional[Literal[None, "layerwise", "cross_layer"]] = "cross_layer"
    apply_weight_equalization: bool = False
    apply_bias_correction: bool = False
    apply_gptq: bool = False
    gptq_act_order: Optional[bool] = None
    device: str = "auto"
    layers_to_exclude: Optional[List] = None
    gpu_device_map: Optional[Dict[int, float]] = None
    cpu_device_map: Optional[Dict[str, float]] = None

    def __post_init__(self):
        # Fill in granularity-dependent and GPTQ defaults.
        if self.activations_quant_granularity == "per_group" and self.activations_group_size is None:
            self.activations_group_size = 64
        if self.weights_quant_granularity == "per_group" and self.weights_group_size is None:
            self.weights_group_size = 128
        if self.apply_gptq and self.gptq_act_order is None:
            self.gptq_act_order = False

        # Validate that the requested combination of options is supported.
        if self.is_static and self.activations_quant_granularity != "per_tensor":
            raise ValueError(
                f'Static quantization with activations_quant_granularity="{self.activations_quant_granularity}" is not supported. The quantization granularity must be activations_quant_granularity="per_tensor" when using static quantization.'
            )
        if self.weights_quant_granularity == "per_group" and self.weights_param_method == "mse":
            raise ValueError(
                'The quantization configuration `weights_quant_granularity="per_group"` is not supported along `weights_param_method="mse"`. Per group MSE weight quantization is not supported.'
            )
        if self.scale_precision == "power_of_two_scale" and (
            not self.weights_symmetric or not self.activations_symmetric
        ):
            raise ValueError(
                'The quantization configuration `scale_precision="power_of_two_scale"` is not supported along `weights_symmetric=False` or along `activations_symmetric=False`. Asymmetric quantization with power-of-two scale is not supported.'
            )
        if self.scale_precision == "power_of_two_scale" and self.weights_quant_granularity == "per_group":
            raise ValueError(
                'The quantization configuration `scale_precision="power_of_two_scale"` is not supported along `weights_quant_granularity="per_group"`. Per group quantization with power-of-two scale factors is not supported.'
            )
        if not self.is_static and self.activations_quant_granularity == "per_group" and not self.activations_symmetric:
            raise ValueError(
                'The quantization configuration `activations_quant_granularity="per_group"` is not supported along `activations_symmetric=False`. Asymmetric dynamic per group quantization is not supported.'
            )
        if self.scale_precision == "power_of_two_scale" and not self.is_static:
            raise ValueError(
                'The quantization configuration `scale_precision="power_of_two_scale"` is not supported along `is_static=False`. Dynamic activation quantization with power-of-two scale factor is not supported.'
            )

        # In weight-only mode, activation-related options are irrelevant and are reset to None.
        if self.weights_only:
            self.activations_bitwidth = None
            self.activations_symmetric = None
            self.activations_equalization = None
            self.activations_group_size = None
            self.activations_param_method = None

    def requires_fx_graph(self):
        # Cross-layer activation equalization and weight equalization both operate on a torch.fx graph capture of the model.
        return self.activations_equalization == "cross_layer" or self.apply_weight_equalization