support bias shift in outlier suppression+ #1231

Closed
wants to merge 66 commits into from
Commits
453125a
[Algo] fix conflicts
yintong-lu Nov 17, 2023
f72ab8a
[pre-commit.ci] auto fixes from pre-commit.com hooks
pre-commit-ci[bot] Nov 17, 2023
5410d54
Merge branch 'master' into lyt/os
yintong-lu Nov 20, 2023
6e5c2a6
[Algo] code update
yintong-lu Nov 20, 2023
6b81f7b
[Algo] code update 1120
yintong-lu Nov 20, 2023
52a5a2e
[Algo] add new RMSNorm class
yintong-lu Nov 20, 2023
e9223e0
[pre-commit.ci] auto fixes from pre-commit.com hooks
pre-commit-ci[bot] Nov 20, 2023
567f2f7
[Algo] update RMSnorm
yintong-lu Nov 21, 2023
3a0a7d4
[Algo] fix conflicts
yintong-lu Nov 23, 2023
2b5392f
[pre-commit.ci] auto fixes from pre-commit.com hooks
pre-commit-ci[bot] Nov 23, 2023
c820dc6
[Algo] log update
yintong-lu Nov 23, 2023
84e283e
[pre-commit.ci] auto fixes from pre-commit.com hooks
pre-commit-ci[bot] Nov 23, 2023
61275ed
[Algo] fix bug and support mistral models
yintong-lu Nov 29, 2023
d8f1f83
[pre-commit.ci] auto fixes from pre-commit.com hooks
pre-commit-ci[bot] Nov 29, 2023
6999a6a
[Algo] update log
yintong-lu Dec 4, 2023
29f70e6
[pre-commit.ci] auto fixes from pre-commit.com hooks
pre-commit-ci[bot] Dec 4, 2023
9ee80e5
[Algo] log update
yintong-lu Dec 4, 2023
5be85db
[Algo] log update
yintong-lu Dec 4, 2023
eba857c
[pre-commit.ci] auto fixes from pre-commit.com hooks
pre-commit-ci[bot] Dec 4, 2023
8ede118
[Algo] log update
yintong-lu Dec 4, 2023
9b4f3da
[pre-commit.ci] auto fixes from pre-commit.com hooks
pre-commit-ci[bot] Dec 4, 2023
378d2e2
[Algo] log update
yintong-lu Dec 12, 2023
97edb69
[Algo] code update
yintong-lu Dec 12, 2023
ee400a8
[pre-commit.ci] auto fixes from pre-commit.com hooks
pre-commit-ci[bot] Dec 12, 2023
96858c7
[Algo] log update
yintong-lu Dec 12, 2023
798290d
[Algo] update comment
yintong-lu Dec 12, 2023
e4d8ce2
[pre-commit.ci] auto fixes from pre-commit.com hooks
pre-commit-ci[bot] Dec 12, 2023
51ddcf1
[Algo] fix conflicts w.r.t blockwise
yintong-lu Dec 13, 2023
d15d6be
[pre-commit.ci] auto fixes from pre-commit.com hooks
pre-commit-ci[bot] Dec 13, 2023
ff76a2d
[Algo] op-replacement for llama and mistral
yintong-lu Dec 18, 2023
2d2d79a
[Algo] fix bug
yintong-lu Dec 18, 2023
d625d6a
[pre-commit.ci] auto fixes from pre-commit.com hooks
pre-commit-ci[bot] Dec 19, 2023
462cef2
[Algo] reconfigure bias_shift argument
yintong-lu Dec 18, 2023
98525ae
[pre-commit.ci] auto fixes from pre-commit.com hooks
pre-commit-ci[bot] Dec 19, 2023
bc69940
[Algo] reconfigure bias_shift argument
yintong-lu Dec 18, 2023
b99b90b
[pre-commit.ci] auto fixes from pre-commit.com hooks
pre-commit-ci[bot] Dec 19, 2023
410bf4c
minor change
yintong-lu Dec 18, 2023
480435a
[pre-commit.ci] auto fixes from pre-commit.com hooks
pre-commit-ci[bot] Dec 19, 2023
6082a7c
minor change
yintong-lu Dec 18, 2023
33ca458
[pre-commit.ci] auto fixes from pre-commit.com hooks
pre-commit-ci[bot] Dec 19, 2023
9731f60
Merge branch 'master' into lyt/os
yintong-lu Dec 19, 2023
5dbd8e5
[Algo] add ut
yintong-lu Dec 19, 2023
092ef69
[pre-commit.ci] auto fixes from pre-commit.com hooks
pre-commit-ci[bot] Dec 19, 2023
7e325a3
[Algo] format comments
yintong-lu Dec 19, 2023
18b106d
[pre-commit.ci] auto fixes from pre-commit.com hooks
pre-commit-ci[bot] Dec 19, 2023
4a81253
fix bug
yintong-lu Dec 19, 2023
806d4b1
minor change
yintong-lu Dec 19, 2023
2663c0c
move code
yintong-lu Dec 19, 2023
38b068a
[pre-commit.ci] auto fixes from pre-commit.com hooks
pre-commit-ci[bot] Dec 20, 2023
a9f2541
remove comments
yintong-lu Dec 19, 2023
df2bcf5
[pre-commit.ci] auto fixes from pre-commit.com hooks
pre-commit-ci[bot] Dec 20, 2023
dd46b16
fix bug
yintong-lu Dec 19, 2023
a185add
[pre-commit.ci] auto fixes from pre-commit.com hooks
pre-commit-ci[bot] Dec 20, 2023
15fcb53
fix bug
yintong-lu Dec 19, 2023
d309e1c
[pre-commit.ci] auto fixes from pre-commit.com hooks
pre-commit-ci[bot] Dec 20, 2023
fcdf10b
fix bug
yintong-lu Dec 20, 2023
d147589
[pre-commit.ci] auto fixes from pre-commit.com hooks
pre-commit-ci[bot] Dec 20, 2023
fbca88b
minor change
yintong-lu Dec 20, 2023
5535121
fix issues
yintong-lu Dec 20, 2023
9f893ed
fix issues
yintong-lu Dec 20, 2023
ed5e1c8
minor change
yintong-lu Dec 20, 2023
44fecd4
code enhance
yintong-lu Dec 20, 2023
f569b02
[pre-commit.ci] auto fixes from pre-commit.com hooks
pre-commit-ci[bot] Dec 21, 2023
6386961
codestyle fix
yintong-lu Dec 21, 2023
6059af6
rename blockwise arg to avoid itrex ut error
yintong-lu Dec 21, 2023
7be9a66
minor change
yintong-lu Dec 21, 2023
2 changes: 2 additions & 0 deletions neural_compressor/adaptor/onnxrt.py
@@ -181,6 +181,7 @@ def smooth_quant(
"alpha_step": 0.1,
"shared_criterion": "mean",
"do_blockwise": False,
"enable_bias_shift": False,
},
default_alpha=0.5,
):
@@ -201,6 +202,7 @@ def smooth_quant(
auto_alpha_args: Hyperparameters used to set the alpha search space in SQ auto-tuning.
By default the search space is 0.0-1.0 with step_size 0.1.
do_blockwise: Whether to do blockwise auto-tuning.
enable_bias_shift: Whether to do bias-shifting.
default_alpha: A hyperparameter that is used in SQ auto-tuning; by default it is 0.5.

Returns:
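For context, a minimal sketch of how a caller might turn the new flag on through the SmoothQuant recipe dictionary that feeds these defaults. The `PostTrainingQuantConfig`/`quantization.fit` entry point and the placeholder `model`/`calib_dataloader` objects are assumptions for illustration, not part of this diff:

```python
from neural_compressor import PostTrainingQuantConfig, quantization

# Sketch only: enable bias shift inside the SQ auto-alpha search space.
conf = PostTrainingQuantConfig(
    recipes={
        "smooth_quant": True,
        "smooth_quant_args": {
            "alpha": "auto",
            "auto_alpha_args": {
                "alpha_min": 0.0,
                "alpha_max": 1.0,
                "alpha_step": 0.1,
                "shared_criterion": "mean",
                "do_blockwise": False,
                "enable_bias_shift": True,  # new flag introduced by this PR
            },
        },
    }
)
# model and calib_dataloader are assumed to be defined by the caller.
q_model = quantization.fit(model, conf, calib_dataloader=calib_dataloader)
```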
33 changes: 30 additions & 3 deletions neural_compressor/adaptor/pytorch.py
@@ -1743,6 +1743,7 @@ def smooth_quant(
"alpha_step": 0.1,
"shared_criterion": "mean",
"do_blockwise": False,
"enable_bias_shift": False,
},
default_alpha=0.5,
):
@@ -1763,8 +1764,10 @@ def smooth_quant(
auto_alpha_args: Hyperparameters used to set the alpha search space in SQ auto-tuning.
By default the search space is 0.0-1.0 with step_size 0.1.
do_blockwise determines whether to do blockwise auto-tuning.
enable_bias_shift determines whether to do bias-shifting.
default_alpha: A hyperparameter that is used in SQ auto-tuning; by default it is 0.5.


Returns:
model: A modified fp32 model, inplace=True.
"""
@@ -2017,6 +2020,10 @@ def quantize(self, tune_cfg, model, dataloader, q_func=None):

# For smoothquant optimized model
recipe_cfgs = tune_cfg.get("recipe_cfgs", None)
if "smooth_quant_args" in recipe_cfgs and "auto_alpha_args" in recipe_cfgs["smooth_quant_args"]:
enable_bias_shift = recipe_cfgs["smooth_quant_args"]["auto_alpha_args"].get("enable_bias_shift", False)
else:
enable_bias_shift = False
if (
recipe_cfgs
and recipe_cfgs.get("smooth_quant", False)
@@ -2025,7 +2032,12 @@ def quantize(self, tune_cfg, model, dataloader, q_func=None):
):
return self.qdq_quantize(q_model, tune_cfg)

if recipe_cfgs and recipe_cfgs.get("smooth_quant", False) and recipe_cfgs["smooth_quant_args"]["folding"]:
if (
recipe_cfgs
and recipe_cfgs.get("smooth_quant", False)
and recipe_cfgs["smooth_quant_args"]["folding"]
and not enable_bias_shift
):
self._apply_pre_optimization(q_model, tune_cfg)

# For tensorboard display
@@ -2671,6 +2683,10 @@ def quantize(self, tune_cfg, model, dataloader, q_func=None):

# check smoothquant folding value
recipe_cfgs = tune_cfg.get("recipe_cfgs", None)
if "smooth_quant_args" in recipe_cfgs and "auto_alpha_args" in recipe_cfgs["smooth_quant_args"]:
enable_bias_shift = recipe_cfgs["smooth_quant_args"]["auto_alpha_args"].get("enable_bias_shift", False)
else:
enable_bias_shift = False
if "smooth_quant_args" in recipe_cfgs and "folding" in recipe_cfgs["smooth_quant_args"]:
if recipe_cfgs["smooth_quant_args"]["folding"] is None:
if self.version.release < Version("2.1").release:
@@ -2679,6 +2695,8 @@ def quantize(self, tune_cfg, model, dataloader, q_func=None):
folding = False
else:
folding = recipe_cfgs["smooth_quant_args"]["folding"]
logger.debug(f"SQ Ipex whether to perform bias_shift: {enable_bias_shift}, folding: {folding}")

# Update model parameter when smoothquant folding = False
if (
recipe_cfgs
@@ -2688,7 +2706,7 @@ def quantize(self, tune_cfg, model, dataloader, q_func=None):
):
return self.qdq_quantize(model, q_model, tune_cfg, dataloader, q_func)
# Update model parameter when smoothquant folding = True
if recipe_cfgs and recipe_cfgs.get("smooth_quant", False) and folding:
if recipe_cfgs and recipe_cfgs.get("smooth_quant", False) and folding and not enable_bias_shift:
self._apply_pre_optimization(model, tune_cfg)

assert (
@@ -3514,14 +3532,23 @@ def quantize(self, tune_cfg, model, dataloader, q_func=None):

# For smoothquant optimized model
recipe_cfgs = tune_cfg.get("recipe_cfgs", None)
if "smooth_quant_args" in recipe_cfgs and "auto_alpha_args" in recipe_cfgs["smooth_quant_args"]:
enable_bias_shift = recipe_cfgs["smooth_quant_args"]["auto_alpha_args"].get("enable_bias_shift", False)
else:
enable_bias_shift = False
if (
recipe_cfgs
and recipe_cfgs.get("smooth_quant", False)
and not recipe_cfgs["smooth_quant_args"]["folding"]
and self.approach != "post_training_dynamic_quant"
):
return self.qdq_quantize(q_model, tune_cfg)
if recipe_cfgs and recipe_cfgs.get("smooth_quant", False) and recipe_cfgs["smooth_quant_args"]["folding"]:
if (
recipe_cfgs
and recipe_cfgs.get("smooth_quant", False)
and recipe_cfgs["smooth_quant_args"]["folding"]
and not enable_bias_shift
):
self._apply_pre_optimization(q_model, tune_cfg)

self.tune_cfg = tune_cfg
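The `enable_bias_shift` lookup above is repeated verbatim in three `quantize` paths; purely as an illustration (not code from this PR), the guard boils down to a small helper like the following:

```python
def _get_enable_bias_shift(recipe_cfgs):
    """Hypothetical helper mirroring the repeated lookup: read enable_bias_shift
    from smooth_quant_args/auto_alpha_args, defaulting to False when absent."""
    if not recipe_cfgs:
        return False
    auto_alpha_args = recipe_cfgs.get("smooth_quant_args", {}).get("auto_alpha_args", {})
    return auto_alpha_args.get("enable_bias_shift", False)
```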
2 changes: 2 additions & 0 deletions neural_compressor/adaptor/tensorflow.py
@@ -1839,6 +1839,7 @@ def smooth_quant(
"alpha_step": 0.1,
"shared_criterion": "mean",
"do_blockwise": False,
"enable_bias_shift": False,
},
default_alpha=0.5,
):
@@ -1859,6 +1860,7 @@ def smooth_quant(
auto_alpha_args: Hyperparameters used to set the alpha search space in SQ auto-tuning.
By default the search space is 0.0-1.0 with step_size 0.1.
do_blockwise: Whether to do blockwise auto-tuning.
enable_bias_shift: Whether to do bias-shifting.
default_alpha: A hyperparameter that is used in SQ auto-tuning; by default it is 0.5.

Returns:
44 changes: 44 additions & 0 deletions neural_compressor/adaptor/torch_utils/model_wrapper.py
@@ -593,3 +593,47 @@ def _recover_linear(self):
scale = self.input_scale.view(1, self.input_scale.shape[0])
with torch.no_grad():
self.linear.weight *= scale


class LlamaRMSNorm_bias(torch.nn.Module):
def __init__(self, hidden_size, eps=1e-6, bias=None):
"""LlamaRMSNorm is equivalent to T5LayerNorm.

Add bias attribute and modify forward function for bias-shifting.
"""
super().__init__()
self.weight = torch.nn.Parameter(torch.ones(hidden_size))
self.variance_epsilon = eps
self.bias = bias

def forward(self, hidden_states):
input_dtype = hidden_states.dtype
hidden_states = hidden_states.to(torch.float32)
variance = hidden_states.pow(2).mean(-1, keepdim=True)
hidden_states = hidden_states * torch.rsqrt(variance + self.variance_epsilon)
if self.bias is not None:
return self.weight * hidden_states.to(input_dtype) + self.bias.to(input_dtype)
else:
return self.weight * hidden_states.to(input_dtype)


class MistralRMSNorm_bias(torch.nn.Module):
def __init__(self, hidden_size, eps=1e-6, bias=None):
"""MistralRMSNorm is equivalent to T5LayerNorm.

Add bias attribute and modify forward function for bias-shifting.
"""
super().__init__()
self.weight = torch.nn.Parameter(torch.ones(hidden_size))
self.variance_epsilon = eps
self.bias = bias

def forward(self, hidden_states):
input_dtype = hidden_states.dtype
hidden_states = hidden_states.to(torch.float32)
variance = hidden_states.pow(2).mean(-1, keepdim=True)
hidden_states = hidden_states * torch.rsqrt(variance + self.variance_epsilon)
if hasattr(self, "bias") and self.bias is not None:
return self.weight * hidden_states.to(input_dtype) + self.bias.to(input_dtype)
else:
return self.weight * hidden_states.to(input_dtype)
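
To show how the wrappers above are meant to be used, here is a hedged sketch of swapping a model's RMSNorm modules for the bias-capable variant; the `replace_rmsnorm_with_bias` helper and the `bias_dict` mapping are assumptions for the example, not the PR's actual replacement logic:

```python
from neural_compressor.adaptor.torch_utils.model_wrapper import LlamaRMSNorm_bias

def replace_rmsnorm_with_bias(model, bias_dict):
    """Hypothetical helper: swap named LlamaRMSNorm modules for the bias-capable
    variant so an absorbed bias shift can be re-added after normalization.
    bias_dict maps module names to 1-D bias tensors (or None)."""
    for name, module in list(model.named_modules()):
        if type(module).__name__ == "LlamaRMSNorm":
            new_norm = LlamaRMSNorm_bias(
                hidden_size=module.weight.shape[0],
                eps=module.variance_epsilon,
                bias=bias_dict.get(name),
            )
            new_norm.weight = module.weight  # reuse the original scale parameter
            parent_name, _, child_name = name.rpartition(".")
            parent = model.get_submodule(parent_name) if parent_name else model
            setattr(parent, child_name, new_norm)
    return model
```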