refine autoround export (#1711)

Signed-off-by: changwangss <chang1.wang@intel.com>
intel · Apr 3, 2024 · 4092311
1 parent 7ee7215
commit 4092311
Showing 1 changed file with 48 additions and 14 deletions.
diff --git a/neural_compressor/model/torch_model.py b/neural_compressor/model/torch_model.py
@@ -498,8 +498,7 @@ def export_compressed_model(
             gptq_config = self.gptq_config if hasattr(self, "gptq_config") else {}
 
         autoround_config = self.autoround_config if hasattr(self, "autoround_config") else {}
-
-        if gptq_config or (autoround_config and device == "xpu"):
+        if gptq_config:
             for k, v in weight_config.items():
                 logger.debug(f"Compressing {k} on device {device}")
                 if v["dtype"] == "fp32":
@@ -558,19 +557,54 @@ def export_compressed_model(
                 )
                 new_module.pack(int_weight, gptq_scale, gptq_zp, m.bias, gptq_perm)
                 set_module(self.model, k, new_module)
-        elif autoround_config and (device == "cpu" or device == "auto"):
-            from auto_round.export.export_to_itrex.export import pack_model  # pylint: disable=E0401
+        elif autoround_config:
+            if device == "xpu":
+                for k, v in weight_config.items():
+                    logger.debug(f"Compressing {k} on device {device}")
+                    if v["dtype"] == "fp32":
+                        continue
+                    else:
+                        dtype = v["dtype"]
+                        num_bits = v["bits"]
+                        group_size = v["group_size"]
+                        scheme = v["scheme"]
+                    m = fetch_module(self.model, k)
+                    autoround_conf = autoround_config[k]
+                    fp32_weight = m.weight.data
+                    autoround_scale = torch.tensor(autoround_conf["scale"], dtype=torch.float32)
+                    autoround_zp = None if scheme == "sym" else torch.tensor(autoround_conf["zero"], dtype=torch.int32)
+                    int_weight = quant_weight_w_scale(fp32_weight, autoround_scale, autoround_zp, group_size)
+                    int_weight = int_weight.type(torch.int32)
+                    new_module = WeightOnlyLinear(
+                        m.in_features,
+                        m.out_features,
+                        num_bits,
+                        group_size,
+                        dtype=dtype,
+                        zp=autoround_zp is not None,
+                        bias=m.bias is not None,
+                        g_idx=None,
+                        compression_dtype=compression_dtype,
+                        compression_dim=compression_dim,
+                        scale_dtype=scale_dtype,
+                        device=device,
+                        use_optimum_format=use_optimum_format,
+                    )
+                    new_module.pack(int_weight, autoround_scale, autoround_zp, m.bias, None)
+                    set_module(self.model, k, new_module)
+            else:
+                from auto_round.export.export_to_itrex.export import pack_model  # pylint: disable=E0401
 
-            self.model = pack_model(
-                self.model,
-                weight_config=autoround_config,
-                enable_full_range=enable_full_range,
-                compression_dtype=compression_dtype,
-                compression_dim=compression_dim,
-                device=device,
-                use_optimum_format=use_optimum_format,
-                inplace=True,
-            )
+                self.model = pack_model(
+                    self.model,
+                    weight_config=autoround_config,
+                    enable_full_range=enable_full_range,
+                    compression_dtype=compression_dtype,
+                    compression_dim=compression_dim,
+                    device=device,
+                    use_optimum_format=use_optimum_format,
+                    inplace=True,
+                )
         else:
             for k, v in weight_config.items():
                 logger.debug(f"Compressing {k} on device {device}")