From b1c24935c8abf8e74536cd817e788281a45002c7 Mon Sep 17 00:00:00 2001
From: Yi Liu
Date: Tue, 8 Jul 2025 05:17:52 +0300
Subject: [PATCH 1/2] fix packing

Signed-off-by: Yi Liu
---
 auto_round/export/export_to_autoround/qlinear_triton_act.py | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/auto_round/export/export_to_autoround/qlinear_triton_act.py b/auto_round/export/export_to_autoround/qlinear_triton_act.py
index 59a238c75..a18409130 100644
--- a/auto_round/export/export_to_autoround/qlinear_triton_act.py
+++ b/auto_round/export/export_to_autoround/qlinear_triton_act.py
@@ -141,8 +141,7 @@ def pack(self, linear, scales, zeros, act_scales, w_bf16_to_fp8_scale, g_idx=Non
         else:
             repeat_zeros = zeros

-        intweight = torch.round(W.to(device) / repeat_scales[:,:W.shape[1]] + repeat_zeros[:,:W.shape[1]])
-
+        intweight = torch.round(W.to(device) / repeat_scales[:,:W.shape[1]] + repeat_zeros[:,:W.shape[1]]).to(torch.int32)
         del repeat_scales
         intweight = intweight.reshape(-1, intweight.shape[1] // 32 * self.bits, 32 // self.bits)
         order_map = torch.arange(0, 32 // self.bits, device=device) * self.bits

From fbe9146cb38cf250f2c0a51a86630f03a7bade7d Mon Sep 17 00:00:00 2001
From: Yi Liu
Date: Tue, 8 Jul 2025 06:05:39 +0300
Subject: [PATCH 2/2] fix format

Signed-off-by: Yi Liu
---
 auto_round/export/export_to_autoround/qlinear_triton_act.py | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/auto_round/export/export_to_autoround/qlinear_triton_act.py b/auto_round/export/export_to_autoround/qlinear_triton_act.py
index a18409130..3fd951302 100644
--- a/auto_round/export/export_to_autoround/qlinear_triton_act.py
+++ b/auto_round/export/export_to_autoround/qlinear_triton_act.py
@@ -141,7 +141,9 @@ def pack(self, linear, scales, zeros, act_scales, w_bf16_to_fp8_scale, g_idx=Non
         else:
             repeat_zeros = zeros

-        intweight = torch.round(W.to(device) / repeat_scales[:,:W.shape[1]] + repeat_zeros[:,:W.shape[1]]).to(torch.int32)
+        intweight = torch.round(W.to(device) / repeat_scales[:, : W.shape[1]] + repeat_zeros[:, : W.shape[1]]).to(
+            torch.int32
+        )
         del repeat_scales
         intweight = intweight.reshape(-1, intweight.shape[1] // 32 * self.bits, 32 // self.bits)
         order_map = torch.arange(0, 32 // self.bits, device=device) * self.bits
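
Note on the functional change: the only behavioral fix in PATCH 1/2 is the `.to(torch.int32)` cast on `intweight` (PATCH 2/2 is formatting only). The rounded weights need to be an integer tensor before the reshape / bit-shift packing that follows, since PyTorch's bitwise shift operators are not defined for floating-point tensors. Below is a minimal, self-contained sketch of that packing pattern; the toy shapes, the `clamp`, and the final shift-and-sum step are illustrative assumptions, not code taken verbatim from `qlinear_triton_act.py`.

    import torch

    bits = 4                      # weight bit-width (4-bit example)
    rows, cols = 8, 32            # toy shape; cols must be divisible by 32 // bits

    # Toy stand-ins for W / repeat_scales / repeat_zeros in pack()
    W = torch.randn(rows, cols)
    scales = torch.full((rows, cols), 0.1)
    zeros = torch.full((rows, cols), 8.0)

    # Without the .to(torch.int32) cast, torch.round(...) stays floating point
    # and the bitwise packing below would fail.
    intweight = torch.round(W / scales + zeros).clamp(0, 2**bits - 1).to(torch.int32)

    # Group 32 // bits values per packed 32-bit word, mirroring the reshape in the patch.
    intweight = intweight.reshape(-1, intweight.shape[1] // 32 * bits, 32 // bits)
    order_map = torch.arange(0, 32 // bits) * bits

    # Hypothetical final step (not shown in the hunk): shift each value to its bit
    # offset within the word and sum the group into one packed int32 value.
    packed = (intweight << order_map).sum(dim=-1).to(torch.int32)
    print(packed.shape)  # (rows, cols * bits // 32)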