diff --git a/neural_compressor/transformers/quantization/utils.py b/neural_compressor/transformers/quantization/utils.py
index 0ab18b91b0c..f09262fb01d 100644
--- a/neural_compressor/transformers/quantization/utils.py
+++ b/neural_compressor/transformers/quantization/utils.py
@@ -660,9 +660,13 @@ def convert_to_GPTQ_checkpoints(model, quantization_config):
             new_module.n_pack = 32 // bits
             scales = module._op_context.get_scales().t().contiguous()
             bias = module._op_context.get_bias()
-            qzeros = new_module.pack_tensor_with_numpy(
-                module._op_context.get_zero_points().t().to(torch.uint8) - 1
-            ).contiguous()
+            qzeros = module._op_context.get_zero_points().t().to(torch.uint8)
+            # For group_size = -1, scales and qzeros come back as 1-D tensors
+            if len(scales.shape) == 1:
+                scales = scales.unsqueeze(0)
+            if len(qzeros.shape) == 1:
+                qzeros = qzeros.unsqueeze(0)
+            qzeros = new_module.pack_tensor_with_numpy(qzeros - 1).contiguous()
             g_idx = module._op_context.get_g_idx()
             new_module.qweight = qweight
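
For readers less familiar with the per-channel path: below is a minimal, standalone sketch of the shape normalization this hunk adds, assuming only `torch`. The `_ensure_2d` helper and the tensor shapes are illustrative, not part of the library.

```python
import torch


def _ensure_2d(t: torch.Tensor) -> torch.Tensor:
    """Illustrative helper: packing expects a 2-D (n_groups, out_features) tensor."""
    # With group_size = -1 there is a single quantization group, so scales
    # and zero points come back 1-D; promote them to a one-row 2-D tensor.
    return t.unsqueeze(0) if t.dim() == 1 else t


# group_size = -1: one scale / zero point per output channel, stored 1-D.
scales = torch.rand(4)                           # shape (4,)
qzeros = torch.full((4,), 8, dtype=torch.uint8)  # shape (4,)
scales, qzeros = _ensure_2d(scales), _ensure_2d(qzeros)
assert scales.shape == (1, 4) and qzeros.shape == (1, 4)

# group_size > 0: already (n_groups, out_features), left untouched.
grouped = torch.rand(2, 4)
assert _ensure_2d(grouped).shape == (2, 4)
```

The GPTQ checkpoint format stores zero points offset by one, which is why the hunk keeps the `qzeros - 1` before packing; doing the `unsqueeze` first means `pack_tensor_with_numpy` always receives a 2-D input regardless of group size.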