2 changes: 1 addition & 1 deletion auto_round/compressors/mllm/dataset.py
@@ -92,7 +92,7 @@ def __init__(
dataset_path = dataset_path.split("/")[-1]
dataset_name = dataset_path.split("/")[-1]
if dataset_name in self.LLAVA_DATASET:
logger.info(f"use dataset {dataset_name}, downloading ...")
logger.info(f"use dataset {dataset_name}, downloading...")
self.questions = requests.get(self.LLAVA_DATASET[dataset_name], stream=True).json()
else:
raise KeyError(f"{dataset_path} is not support, we support {self.LLAVA_DATASET.keys()}.")
37 changes: 14 additions & 23 deletions auto_round/inference/backend.py
@@ -158,7 +158,7 @@ def fp8_static_scheme_checker(
GPTQ_FORMAT_NO_ZP = ["auto_round", "auto_round:gptqmodel"]
AWQ_FORMAT = ["auto_round:auto_awq"]
LLM_COMPRESSOR_FORMAT = ["auto_round:llm_compressor"]
WOQ_DEFAULT_ACT_BITS = [16, 32]
WOQ_DEFAULT_ACT_BITS = [None, 16, 32]

BackendInfos["auto_gptq:exllamav2"] = BackendInfo(
device=["cuda"],
@@ -173,7 +173,7 @@ def fp8_static_scheme_checker(
group_size=[-1, 32, 64, 128, 256, 512, 1024, 2048],
checkers=[exllamav2_feature_checker],
alias=["gptq", "auto_gptq", "exllamav2", "gptq:exllamav2", "auto_gptq:exllamav2"],
requirements=["torch<2.6.0", "auto-gptq>=0.7.1"],
requirements=["auto-gptq>=0.7.1"],
)

BackendInfos["auto_gptq:tritonv2"] = BackendInfo(
@@ -188,7 +188,7 @@ def fp8_static_scheme_checker(
priority=0,
checkers=[exllamav2_feature_checker],
alias=["auto_gptq:tritonv2"],
requirements=["torch<2.6.0", "auto-gptq>=0.7.1", "triton>=2.0"],
requirements=["auto-gptq>=0.7.1", "triton>=2.0"],
)

BackendInfos["auto_gptq:cuda"] = BackendInfo(
@@ -204,7 +204,6 @@ def fp8_static_scheme_checker(
act_bits=WOQ_DEFAULT_ACT_BITS,
alias=["auto_gptq:cuda"],
requirements=[
"torch<2.6.0",
"auto-gptq>=0.7.1",
],
)
@@ -374,7 +373,7 @@ def fp8_static_scheme_checker(
BackendInfos["gptqmodel:exllamav2"] = BackendInfo(
device=["cuda"],
sym=[True, False],
packing_format=GPTQ_FORMAT,
packing_format=GPTQ_FORMAT_NO_ZP,
bits=[4],
group_size=[-1, 32, 64, 128], ##16 seems has accuracy issue
compute_dtype=["float16", "bfloat16"],
@@ -534,28 +533,20 @@ def check_compatible(
- If the packing format does not match, it must be convertible.
"""
backend = BackendInfos[backend_name]
bits, group_size, sym = config["bits"], config["group_size"], config["sym"]
# Check if device is supported by the backend
if device not in backend.device:
return False

# Check if bit-width is supported
if bits not in backend.bits:
return False

# Check if group_size is valid (if required by backend)
if backend.group_size is not None and group_size not in backend.group_size:
return False

# Check if symmetric/asymmetric quantization is supported
if sym not in backend.sym:
return False

# Check if the format is convertible when packing formats differ
if packing_format in backend.packing_format:
pass
else:
return False
# Check scheme
for key, value in config.items():
backend_value = getattr(backend, key, None)
if backend_value is not None and value not in backend_value:
return False

# Check if device is supported by the backend
if device not in backend.device:
return False

for check in backend.checkers:
if not check(in_features, out_features, config):
@@ -980,7 +971,7 @@ def build_pip_commands(gptq_req, other_reqs):
commands = []

if gptq_req:
commands.append(f"pip install -v '{gptq_req}' --no-build-isolation")
commands.append(f"pip install -v {gptq_req} --no-build-isolation")
try:
require_version("numpy<2.0")
except:
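Note on the `check_compatible` change above: the per-field comparisons (bits, group_size, sym, packing_format) are replaced by one loop that looks up each config key on the `BackendInfo` entry with `getattr` and rejects the backend when the value is not in the allowed list; fields the backend does not define are left unconstrained, and the allowed lists now include `None` (see `WOQ_DEFAULT_ACT_BITS`), which appears to be so that unset values such as `act_bits=None` still pass the membership test. A minimal sketch of that pattern, using a made-up `FakeBackendInfo` dataclass rather than auto_round's real classes:

```python
# Minimal sketch of the generic scheme check; FakeBackendInfo and
# is_scheme_compatible are illustrative names, not auto_round's API.
from dataclasses import dataclass, field
from typing import Optional

@dataclass
class FakeBackendInfo:
    device: list = field(default_factory=lambda: ["cuda"])
    bits: list = field(default_factory=lambda: [4])
    group_size: Optional[list] = field(default_factory=lambda: [-1, 32, 64, 128])
    sym: list = field(default_factory=lambda: [True, False])
    act_bits: list = field(default_factory=lambda: [None, 16, 32])

def is_scheme_compatible(backend: FakeBackendInfo, config: dict, device: str) -> bool:
    # Every key in the layer config must be allowed by the backend; fields the
    # backend does not define (or sets to None) are left unconstrained.
    for key, value in config.items():
        allowed = getattr(backend, key, None)
        if allowed is not None and value not in allowed:
            return False
    return device in backend.device

backend = FakeBackendInfo()
print(is_scheme_compatible(backend, {"bits": 4, "group_size": 128, "sym": True, "act_bits": None}, "cuda"))  # True
print(is_scheme_compatible(backend, {"bits": 8, "group_size": 128, "sym": True}, "cuda"))  # False: 8-bit not allowed
```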
2 changes: 2 additions & 0 deletions auto_round/utils.py
@@ -1190,6 +1190,8 @@ def get_layer_features(layer):
return layer.num_embeddings, layer.embedding_dim
elif deepspeed_exists and type(layer) in (LinearLayer, LinearAllreduce):
return layer.weight.shape[1], layer.weight.shape[0] # (input_dim, output_dim)
elif "FP8Linear" in layer.__class__.__name__:
return layer.in_features, layer.out_features
return None, None # Unsupported layer type


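The `get_layer_features` addition above matches FP8 linear layers by class name rather than by type, so no import of the package defining them is required. A rough illustration, with `DummyFP8Linear` as a hypothetical stand-in for a real FP8 linear module:

```python
# Rough illustration of the class-name dispatch; DummyFP8Linear is a stand-in,
# not the real FP8Linear class the check targets.
import torch

class DummyFP8Linear(torch.nn.Module):
    def __init__(self, in_features: int, out_features: int):
        super().__init__()
        self.in_features = in_features
        self.out_features = out_features  # real FP8 linears also carry scales, omitted here

def get_layer_features_sketch(layer):
    if isinstance(layer, torch.nn.Linear):
        return layer.in_features, layer.out_features
    if isinstance(layer, torch.nn.Embedding):
        return layer.num_embeddings, layer.embedding_dim
    # Match by class name so FP8 wrappers are handled without a hard dependency
    # on the package that defines them.
    if "FP8Linear" in layer.__class__.__name__:
        return layer.in_features, layer.out_features
    return None, None  # unsupported layer type

print(get_layer_features_sketch(DummyFP8Linear(4096, 11008)))  # (4096, 11008)
```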
2 changes: 1 addition & 1 deletion auto_round_extension/torch/qlinear_torch.py
@@ -132,7 +132,7 @@ def pack_248_bits(self, linear, scales, zeros, g_idx=None, device=None):
i = 0
col = 0
while col < qzeros.shape[1]:
packed_zeros = torch.tensor(zeros[:, i : i + (32 // self.bits)]).to(dtype=torch.int32)
packed_zeros = zeros[:, i : i + (32 // self.bits)].clone().to(dtype=torch.int32)
shifts = torch.arange(0, (32 // self.bits)) * self.bits
shifted = packed_zeros << shifts
qzeros[:, col] |= shifted.sum(dim=-1)
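The `pack_248_bits` change swaps `torch.tensor(existing_tensor)` for `.clone().to(...)`; both produce a copy, but constructing a tensor from an existing tensor triggers a UserWarning recommending exactly this form. The same fix is applied in `qlinear_torch_zp.py` below. A small self-contained demo of the packing pattern involved, with invented shapes and values:

```python
# Self-contained demo of the zero-point packing touched here; shapes and values
# are invented. zeros.clone().to(torch.int32) copies like torch.tensor(zeros)
# did, but without the "copy construct from a tensor" UserWarning.
import torch

bits = 4
per_word = 32 // bits                                  # 8 zero points per int32 word
zeros = torch.randint(0, 2**bits, (2, per_word))       # 2 rows of 4-bit zero points

packed_zeros = zeros.clone().to(dtype=torch.int32)     # preferred over torch.tensor(zeros)
shifts = torch.arange(0, per_word) * bits              # 0, 4, 8, ..., 28
qzeros_col = (packed_zeros << shifts).sum(dim=-1)      # one packed word per row

# Round-trip check: unpack and compare against the originals.
unpacked = (qzeros_col.unsqueeze(-1) >> shifts) & (2**bits - 1)
assert bool((unpacked == zeros).all())
print(qzeros_col)
```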
2 changes: 1 addition & 1 deletion auto_round_extension/torch/qlinear_torch_zp.py
@@ -132,7 +132,7 @@ def pack_248_bits(self, linear, scales, zeros, g_idx=None, device=None):
i = 0
col = 0
while col < qzeros.shape[1]:
packed_zeros = torch.tensor(zeros[:, i : i + (32 // self.bits)]).to(dtype=torch.int32)
packed_zeros = (zeros[:, i : i + (32 // self.bits)]).clone().to(dtype=torch.int32)
shifts = torch.arange(0, (32 // self.bits)) * self.bits
shifted = packed_zeros << shifts
qzeros[:, col] |= shifted.sum(dim=-1)
106 changes: 53 additions & 53 deletions test/test_cuda/test_vllm.py
@@ -21,56 +21,56 @@
]


@pytest.mark.skipif(
not current_platform.is_cpu() and not current_platform.is_xpu() and not current_platform.is_cuda(),
reason="only supports CPU/XPU/CUDA backend.",
)
@pytest.mark.parametrize("model", MODELS)
def test_auto_round(model):
# Sample prompts.
prompts = [
"The capital of France is",
"The future of AI is",
]
# Create a sampling params object.
sampling_params = SamplingParams(temperature=0.8, top_p=0.95)
# Create an LLM.
QUANTIZATION = "auto-round"
llm = LLM(model=model, quantization=QUANTIZATION, trust_remote_code=True, tensor_parallel_size=1)
# Generate texts from the prompts.
# The output is a list of RequestOutput objects
# that contain the prompt, generated text, and other information.
outputs = llm.generate(prompts, sampling_params)
# Print the outputs.
for output in outputs:
prompt = output.prompt
generated_text = output.outputs[0].text
if "France" in prompt:
assert "Paris" in generated_text


@pytest.mark.parametrize("model", MODELS)
def test_vllm_lm_eval(model):
if shutil.which("auto-round") is None:
pytest.skip("auto-round CLI not available")

env = os.environ.copy()
env["VLLM_SKIP_WARMUP"] = "true"
env["VLLM_WORKER_MULTIPROC_METHOD"] = "spawn"

cmd = [
"auto-round",
"--model",
model,
"--eval",
"--tasks",
"lambada_openai",
"--eval_bs",
"8",
"--limit",
"10",
"--vllm",
]

proc = subprocess.run(cmd, env=env, stdout=subprocess.PIPE, stderr=subprocess.STDOUT, text=True)
assert proc.returncode == 0, f"auto-round failed (rc={proc.returncode}):\n{proc.stdout}"
# @pytest.mark.skipif(
# not current_platform.is_cpu() and not current_platform.is_xpu() and not current_platform.is_cuda(),
# reason="only supports CPU/XPU/CUDA backend.",
# )
# @pytest.mark.parametrize("model", MODELS)
# def test_auto_round(model):
# # Sample prompts.
# prompts = [
# "The capital of France is",
# "The future of AI is",
# ]
# # Create a sampling params object.
# sampling_params = SamplingParams(temperature=0.8, top_p=0.95)
# # Create an LLM.
# QUANTIZATION = "auto-round"
# llm = LLM(model=model, quantization=QUANTIZATION, trust_remote_code=True, tensor_parallel_size=1)
# # Generate texts from the prompts.
# # The output is a list of RequestOutput objects
# # that contain the prompt, generated text, and other information.
# outputs = llm.generate(prompts, sampling_params)
# # Print the outputs.
# for output in outputs:
# prompt = output.prompt
# generated_text = output.outputs[0].text
# if "France" in prompt:
# assert "Paris" in generated_text
#
#
# @pytest.mark.parametrize("model", MODELS)
# def test_vllm_lm_eval(model):
# if shutil.which("auto-round") is None:
# pytest.skip("auto-round CLI not available")
#
# env = os.environ.copy()
# env["VLLM_SKIP_WARMUP"] = "true"
# env["VLLM_WORKER_MULTIPROC_METHOD"] = "spawn"
#
# cmd = [
# "auto-round",
# "--model",
# model,
# "--eval",
# "--tasks",
# "lambada_openai",
# "--eval_bs",
# "8",
# "--limit",
# "10",
# "--vllm",
# ]
#
# proc = subprocess.run(cmd, env=env, stdout=subprocess.PIPE, stderr=subprocess.STDOUT, text=True)
# assert proc.returncode == 0, f"auto-round failed (rc={proc.returncode}):\n{proc.stdout}"