intel · chensuyue · Feb 9, 2026 · Jan 22, 2026 · Jan 22, 2026 · Jan 27, 2026
diff --git a/.azure-pipelines/scripts/ut/run_ut.sh b/.azure-pipelines/scripts/ut/run_ut.sh
@@ -2,14 +2,14 @@
 set -xe
 
 test_part=$1
-
-# install requirements
-echo "##[group]set up UT env..."
 export TQDM_MININTERVAL=60
+echo "##[group]set up UT env..."
 uv pip install pytest-cov pytest-html
-uv pip install -r /auto-round/test/test_cpu/requirements.txt \
-    --extra-index-url https://download.pytorch.org/whl/cpu
-uv pip install torch==2.8.0 torchvision --index-url https://download.pytorch.org/whl/cpu
+uv pip list
+# workaround for ark test: remove auto_round_kernel_xpu from the installed auto-round-lib, skipped when its Location cannot be resolved
+package_path=$(uv pip show auto-round-lib | sed -n 's/^Location: *//p')
+[ -z "$package_path" ] || rm -rf -- "$package_path"/auto_round_kernel/auto_round_kernel_xpu*
+echo "##[endgroup]"
 
 # install latest gguf for ut test
 cd ~ || exit 1

diff --git a/.azure-pipelines/scripts/ut/run_ut_xpu.sh b/.azure-pipelines/scripts/ut/run_ut_xpu.sh
@@ -1,7 +1,6 @@
 #!/bin/bash
 set -xe
 
-# install requirements
 echo "##[group]set up UT env..."
 uv pip install pytest-cov pytest-html
 uv pip list

diff --git a/.azure-pipelines/template/ut-template.yml b/.azure-pipelines/template/ut-template.yml
@@ -53,10 +53,11 @@ steps:
             && uv pip list"
         else 
           docker exec ${{ parameters.utContainerName }} bash -c "cd /auto-round \
-            && uv pip install torch==2.8.0 torchvision --index-url https://download.pytorch.org/whl/cpu \
-            && uv pip install intel-extension-for-pytorch==2.8.0 \
+            && uv pip install torch==2.9.1 torchvision --index-url https://download.pytorch.org/whl/cpu \
+            && uv pip install torch==2.9.1 auto-round-lib \
             && uv pip install -r requirements.txt \
             && uv pip install -r requirements-cpu.txt \
+            && uv pip install -r test/test_cpu/requirements.txt \
             && uv pip list"
         fi
       displayName: "Env Setup"

diff --git a/requirements-cpu.txt b/requirements-cpu.txt
@@ -1,3 +1,3 @@
 numba
 tbb
-intel-extension-for-pytorch
+auto-round-lib
diff --git a/test/test_cpu/core/test_autoround.py b/test/test_cpu/core/test_autoround.py
@@ -438,73 +438,14 @@ def test_fallback_layers(self, tiny_opt_model_path, dataloader):
         quantized_model_path = self.save_folder
 
         autoround.save_quantized(output_dir=quantized_model_path, format="auto_round", inplace=True)
-        quantization_config = AutoRoundConfig(backend="ipex")
 
-        model = AutoModelForCausalLM.from_pretrained(
-            quantized_model_path, device_map="cpu", quantization_config=quantization_config
-        )
+        model = AutoModelForCausalLM.from_pretrained(quantized_model_path, device_map="cpu")
         tokenizer = AutoTokenizer.from_pretrained(quantized_model_path)
         text = "There is a girl who likes adventure,"
         inputs = tokenizer(text, return_tensors="pt").to(model.device)
         res = tokenizer.decode(model.generate(**inputs, max_new_tokens=1)[0])
         shutil.rmtree(self.save_folder, ignore_errors=True)
 
-    def test_not_convert_modules(self):
-        import requests
-        from PIL import Image
-        from transformers import AutoProcessor, Qwen2VLForConditionalGeneration
-
-        from auto_round_extension.ipex.qlinear_ipex_awq import QuantLinear
-
-        model_name = get_model_path("Qwen/Qwen2-VL-2B-Instruct-AWQ")
-        quantization_config = AutoRoundConfig()
-        model = Qwen2VLForConditionalGeneration.from_pretrained(
-            model_name, quantization_config=quantization_config, device_map="cpu", torch_dtype=torch.float16
-        )
-        if transformers_version < version.parse("5.0.0"):
-            assert isinstance(model.visual.blocks[0].attn.qkv, torch.nn.Linear)
-            assert not isinstance(model.visual.merger.mlp[0], QuantLinear)
-        else:
-            assert isinstance(model.model.visual.blocks[0].attn.qkv, torch.nn.Linear)
-            assert not isinstance(model.model.visual.merger.mlp[0], QuantLinear)
-        if hasattr(model.model, "language_model"):
-            assert isinstance(model.model.language_model.layers[0].self_attn.v_proj, QuantLinear)
-        else:
-            assert isinstance(model.model.layers[0].self_attn.v_proj, QuantLinear)
-
-        processor = AutoProcessor.from_pretrained(model_name, size=None)
-        image_url = "https://qianwen-res.oss-cn-beijing.aliyuncs.com/Qwen-VL/assets/demo.jpeg"
-        messages = [
-            {
-                "role": "user",
-                "content": [
-                    {
-                        "type": "image",
-                        "image": image_url,
-                    },
-                    {"type": "text", "text": "Describe this image."},
-                ],
-            }
-        ]
-
-        # Preparation for inference
-        text = processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
-        image_inputs = Image.open(requests.get(image_url, stream=True).raw)
-        inputs = processor(
-            text=[text],
-            images=image_inputs,
-            padding=True,
-            return_tensors="pt",
-        )
-
-        # Inference: Generation of the output
-        generated_ids = model.generate(**inputs, max_new_tokens=1)
-        generated_ids_trimmed = [out_ids[len(in_ids) :] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)]
-        output_text = processor.batch_decode(
-            generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False
-        )
-        print(output_text)
-
     def test_fallback_layers_regex_awq(self, tiny_opt_model_path, dataloader):
         model_name = tiny_opt_model_path
         bits, group_size, sym = 4, 128, True

diff --git a/test/test_cpu/utils/test_generation.py b/test/test_cpu/utils/test_generation.py
@@ -41,20 +41,15 @@ def test_4bits_sym(self, dataloader):
 
         autoround.quantize_and_save(output_dir=quantized_model_path, format="auto_round", inplace=False)
 
-        quantization_config = AutoRoundConfig(backend="ipex")
-        model = AutoModelForCausalLM.from_pretrained(
-            quantized_model_path, device_map="cpu", quantization_config=quantization_config
-        )
+        model = AutoModelForCausalLM.from_pretrained(quantized_model_path, device_map="cpu")
         tokenizer = AutoTokenizer.from_pretrained(quantized_model_path)
         text = "My name is "
         inputs = tokenizer(text, return_tensors="pt").to(model.device)
         res = tokenizer.decode(model.generate(**inputs, max_new_tokens=50)[0])
         print(res)
         assert "!!!" not in res
 
-        model = AutoModelForCausalLM.from_pretrained(
-            quantized_model_path, device_map="cpu", quantization_config=quantization_config, torch_dtype=torch.float16
-        )
+        model = AutoModelForCausalLM.from_pretrained(quantized_model_path, device_map="cpu", torch_dtype=torch.float16)
         tokenizer = AutoTokenizer.from_pretrained(quantized_model_path)
         text = "There is a girl who likes adventure,"
         inputs = tokenizer(text, return_tensors="pt").to(model.device)