From 3285944bc917ff6f659384ad0519a7cd4640ec95 Mon Sep 17 00:00:00 2001 From: gc-fu Date: Tue, 28 Oct 2025 14:01:57 +0800 Subject: [PATCH 1/8] update patches Signed-off-by: gc-fu --- vllm/patches/vllm_for_multi_arc.patch | 23452 +++++++----------------- 1 file changed, 7042 insertions(+), 16410 deletions(-) diff --git a/vllm/patches/vllm_for_multi_arc.patch b/vllm/patches/vllm_for_multi_arc.patch index e961e2a..25e1071 100644 --- a/vllm/patches/vllm_for_multi_arc.patch +++ b/vllm/patches/vllm_for_multi_arc.patch @@ -1,5 +1,5 @@ diff --git a/.buildkite/lm-eval-harness/run-lm-eval-gsm-vllm-baseline.sh b/.buildkite/lm-eval-harness/run-lm-eval-gsm-vllm-baseline.sh -index b98d42aa7..b2a1ebef2 100644 +index 792f355c4..af2c24c4c 100644 --- a/.buildkite/lm-eval-harness/run-lm-eval-gsm-vllm-baseline.sh +++ b/.buildkite/lm-eval-harness/run-lm-eval-gsm-vllm-baseline.sh @@ -46,6 +46,6 @@ while getopts "m:b:l:f:t:" OPT; do @@ -10,6 +10,107 @@ index b98d42aa7..b2a1ebef2 100644 + --model_args "pretrained=$MODEL,tensor_parallel_size=$TP_SIZE,add_bos_token=true,distributed_executor_backend=mp,trust_remote_code=true,max_model_len=4096,enforce_eager=true,max_num_batched_tokens=4096" \ --tasks gsm8k --num_fewshot "$FEWSHOT" --limit "$LIMIT" \ --batch_size "$BATCH_SIZE" +diff --git a/.buildkite/release-pipeline.yaml b/.buildkite/release-pipeline.yaml +index 8c6ef7817..a1de41652 100644 +--- a/.buildkite/release-pipeline.yaml ++++ b/.buildkite/release-pipeline.yaml +@@ -1,22 +1,24 @@ + steps: + # aarch64 + CUDA builds. PyTorch 2.8 aarch64 + CUDA wheel is only available on CUDA 12.9 + - label: "Build arm64 wheel - CUDA 12.9" +- depends_on: ~ + id: build-wheel-arm64-cuda-12-9 + agents: + queue: arm64_cpu_queue_postmerge + commands: + # #NOTE: torch_cuda_arch_list is derived from upstream PyTorch build files here: + # https://github.com/pytorch/pytorch/blob/main/.ci/aarch64_linux/aarch64_ci_build.sh#L7 +- - "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --build-arg CUDA_VERSION=12.9.1 --build-arg VLLM_MAIN_CUDA_VERSION=12.9 --build-arg torch_cuda_arch_list='8.7 9.0 10.0+PTX 12.0' --tag vllm-ci:build-image --target build --progress plain -f docker/Dockerfile ." ++ - "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --build-arg CUDA_VERSION=12.9.1 --build-arg torch_cuda_arch_list='8.7 9.0 10.0+PTX 12.0' --tag vllm-ci:build-image --target build --progress plain -f docker/Dockerfile ." 
+ - "mkdir artifacts" + - "docker run --rm -v $(pwd)/artifacts:/artifacts_host vllm-ci:build-image bash -c 'cp -r dist /artifacts_host && chmod -R a+rw /artifacts_host'" + - "bash .buildkite/scripts/upload-wheels.sh" + env: + DOCKER_BUILDKIT: "1" + ++ - block: "Build CUDA 12.8 wheel" ++ key: block-build-cu128-wheel ++ + - label: "Build wheel - CUDA 12.8" +- depends_on: ~ ++ depends_on: block-build-cu128-wheel + id: build-wheel-cuda-12-8 + agents: + queue: cpu_queue_postmerge +@@ -28,8 +30,12 @@ steps: + env: + DOCKER_BUILDKIT: "1" + +- - label: "Build wheel - CUDA 12.6" ++ - block: "Build CUDA 12.6 wheel" ++ key: block-build-cu126-wheel + depends_on: ~ ++ ++ - label: "Build wheel - CUDA 12.6" ++ depends_on: block-build-cu126-wheel + id: build-wheel-cuda-12-6 + agents: + queue: cpu_queue_postmerge +@@ -96,6 +102,8 @@ steps: + depends_on: + - create-multi-arch-manifest + - build-wheel-cuda-12-8 ++ - build-wheel-cuda-12-6 ++ - build-wheel-cuda-12-9 + id: annotate-release-workflow + agents: + queue: cpu_queue_postmerge +diff --git a/.buildkite/scripts/annotate-release.sh b/.buildkite/scripts/annotate-release.sh +index fde48603a..94e0ac239 100755 +--- a/.buildkite/scripts/annotate-release.sh ++++ b/.buildkite/scripts/annotate-release.sh +@@ -14,33 +14,18 @@ buildkite-agent annotate --style 'info' --context 'release-workflow' << EOF + To download the wheel: + \`\`\` + aws s3 cp s3://vllm-wheels/${RELEASE_VERSION}/vllm-${RELEASE_VERSION}-cp38-abi3-manylinux1_x86_64.whl . +-aws s3 cp s3://vllm-wheels/${RELEASE_VERSION}/vllm-${RELEASE_VERSION}-cp38-abi3-manylinux2014_aarch64.whl . +- + aws s3 cp s3://vllm-wheels/${RELEASE_VERSION}+cu126/vllm-${RELEASE_VERSION}+cu126-cp38-abi3-manylinux1_x86_64.whl . +-aws s3 cp s3://vllm-wheels/${RELEASE_VERSION}+cu129/vllm-${RELEASE_VERSION}+cu129-cp38-abi3-manylinux1_x86_64.whl . ++aws s3 cp s3://vllm-wheels/${RELEASE_VERSION}+cu118/vllm-${RELEASE_VERSION}+cu118-cp38-abi3-manylinux1_x86_64.whl . 
+ \`\`\` + + To download and upload the image: + + \`\`\` +-docker pull public.ecr.aws/q9t5s3a7/vllm-release-repo:${BUILDKITE_COMMIT}-x86_64 +-docker pull public.ecr.aws/q9t5s3a7/vllm-release-repo:${BUILDKITE_COMMIT}-aarch64 +- +-docker tag public.ecr.aws/q9t5s3a7/vllm-release-repo:${BUILDKITE_COMMIT}-x86_64 vllm/vllm-openai:x86_64 +-docker tag vllm/vllm-openai:x86_64 vllm/vllm-openai:latest-x86_64 +-docker tag vllm/vllm-openai:x86_64 vllm/vllm-openai:v${RELEASE_VERSION}-x86_64 +-docker push vllm/vllm-openai:latest-x86_64 +-docker push vllm/vllm-openai:v${RELEASE_VERSION}-x86_64 +- +-docker tag public.ecr.aws/q9t5s3a7/vllm-release-repo:${BUILDKITE_COMMIT}-aarch64 vllm/vllm-openai:aarch64 +-docker tag vllm/vllm-openai:aarch64 vllm/vllm-openai:latest-aarch64 +-docker tag vllm/vllm-openai:aarch64 vllm/vllm-openai:v${RELEASE_VERSION}-aarch64 +-docker push vllm/vllm-openai:latest-aarch64 +-docker push vllm/vllm-openai:v${RELEASE_VERSION}-aarch64 +- +-docker manifest create vllm/vllm-openai:latest vllm/vllm-openai:latest-x86_64 vllm/vllm-openai:latest-aarch64 --amend +-docker manifest create vllm/vllm-openai:v${RELEASE_VERSION} vllm/vllm-openai:v${RELEASE_VERSION}-x86_64 vllm/vllm-openai:v${RELEASE_VERSION}-aarch64 --amend +-docker manifest push vllm/vllm-openai:latest +-docker manifest push vllm/vllm-openai:v${RELEASE_VERSION} ++docker pull public.ecr.aws/q9t5s3a7/vllm-release-repo:${BUILDKITE_COMMIT} ++docker tag public.ecr.aws/q9t5s3a7/vllm-release-repo:${BUILDKITE_COMMIT} vllm/vllm-openai ++docker tag vllm/vllm-openai vllm/vllm-openai:latest ++docker tag vllm/vllm-openai vllm/vllm-openai:v${RELEASE_VERSION} ++docker push vllm/vllm-openai:latest ++docker push vllm/vllm-openai:v${RELEASE_VERSION} + \`\`\` + EOF +\ No newline at end of file diff --git a/.github/workflows/ci.yaml b/.github/workflows/ci.yaml new file mode 100644 index 000000000..aef250abe @@ -388,93 +489,8 @@ index 000000000..eaa2f332a + else + echo "✅ All benchmarks passed" + fi -diff --git a/.github/workflows/cleanup_pr_body.yml b/.github/workflows/cleanup_pr_body.yml -deleted file mode 100644 -index d5c6b8d43..000000000 ---- a/.github/workflows/cleanup_pr_body.yml -+++ /dev/null -@@ -1,31 +0,0 @@ --name: Cleanup PR Body -- --on: -- pull_request_target: -- types: [opened, reopened, edited] -- --permissions: -- pull-requests: write -- --jobs: -- update-description: -- runs-on: ubuntu-latest -- -- steps: -- - name: Checkout repository -- uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2 -- -- - name: Set up Python -- uses: actions/setup-python@42375524e23c412d93fb67b49958b491fce71c38 # v5.4.0 -- with: -- python-version: '3.12' -- -- - name: Install Python dependencies -- run: | -- python3 -m pip install --upgrade pip -- python3 -m pip install regex -- -- - name: Update PR description -- env: -- GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} -- run: bash .github/scripts/cleanup_pr_body.sh "${{ github.event.number }}" -diff --git a/.github/workflows/reminder_comment.yml b/.github/workflows/reminder_comment.yml -deleted file mode 100644 -index 16ae1aadb..000000000 ---- a/.github/workflows/reminder_comment.yml -+++ /dev/null -@@ -1,27 +0,0 @@ --name: PR Reminder Comment Bot --permissions: -- pull-requests: write --on: -- pull_request_target: -- types: [opened] --jobs: -- pr_reminder: -- runs-on: ubuntu-latest -- steps: -- - name: Remind to run full CI on PR -- uses: actions/github-script@60a0d83039c74a4aee543508d2ffcb1c3799cdea # v7.0.1 -- with: -- script: | -- github.rest.issues.createComment({ -- owner: 
context.repo.owner, -- repo: context.repo.repo, -- issue_number: context.issue.number, -- body: '👋 Hi! Thank you for contributing to the vLLM project.\n\n' + -- '💬 Join our developer Slack at https://slack.vllm.ai to discuss your PR in #pr-reviews, coordinate on features in #feat- channels, or join special interest groups in #sig- channels.\n\n' + -- 'Just a reminder: PRs would not trigger full CI run by default. Instead, it would only run `fastcheck` CI which starts running only a small and essential subset of CI tests to quickly catch errors. You can run other CI tests on top of those by going to your `fastcheck` build on Buildkite UI (linked in the PR checks section) and unblock them. If you do not have permission to unblock, ping `simon-mo` or `khluu` to add you in our Buildkite org.\n\n' + -- 'Once the PR is approved and ready to go, your PR reviewer(s) can run CI to test the changes comprehensively before merging.\n\n' + -- 'To run CI, PR reviewers can either: Add `ready` label to the PR or enable auto-merge.\n\n' + -- '🚀' -- }) -- env: -- GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} -diff --git a/CMakeLists.txt b/CMakeLists.txt -index 98ed682fe..5dd6e907c 100644 ---- a/CMakeLists.txt -+++ b/CMakeLists.txt -@@ -95,6 +95,10 @@ if (NOT VLLM_TARGET_DEVICE STREQUAL "cuda" AND - NOT VLLM_TARGET_DEVICE STREQUAL "rocm") - if (VLLM_TARGET_DEVICE STREQUAL "cpu") - include(${CMAKE_CURRENT_LIST_DIR}/cmake/cpu_extension.cmake) -+ elseif(VLLM_TARGET_DEVICE STREQUAL "xpu") -+ message(STATUS "Building XPU") -+ set(VLLM_GPU_LANG "SYCL") -+ include(${CMAKE_CURRENT_LIST_DIR}/cmake/xpu_extension.cmake) - else() - return() - endif() diff --git a/benchmarks/backend_request_func.py b/benchmarks/backend_request_func.py -index c7229dbb8..72531f3fc 100644 +index ba7c733be..61a9eeb91 100644 --- a/benchmarks/backend_request_func.py +++ b/benchmarks/backend_request_func.py @@ -18,7 +18,7 @@ from transformers import AutoTokenizer, PreTrainedTokenizer, PreTrainedTokenizer @@ -486,13648 +502,6499 @@ index c7229dbb8..72531f3fc 100644 @dataclass -diff --git a/benchmarks/benchmark_serving.py b/benchmarks/benchmark_serving.py -index c597fb106..5bad6645b 100644 ---- a/benchmarks/benchmark_serving.py -+++ b/benchmarks/benchmark_serving.py -@@ -256,10 +256,11 @@ async def benchmark( - raise ValueError(f"Unknown backend: {backend}") +diff --git a/docker/Dockerfile b/docker/Dockerfile +index d4761e84f..307e9658f 100644 +--- a/docker/Dockerfile ++++ b/docker/Dockerfile +@@ -196,7 +196,6 @@ ARG SCCACHE_S3_NO_CREDENTIALS=0 + + # Flag to control whether to use pre-built vLLM wheels + ARG VLLM_USE_PRECOMPILED="" +-ARG VLLM_MAIN_CUDA_VERSION="" + + # if USE_SCCACHE is set, use sccache to speed up compilation + RUN --mount=type=cache,target=/root/.cache/uv \ +@@ -214,7 +213,6 @@ RUN --mount=type=cache,target=/root/.cache/uv \ + && export SCCACHE_IDLE_TIMEOUT=0 \ + && export CMAKE_BUILD_TYPE=Release \ + && export VLLM_USE_PRECOMPILED="${VLLM_USE_PRECOMPILED}" \ +- && export VLLM_MAIN_CUDA_VERSION="${VLLM_MAIN_CUDA_VERSION}" \ + && export VLLM_DOCKER_BUILD_CONTEXT=1 \ + && sccache --show-stats \ + && python3 setup.py bdist_wheel --dist-dir=dist --py-limited-api=cp38 \ +diff --git a/docker/Dockerfile.xpu b/docker/Dockerfile.xpu +index ef4223525..ffa7c6ea7 100644 +--- a/docker/Dockerfile.xpu ++++ b/docker/Dockerfile.xpu +@@ -62,7 +62,7 @@ FROM vllm-base AS vllm-openai - print("Starting initial single prompt test run...") -+ # set test_output_len=10 to avoid long prompt test run - test_prompt, test_prompt_len, test_output_len, 
test_mm_content = ( - input_requests[0].prompt, - input_requests[0].prompt_len, -- input_requests[0].expected_output_len, -+ 10, - input_requests[0].multi_modal_data, - ) + # install additional dependencies for openai api server + RUN --mount=type=cache,target=/root/.cache/pip \ +- pip install accelerate hf_transfer pytest pytest_asyncio lm_eval[api] modelscope ++ pip install accelerate hf_transfer pytest pytest_asyncio lm_eval[api] 'modelscope!=1.15.0' -diff --git a/benchmarks/benchmark_throughput.py b/benchmarks/benchmark_throughput.py -index 14461121f..e9b9f0b77 100644 ---- a/benchmarks/benchmark_throughput.py -+++ b/benchmarks/benchmark_throughput.py -@@ -44,6 +44,7 @@ def run_vllm( - n: int, - engine_args: EngineArgs, - disable_detokenize: bool = False, -+ do_profile: bool = False, - ) -> tuple[float, Optional[list[RequestOutput]]]: - from vllm import LLM, SamplingParams + RUN --mount=type=cache,target=/root/.cache/pip \ + pip uninstall oneccl oneccl-devel -y +diff --git a/docs/features/quantization/fp8.md b/docs/features/quantization/fp8.md +index 834c03cbe..439e1e0d7 100644 +--- a/docs/features/quantization/fp8.md ++++ b/docs/features/quantization/fp8.md +@@ -134,4 +134,4 @@ print(result[0].outputs[0].text) + ``` -@@ -89,10 +90,14 @@ def run_vllm( - outputs = None - if not use_beam_search: - start = time.perf_counter() -+ if do_profile: -+ llm.start_profile() - outputs = llm.generate( - prompts, sampling_params, lora_request=lora_requests, use_tqdm=True - ) - end = time.perf_counter() -+ if do_profile: -+ llm.stop_profile() - else: - assert lora_requests is None, "BeamSearch API does not support LoRA" - prompts = [request.prompt for request in requests] -@@ -410,6 +415,7 @@ def main(args: argparse.Namespace): - args.n, - EngineArgs.from_cli_args(args), - args.disable_detokenize, -+ args.profile - ) - elif args.backend == "hf": - assert args.tensor_parallel_size == 1 -@@ -647,6 +653,10 @@ def create_argument_parser(): - parser.add_argument( - "--num-prompts", type=int, default=1000, help="Number of prompts to process." - ) -+ parser.add_argument("--profile", -+ action='store_true', -+ default=False, -+ help="whether run with profiler.") - parser.add_argument( - "--hf-max-batch-size", - type=int, -diff --git a/cmake/utils.cmake b/cmake/utils.cmake -index 621179a70..9e1f4e9c7 100644 ---- a/cmake/utils.cmake -+++ b/cmake/utils.cmake -@@ -445,7 +445,7 @@ function (define_gpu_extension_target GPU_MOD_NAME) - GPU - "WITH_SOABI" - "DESTINATION;LANGUAGE;USE_SABI" -- "SOURCES;ARCHITECTURES;COMPILE_FLAGS;INCLUDE_DIRECTORIES;LIBRARIES") -+ "SOURCES;ARCHITECTURES;COMPILE_FLAGS;INCLUDE_DIRECTORIES;LIBRARIES;LINK_FLAGS") + !!! warning +- Currently, we load the model at original precision before quantizing down to 8-bits, so you need enough memory to load the whole model. ++ Currently, by default we load the model at original precision before quantizing down to 8-bits, so you need enough memory to load the whole model. To avoid this, adding `VLLM_OFFLOAD_WEIGHTS_BEFORE_QUANT=1` can allow offloading weights to cpu before quantization and quantized weights will be kept in device. +diff --git a/docs/models/supported_models.md b/docs/models/supported_models.md +index db3dd2c25..7d3577b14 100644 +--- a/docs/models/supported_models.md ++++ b/docs/models/supported_models.md +@@ -340,6 +340,7 @@ th { + | `DeepseekV2ForCausalLM` | DeepSeek-V2 | `deepseek-ai/DeepSeek-V2`, `deepseek-ai/DeepSeek-V2-Chat`, etc. 
| ✅︎ | ✅︎ | ✅︎ | + | `DeepseekV3ForCausalLM` | DeepSeek-V3 | `deepseek-ai/DeepSeek-V3`, `deepseek-ai/DeepSeek-R1`, `deepseek-ai/DeepSeek-V3.1`, etc. | ✅︎ | ✅︎ | ✅︎ | + | `Dots1ForCausalLM` | dots.llm1 | `rednote-hilab/dots.llm1.base`, `rednote-hilab/dots.llm1.inst`, etc. | | ✅︎ | ✅︎ | ++| `DotsOCRForCausalLM` | dots_ocr | `rednote-hilab/dots.ocr` | | ✅︎ | ✅︎ | + | `Ernie4_5ForCausalLM` | Ernie4.5 | `baidu/ERNIE-4.5-0.3B-PT`, etc. | ✅︎ | ✅︎ | ✅︎ | + | `Ernie4_5_MoeForCausalLM` | Ernie4.5MoE | `baidu/ERNIE-4.5-21B-A3B-PT`, `baidu/ERNIE-4.5-300B-A47B-PT`, etc. |✅︎| ✅︎ | ✅︎ | + | `ExaoneForCausalLM` | EXAONE-3 | `LGAI-EXAONE/EXAONE-3.0-7.8B-Instruct`, etc. | ✅︎ | ✅︎ | ✅︎ | +@@ -667,6 +668,9 @@ These models primarily accept the [`LLM.generate`](./generative_models.md#llmgen + | `Qwen2VLForConditionalGeneration` | QVQ, Qwen2-VL | T + IE+ + VE+ | `Qwen/QVQ-72B-Preview`, `Qwen/Qwen2-VL-7B-Instruct`, `Qwen/Qwen2-VL-72B-Instruct`, etc. | ✅︎ | ✅︎ | ✅︎ | + | `Qwen2_5_VLForConditionalGeneration` | Qwen2.5-VL | T + IE+ + VE+ | `Qwen/Qwen2.5-VL-3B-Instruct`, `Qwen/Qwen2.5-VL-72B-Instruct`, etc. | ✅︎ | ✅︎ | ✅︎ | + | `Qwen2_5OmniThinkerForConditionalGeneration` | Qwen2.5-Omni | T + IE+ + VE+ + A+ | `Qwen/Qwen2.5-Omni-3B`, `Qwen/Qwen2.5-Omni-7B` | ✅︎ | ✅︎ | ✅︎ | ++| `Qwen3VLForConditionalGeneration` | Qwen3-VL | T + IE+ + VE+ | `Qwen/Qwen3-VL-4B-Instruct`, etc. | ✅︎ | ✅︎ | ✅︎ | ++| `Qwen3VLMoeForConditionalGeneration` | Qwen3-VL-MOE | T + IE+ + VE+ | `Qwen/Qwen3-VL-30B-A3B-Instruct`, etc. | ✅︎ | ✅︎ | ✅︎ | ++| `Qwen3OmniMoeThinkerForConditionalGeneration` | Qwen3-Omni | T + IE+ + VE+ + A+ | `Qwen/Qwen3-Omni-30B-A3B-Instruct`, `Qwen/Qwen3-Omni-30B-A3B-Thinking` | ✅︎ | ✅︎ | ✅︎ | + | `RForConditionalGeneration` | R-VL-4B | T + IE+ | `YannQi/R-4B` | | ✅︎ | ✅︎ | + | `SkyworkR1VChatModel` | Skywork-R1V-38B | T + I | `Skywork/Skywork-R1V-38B` | | ✅︎ | ✅︎ | + | `SmolVLMForConditionalGeneration` | SmolVLM2 | T + I | `SmolVLM2-2.2B-Instruct` | ✅︎ | | ✅︎ | +@@ -757,8 +761,7 @@ Some models are supported only via the [Transformers backend](#transformers). Th + Our PaliGemma implementations have the same problem as Gemma 3 (see above) for both V0 and V1. - # Add hipify preprocessing step when building with HIP/ROCm. - if (GPU_LANGUAGE STREQUAL "HIP") -@@ -487,6 +487,11 @@ function (define_gpu_extension_target GPU_MOD_NAME) + !!! note +- For Qwen2.5-Omni, reading audio from video pre-processing (`--mm-processor-kwargs '{"use_audio_in_video": true}'`) +- is currently supported on V0 (but not V1), because overlapping modalities is not yet supported in V1. ++ For Qwen2.5-Omni and Qwen3-Omni, reading audio from video pre-processing (`--mm-processor-kwargs '{"use_audio_in_video": true}'`) is currently work in progress and not yet supported. - target_link_libraries(${GPU_MOD_NAME} PRIVATE torch ${GPU_LIBRARIES}) + #### Transcription -+ if (GPU_LANGUAGE STREQUAL "SYCL") -+ target_compile_options(${GPU_MOD_NAME} PRIVATE ${GPU_COMPILE_FLAGS}) -+ target_link_options(${GPU_MOD_NAME} PRIVATE ${GPU_LINK_FLAGS}) -+ endif() -+ - # Don't use `TORCH_LIBRARIES` for CUDA since it pulls in a bunch of - # dependencies that are not necessary and may not be installed. 
- if (GPU_LANGUAGE STREQUAL "CUDA") -diff --git a/cmake/xpu_extension.cmake b/cmake/xpu_extension.cmake +diff --git a/examples/bmg/reasoning.py b/examples/bmg/reasoning.py new file mode 100644 -index 000000000..fd671a6bf +index 000000000..04f91786e --- /dev/null -+++ b/cmake/xpu_extension.cmake -@@ -0,0 +1,62 @@ -+set(CMAKE_EXPORT_COMPILE_COMMANDS ON) -+ -+# -+# Define environment variables for special configurations -+# -+# TODO: detect Intel GPU Architecture(PVC or Arc) to add AOT flag. ++++ b/examples/bmg/reasoning.py +@@ -0,0 +1,27 @@ ++from openai import OpenAI + -+# -+# Check the compile flags -+# -+# append_cmake_prefix_path("intel_extension_for_pytorch" "intel_extension_for_pytorch.cmake_prefix_path") -+# find_package(IPEX REQUIRED) -+# IPEX will overwrite TORCH_LIBRARIES, so re-add torch_python lib. -+append_torchlib_if_found(torch_python) -+# include_directories(${IPEX_INCLUDE_DIRS}) -+set(CMPLR_ROOT $ENV{CMPLR_ROOT}) -+set(CMAKE_CXX_COMPILER icpx) -+set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wno-narrowing") -+set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -O3") -+set(VLLM_EXTRA_INCLUDE_DIRECTORIES ${CMPLR_ROOT}/include/sycl) -+ -+list(APPEND VLLM_GPU_FLAGS "-fsycl" "-fsycl-targets=spir64") -+list(APPEND VLLM_GPU_LINK_FLAGS "-fsycl" "-fsycl-targets=spir64") -+list(APPEND VLLM_LINK_LIBRARIES "sycl" "OpenCL" "pthread" "m" "dl" "dnnl" ) -+ -+# -+# Define extension targets -+# ++# Modify OpenAI's API key and API base to use vLLM's API server. ++openai_api_key = "EMPTY" ++openai_api_base = "http://0.0.0.0:8000/v1" + -+# -+# _C extension -+# -+set(VLLM_EXT_SRC -+ "csrc/xpu/activation_xpu.cpp" -+ "csrc/xpu/attention_xpu.cpp" -+ "csrc/xpu/attention_xpu_fp8.cpp" -+ "csrc/xpu/cache_ops_xpu.cpp" -+ "csrc/xpu/cache_ops_xpu_fp8.cpp" -+ "csrc/xpu/gemm_kernels_xpu.cpp" -+ "csrc/xpu/layernorm_xpu.cpp" -+ "csrc/xpu/pos_encoding_xpu.cpp" -+ "csrc/xpu/utils.cpp" -+ "csrc/xpu/fused_moe.cpp" -+ "csrc/xpu/pybind.cpp") -+ -+define_gpu_extension_target( -+ _C -+ DESTINATION vllm -+ LANGUAGE ${VLLM_GPU_LANG} -+ SOURCES ${VLLM_EXT_SRC} -+ COMPILE_FLAGS ${VLLM_GPU_FLAGS} -+ LINK_FLAGS ${VLLM_GPU_LINK_FLAGS} -+ ARCHITECTURES ${VLLM_GPU_ARCHES} -+ INCLUDE_DIRECTORIES ${VLLM_EXTRA_INCLUDE_DIRECTORIES} -+ LIBRARIES ${VLLM_LINK_LIBRARIES} -+ WITH_SOABI ++client = OpenAI( ++ api_key=openai_api_key, ++ base_url=openai_api_base, +) + -+add_custom_target(default_xpu) -+message(STATUS "Enabling C extension.") -+add_dependencies(default_xpu _C) -+ -diff --git a/csrc/xpu/activation_xpu.cpp b/csrc/xpu/activation_xpu.cpp -new file mode 100644 -index 000000000..6f98ddbb3 ---- /dev/null -+++ b/csrc/xpu/activation_xpu.cpp -@@ -0,0 +1,278 @@ -+// clang-format off -+#ifdef VLLM_DEV -+#undef __SYCL_DEVICE_ONLY__ -+#endif -+#include -+// clang-format on -+#include "xpu_types.h" -+ -+#include -+#include "utils.h" -+ -+template -+__inline__ T silu_xpu(const T& x) { -+ // x * sigmoid(x) -+ return (T)(((float)x) / (1.0f + sycl::exp((float)-x))); -+} -+ -+template -+__inline__ T gelu_xpu(const T& x) { -+ // Equivalent to PyTorch GELU with 'none' approximation. 
-+ // Refer to: -+ // https://github.com/pytorch/pytorch/blob/8ac9b20d4b090c213799e81acf48a55ea8d437d6/aten/src/ATen/native/cuda/ActivationGeluKernel.cu#L38 -+ const float f = (float) x; -+ constexpr float ALPHA = M_SQRT1_2; -+ return (T) (f * 0.5f * (1.0f + sycl::erf(f * ALPHA))); -+} -+ -+template -+__inline__ T gelu_tanh_xpu(const T& x) { -+ const float f = (float) x; -+ constexpr float BETA = M_SQRT2 * M_2_SQRTPI * 0.5f; -+ constexpr float KAPPA = 0.044715; -+ float x_cube = f * f * f; -+ float inner = BETA * (f + KAPPA * x_cube); -+ return (T) (0.5f * f * (1.0f + ::tanhf(inner))); -+} -+ -+template -+void silu_and_mul_kernel( -+ scalar_t* __restrict__ out, // [..., d] -+ const scalar_t* __restrict__ input, // [..., 2, d] -+ const int d, -+ const sycl::nd_item<3>& item_ct1) { -+ const int64_t token_idx = item_ct1.get_group(2); -+ for (int64_t idx = item_ct1.get_local_id(2); idx < d; -+ idx += item_ct1.get_local_range(2)) { -+ const scalar_t x = input[token_idx * 2 * d + idx]; -+ const scalar_t y = input[token_idx * 2 * d + d + idx]; -+ out[token_idx * d + idx] = silu_xpu(x) * y; -+ } -+} -+ -+template -+void gelu_and_mul_kernel( -+ scalar_t* __restrict__ out, // [..., d] -+ const scalar_t* __restrict__ input, // [..., 2, d] -+ const int d, -+ const sycl::nd_item<3>& item_ct1) { -+ const int64_t token_idx = item_ct1.get_group(2); -+ for (int64_t idx = item_ct1.get_local_id(2); idx < d; -+ idx += item_ct1.get_local_range(2)) { -+ const scalar_t x = input[token_idx * 2 * d + idx]; -+ const scalar_t y = input[token_idx * 2 * d + d + idx]; -+ out[token_idx * d + idx] = gelu_xpu(x) * y; -+ } -+} -+ -+template -+void gelu_tanh_and_mul_kernel( -+ scalar_t* __restrict__ out, // [..., d] -+ const scalar_t* __restrict__ input, // [..., 2, d] -+ const int d, -+ const sycl::nd_item<3>& item_ct1) { -+ const int64_t token_idx = item_ct1.get_group(2); -+ for (int64_t idx = item_ct1.get_local_id(2); idx < d; -+ idx += item_ct1.get_local_range(2)) { -+ const scalar_t x = input[token_idx * 2 * d + idx]; -+ const scalar_t y = input[token_idx * 2 * d + d + idx]; -+ out[token_idx * d + idx] = gelu_tanh_xpu(x) * y; -+ } -+} -+ -+ -+template -+void call_silu_and_mul_kernel( -+ int num_tokens, -+ int d, -+ const scalar_t* __restrict__ input, -+ scalar_t* __restrict__ output) { -+ using sycl_t = vllm::xpu::SyclTypeTrait::Type; -+ sycl::range<3> grid(1, 1, num_tokens); -+ sycl::range<3> block(1, 1, std::min(d, 1024)); -+ auto& queue = vllm::xpu::vllmGetQueue(); -+ queue.submit([&](sycl::handler& cgh) { -+ cgh.parallel_for( -+ sycl::nd_range<3>(grid * block, block), [=](sycl::nd_item<3> item_ct1) { -+ silu_and_mul_kernel( -+ (sycl_t*)output, (const sycl_t*)input, d, item_ct1); -+ }); -+ }); -+} ++models = client.models.list() ++model = models.data[0].id + -+template -+void call_gelu_and_mul_kernel( -+ int num_tokens, -+ int d, -+ const scalar_t* __restrict__ input, -+ scalar_t* __restrict__ output) { -+ using sycl_t = vllm::xpu::SyclTypeTrait::Type; -+ sycl::range<3> grid(1, 1, num_tokens); -+ sycl::range<3> block(1, 1, std::min(d, 1024)); -+ auto& queue = vllm::xpu::vllmGetQueue(); -+ queue.submit([&](sycl::handler& cgh) { -+ cgh.parallel_for( -+ sycl::nd_range<3>(grid * block, block), [=](sycl::nd_item<3> item_ct1) { -+ gelu_and_mul_kernel( -+ (sycl_t*)output, (const sycl_t*)input, d, item_ct1); -+ }); -+ }); -+} -+ -+template -+void call_gelu_tanh_and_mul_kernel( -+ int num_tokens, -+ int d, -+ const scalar_t* __restrict__ input, -+ scalar_t* __restrict__ output) { -+ using sycl_t = 
vllm::xpu::SyclTypeTrait::Type; -+ sycl::range<3> grid(1, 1, num_tokens); -+ sycl::range<3> block(1, 1, std::min(d, 1024)); -+ auto& queue = vllm::xpu::vllmGetQueue(); -+ queue.submit([&](sycl::handler& cgh) { -+ cgh.parallel_for( -+ sycl::nd_range<3>(grid * block, block), [=](sycl::nd_item<3> item_ct1) { -+ gelu_tanh_and_mul_kernel( -+ (sycl_t*)output, (const sycl_t*)input, d, item_ct1); -+ }); -+ }); -+} ++# Round 1 ++messages = [{"role": "user", "content": "9.11 and 9.8, which is greater?"}] ++# For granite, add: `extra_body={"chat_template_kwargs": {"thinking": True}}` ++# For Qwen3 series, if you want to disable thinking in reasoning mode, add: ++# extra_body={"chat_template_kwargs": {"enable_thinking": False}} ++response = client.chat.completions.create(model=model, messages=messages) + -+void silu_and_mul(torch::Tensor& out, torch::Tensor& input) { -+ int num_tokens = input.numel() / input.size(-1); -+ int d = input.size(-1) / 2; -+ -+ VLLM_XPU_DISPATCH_FLOATING_TYPES( -+ input.scalar_type(), "call_silu_and_mul_kernel", [&] { -+ call_silu_and_mul_kernel( -+ num_tokens, -+ d, -+ input.data_ptr(), -+ out.data_ptr()); -+ }); -+} ++reasoning_content = response.choices[0].message.reasoning_content ++content = response.choices[0].message.content + -+// Element-wise activation kernel template. -+template -+void activation_kernel( -+ scalar_t* __restrict__ out, // [..., d] -+ const scalar_t* __restrict__ input, // [..., d] -+ const int d, -+ const sycl::nd_item<3>& item_ct1) { -+ const int64_t token_idx = item_ct1.get_group(2); -+ for (int64_t idx = item_ct1.get_local_id(2); idx < d; -+ idx += item_ct1.get_local_range(2)) { -+ const scalar_t x = VLLM_LDG(&input[token_idx * d + idx]); -+ out[token_idx * d + idx] = ACT_FN(x); -+ } -+} ++print("reasoning_content:", reasoning_content) ++print("content:", content) + -+template -+__inline__ T gelu_new_kernel(const T& x) { -+ const float x3 = (float)(x * x * x); -+ const T t = (T)tanhf((T)(0.79788456f * (float)(x + (T)(0.044715f * x3)))); -+ return ((T)0.5) * x * (((T)1.0) + t); -+} +diff --git a/examples/bmg/tooling.py b/examples/bmg/tooling.py +new file mode 100644 +index 000000000..bf8375831 +--- /dev/null ++++ b/examples/bmg/tooling.py +@@ -0,0 +1,37 @@ ++import json + -+template -+__inline__ T gelu_fast_kernel(const T& x) { -+ const float f = (float)x; -+ const T t = -+ (T)tanhf(((T)(f * 0.79788456f)) * (((T)1.0) + (T)(0.044715f * f) * x)); -+ return ((T)0.5) * x * (((T)1.0) + t); -+} ++client = OpenAI(base_url="http://0.0.0.0:8000/v1", api_key="dummy") ++ ++def get_weather(location: str, unit: str): ++ return f"Getting the weather for {location} in {unit}..." 
++tool_functions = {"get_weather": get_weather} ++ ++tools = [{ ++ "type": "function", ++ "function": { ++ "name": "get_weather", ++ "description": "Get the current weather in a given location", ++ "parameters": { ++ "type": "object", ++ "properties": { ++ "location": {"type": "string", "description": "City and state, e.g., 'San Francisco, CA'"}, ++ "unit": {"type": "string", "enum": ["celsius", "fahrenheit"]} ++ }, ++ "required": ["location", "unit"] ++ } ++ } ++}] ++ ++response = client.chat.completions.create( ++ model=client.models.list().data[0].id, ++ messages=[{"role": "user", "content": "What's the weather like in San Francisco?"}], ++ tools=tools, ++ temperature=0, ++ tool_choice="auto" ++) + -+template -+void call_gelu_new_activation_kernel(torch::Tensor& out, torch::Tensor& input) { -+ using sycl_t = vllm::xpu::SyclTypeTrait::Type; -+ int d = input.size(-1); -+ int64_t num_tokens = input.numel() / d; -+ auto out_ptr = out.data_ptr(); -+ auto input_ptr = input.data_ptr(); -+ sycl::range<3> grid(1, 1, num_tokens); -+ sycl::range<3> block(1, 1, std::min(d, 1024)); -+ auto& queue = vllm::xpu::vllmGetQueue(); -+ queue.submit([&](sycl::handler& cgh) { -+ cgh.parallel_for( -+ sycl::nd_range<3>(grid * block, block), [=](sycl::nd_item<3> item_ct1) { -+ activation_kernel( -+ (sycl_t* __restrict__)out_ptr, -+ (const sycl_t* __restrict__)input_ptr, -+ d, -+ item_ct1); -+ }); -+ }); -+} ++tool_call = response.choices[0].message.tool_calls[0].function ++print(f"Function called: {tool_call.name}") ++print(f"Arguments: {tool_call.arguments}") ++print(f"Result: {tool_functions[tool_call.name](**json.loads(tool_call.arguments))}") 30,22 Bot ++ +diff --git a/examples/offline_inference/data_parallel.py b/examples/offline_inference/data_parallel.py +index 36d805a32..2a4233b6a 100644 +--- a/examples/offline_inference/data_parallel.py ++++ b/examples/offline_inference/data_parallel.py +@@ -96,6 +96,13 @@ def parse_args(): + "--quantization", + type=str, + ) ++ parser.add_argument( ++ "--disable-expert-parallel", ++ dest="enable_expert_parallel", ++ action="store_false", ++ help="Disable expert parallel (default: enabled).", ++ ) ++ parser.set_defaults(enable_expert_parallel=True) + return parser.parse_args() + + +@@ -108,6 +115,7 @@ def main( + dp_master_port, + GPUs_per_dp_rank, + enforce_eager, ++ enable_expert_parallel, + trust_remote_code, + max_num_seqs, + max_model_len, +@@ -162,7 +170,7 @@ def main( + model=model, + tensor_parallel_size=GPUs_per_dp_rank, + enforce_eager=enforce_eager, +- enable_expert_parallel=True, ++ enable_expert_parallel=enable_expert_parallel, + trust_remote_code=trust_remote_code, + max_num_seqs=max_num_seqs, + max_model_len=max_model_len, +@@ -222,6 +230,7 @@ if __name__ == "__main__": + dp_master_port, + tp_size, + args.enforce_eager, ++ args.enable_expert_parallel, + args.trust_remote_code, + args.max_num_seqs, + args.max_model_len, +diff --git a/examples/offline_inference/vision_language.py b/examples/offline_inference/vision_language.py +index b104113b8..58fb423e8 100644 +--- a/examples/offline_inference/vision_language.py ++++ b/examples/offline_inference/vision_language.py +@@ -126,6 +126,23 @@ def run_chameleon(questions: list[str], modality: str) -> ModelRequestData: + ) + + ++# Dots-OCR ++def run_dots_ocr(questions: list[str], modality: str) -> ModelRequestData: ++ assert modality == "image" + -+template -+void call_gelu_fast_activation_kernel( -+ torch::Tensor& out, -+ torch::Tensor& input) { -+ using sycl_t = vllm::xpu::SyclTypeTrait::Type; -+ int d = 
input.size(-1); -+ int64_t num_tokens = input.numel() / d; -+ auto out_ptr = out.data_ptr(); -+ auto input_ptr = input.data_ptr(); -+ sycl::range<3> grid(1, 1, num_tokens); -+ sycl::range<3> block(1, 1, std::min(d, 1024)); -+ auto& queue = vllm::xpu::vllmGetQueue(); -+ queue.submit([&](sycl::handler& cgh) { -+ cgh.parallel_for( -+ sycl::nd_range<3>(grid * block, block), [=](sycl::nd_item<3> item_ct1) { -+ activation_kernel( -+ (sycl_t* __restrict__)out_ptr, -+ (const sycl_t* __restrict__)input_ptr, -+ d, -+ item_ct1); -+ }); -+ }); -+} ++ prompts = [f"<|img|><|imgpad|><|endofimg|>{question}" for question in questions] ++ engine_args = EngineArgs( ++ model="rednote-hilab/dots.ocr", ++ limit_mm_per_prompt={modality: 1}, ++ trust_remote_code=True, ++ ) + -+void gelu_new(torch::Tensor& out, torch::Tensor& input) { -+ VLLM_XPU_DISPATCH_FLOATING_TYPES( -+ out.scalar_type(), "call_gelu_new_activation_kernel", [&] { -+ call_gelu_new_activation_kernel(out, input); -+ }); -+} ++ return ModelRequestData( ++ engine_args=engine_args, ++ prompts=prompts, ++ ) + -+void gelu_fast(torch::Tensor& out, torch::Tensor& input) { -+ VLLM_XPU_DISPATCH_FLOATING_TYPES( -+ out.scalar_type(), "call_gelu_fast_activation_kernel", [&] { -+ call_gelu_fast_activation_kernel( -+ out, input); -+ }); -+} + -+void gelu_and_mul( -+ torch::Tensor& out, // [..., d] -+ torch::Tensor& input) // [..., 2 * d] -+{ -+ int num_tokens = input.numel() / input.size(-1); -+ int d = input.size(-1) / 2; -+ -+ VLLM_XPU_DISPATCH_FLOATING_TYPES( -+ input.scalar_type(), "call_gelu_and_mul_kernel", [&] { -+ call_gelu_and_mul_kernel( -+ num_tokens, -+ d, -+ input.data_ptr(), -+ out.data_ptr()); -+ }); -+} + def run_command_a_vision(questions: list[str], modality: str) -> ModelRequestData: + assert modality == "image" + +@@ -1431,7 +1448,9 @@ def run_qwen2_5_vl(questions: list[str], modality: str) -> ModelRequestData: + "max_pixels": 1280 * 28 * 28, + "fps": 1, + }, +- limit_mm_per_prompt={modality: 1}, ++ limit_mm_per_prompt={"image": 1}, ++ enforce_eager=True, ++ disable_mm_preprocessor_cache=args.disable_mm_preprocessor_cache, + ) + + if modality == "image": +@@ -1497,6 +1516,80 @@ def run_qwen2_5_omni(questions: list[str], modality: str): + ) + + ++# Qwen3-VL-Dense ++def run_qwen3_vl(questions: list[str], modality: str) -> ModelRequestData: ++ model_name = "Qwen/Qwen3-VL-4B-Instruct" + -+void gelu_tanh_and_mul( -+ torch::Tensor& out, // [..., d] -+ torch::Tensor& input) // [..., 2 * d] -+{ -+ int num_tokens = input.numel() / input.size(-1); -+ int d = input.size(-1) / 2; -+ -+ VLLM_XPU_DISPATCH_FLOATING_TYPES( -+ input.scalar_type(), "call_gelu_tanh_and_mul_kernel", [&] { -+ call_gelu_tanh_and_mul_kernel( -+ num_tokens, -+ d, -+ input.data_ptr(), -+ out.data_ptr()); -+ }); -+} -\ No newline at end of file -diff --git a/csrc/xpu/attention_generic.h b/csrc/xpu/attention_generic.h -new file mode 100644 -index 000000000..ab3688c82 ---- /dev/null -+++ b/csrc/xpu/attention_generic.h -@@ -0,0 +1,64 @@ -+/* -+ * Copyright (c) 2023, The vLLM team. -+ * -+ * Licensed under the Apache License, Version 2.0 (the "License"); -+ * you may not use this file except in compliance with the License. -+ * You may obtain a copy of the License at -+ * -+ * http://www.apache.org/licenses/LICENSE-2.0 -+ * -+ * Unless required by applicable law or agreed to in writing, software -+ * distributed under the License is distributed on an "AS IS" BASIS, -+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
-+ * See the License for the specific language governing permissions and -+ * limitations under the License. -+ */ -+#pragma once -+ -+#include -+#include -+#include -+ -+namespace vllm { -+ -+// A vector type to store Q, K, V elements. -+template -+struct Vec {}; -+ -+// A vector type to store FP32 accumulators. -+template -+struct FloatVec {}; -+ -+// Template vector operations. -+template -+inline Acc mul(A a, B b); -+ -+template -+inline float sum(T v); -+ -+template -+inline float dot(T a, T b) { -+ return sum(mul(a, b)); -+} ++ engine_args = EngineArgs( ++ model=model_name, ++ max_model_len=4096, ++ max_num_seqs=5, ++ mm_processor_kwargs={ ++ "min_pixels": 28 * 28, ++ "max_pixels": 1280 * 28 * 28, ++ "fps": 1, ++ }, ++ limit_mm_per_prompt={modality: 1}, ++ ) + -+template -+inline float dot(T a, T b) { -+ return sum(mul(a, b)); -+} ++ if modality == "image": ++ placeholder = "<|image_pad|>" ++ elif modality == "video": ++ placeholder = "<|video_pad|>" + -+template -+inline void zero(T& dst) { -+ constexpr int WORDS = (sizeof(T) / 4) == 0 ? 1 : (sizeof(T) / 4); -+ union { -+ T raw; -+ uint32_t words[WORDS]; -+ } tmp; -+ -+#pragma unroll -+ for (int ii = 0; ii < WORDS; ++ii) { -+ tmp.words[ii] = 0u; -+ } -+ dst = tmp.raw; -+} ++ prompts = [ ++ ( ++ "<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n" ++ f"<|im_start|>user\n<|vision_start|>{placeholder}<|vision_end|>" ++ f"{question}<|im_end|>\n" ++ "<|im_start|>assistant\n" ++ ) ++ for question in questions ++ ] + -+} // namespace vllm -\ No newline at end of file -diff --git a/csrc/xpu/attention_xpu.cpp b/csrc/xpu/attention_xpu.cpp -new file mode 100644 -index 000000000..97d5c0c21 ---- /dev/null -+++ b/csrc/xpu/attention_xpu.cpp -@@ -0,0 +1,3031 @@ -+// clang-format off -+#ifdef VLLM_DEV -+#undef __SYCL_DEVICE_ONLY__ -+#endif -+#include -+#include -+#include -+ -+// clang-format on -+#include -+#include -+#include -+#include "utils.h" -+#include "xpu_types.h" -+// #include "dtype_bfloat16.dp.hpp" -+#include "dtype_float16.h" -+#include "dtype_float32.h" -+#if TORCH_VERSION_MAJOR >= 2 && TORCH_VERSION_MINOR >= 3 -+#include -+#endif -+ -+#include -+// #include -+ -+#define WARP_SIZE 32 -+#define MAX(a, b) ((a) > (b) ? (a) : (b)) -+#define MIN(a, b) ((a) < (b) ? (a) : (b)) -+#define DIVIDE_ROUND_UP(a, b) (((a) + (b)-1) / (b)) -+using namespace sycl::ext::intel::esimd; -+ -+template -+static inline T attn_softcapping(T qk, float attn_logit_softcapping) { -+ qk = qk / attn_logit_softcapping; -+ qk = (sycl::exp(qk) - sycl::exp(-qk)) / (sycl::exp(qk) + sycl::exp(-qk)); -+ qk = qk * attn_logit_softcapping; -+ return qk; -+} ++ return ModelRequestData( ++ engine_args=engine_args, ++ prompts=prompts, ++ ) + -+template -+struct Float_Trait { -+ using Type = T; -+}; -+ -+template <> -+struct Float_Trait { -+ using Type = uint16_t; -+}; -+ -+template <> -+struct Float_Trait { -+ using Type = sycl::ext::oneapi::bfloat16; -+}; -+ -+namespace vllm { -+ -+// Q*K^T operation. -+template -+inline float qk_dot_( -+ const Vec* q, -+ const Vec* k, -+ const sycl::nd_item<3>& item_ct1) { -+ using A_vec = typename FloatVec::Type; -+ // Compute the parallel products for Q*K^T (treat vector lanes separately). -+ A_vec qk_vec = mul(q[0], k[0]); -+#pragma unroll -+ for (int ii = 1; ii < N; ++ii) { -+ qk_vec = fma(q[ii], k[ii], qk_vec); -+ } -+ -+ // Finalize the reduction across lanes. 
-+ float qk = sum(qk_vec); -+#pragma unroll -+ for (int mask = THREAD_GROUP_SIZE / 2; mask >= 1; mask /= 2) { -+ -+ qk += dpct::permute_sub_group_by_xor( -+ item_ct1.get_sub_group(), qk, mask); -+ } -+ return qk; -+} + -+template -+struct Qk_dot { -+ template -+ static inline float dot( -+ const Vec* q, -+ const Vec* k, -+ const sycl::nd_item<3>& item_ct1) { -+ return qk_dot_(q, k, item_ct1); -+ } -+}; -+ -+template -+inline float block_sum( -+ float* red_smem, -+ float sum, -+ const sycl::nd_item<3>& item_ct1) { -+ // Decompose the thread index into warp / lane. -+ int warp = item_ct1.get_local_id(2) / WARP_SIZE; -+ int lane = item_ct1.get_local_id(2) % WARP_SIZE; -+ -+ // Compute the sum per warp. -+#pragma unroll -+ for (int mask = WARP_SIZE / 2; mask >= 1; mask /= 2) { -+ -+ /* -+ DPCT1096:42: The right-most dimension of the work-group used in the SYCL -+ kernel that calls this function may be less than "32". The function -+ "dpct::permute_sub_group_by_xor" may return an unexpected result on the CPU -+ device. Modify the size of the work-group to ensure that the value of the -+ right-most dimension is a multiple of "32". -+ */ -+ sum += dpct::permute_sub_group_by_xor( -+ item_ct1.get_sub_group(), sum, mask); -+ } -+ -+ // Warp leaders store the data to shared memory. -+ if (lane == 0) { -+ red_smem[warp] = sum; -+ } -+ -+ // Make sure the data is in shared memory. -+ -+ item_ct1.barrier(sycl::access::fence_space::local_space); ++# Qwen3-VL-MOE ++def run_qwen3_vl_moe(questions: list[str], modality: str) -> ModelRequestData: ++ model_name = "Qwen/Qwen3-VL-30B-A3B-Instruct" + -+ // The warps compute the final sums. -+ if (lane < NUM_WARPS) { -+ sum = red_smem[lane]; -+ } ++ engine_args = EngineArgs( ++ model=model_name, ++ max_model_len=4096, ++ max_num_seqs=5, ++ mm_processor_kwargs={ ++ "min_pixels": 28 * 28, ++ "max_pixels": 1280 * 28 * 28, ++ "fps": 1, ++ }, ++ limit_mm_per_prompt={modality: 1}, ++ ) + -+ // Parallel reduction inside the warp. -+#pragma unroll -+ for (int mask = NUM_WARPS / 2; mask >= 1; mask /= 2) { -+ -+ /* -+ DPCT1096:43: The right-most dimension of the work-group used in the SYCL -+ kernel that calls this function may be less than "32". The function -+ "dpct::permute_sub_group_by_xor" may return an unexpected result on the CPU -+ device. Modify the size of the work-group to ensure that the value of the -+ right-most dimension is a multiple of "32". -+ */ -+ sum += dpct::permute_sub_group_by_xor( -+ item_ct1.get_sub_group(), sum, mask); -+ } -+ -+ // Broadcast to other threads. -+ -+ /* -+ DPCT1096:44: The right-most dimension of the work-group used in the SYCL -+ kernel that calls this function may be less than "32". The function -+ "dpct::select_from_sub_group" may return an unexpected result on the CPU -+ device. Modify the size of the work-group to ensure that the value of the -+ right-most dimension is a multiple of "32". 
-+ */ -+ return dpct::select_from_sub_group( -+ item_ct1.get_sub_group(), sum, 0); -+} ++ if modality == "image": ++ placeholder = "<|image_pad|>" ++ elif modality == "video": ++ placeholder = "<|video_pad|>" + -+template -+void context_attention_kernel_v1_reshaped( -+ void* query, void* key, void* value, const void* block_tables, -+ const float scale, const void* query_start_loc, const void* seq_lens, -+ const void* context_lens, const int block_size, -+ // const int x, // x in kv_cache -+ void* out, // output -+ const int block_table_stride_batch, const int block_table_stride_seq, -+ const int query_stride_bs, const int query_stride_head, -+ const int query_stride_dim, const int k_cache_stride_tokens, -+ const int k_cache_stride_head, const int k_cache_stride_block_size, -+ const int k_cache_stride_dim, -+ const int v_cache_stride_tokens, const int v_cache_stride_head, -+ const int v_cache_stride_block_size, const int v_cache_stride_dim, -+ const int out_stride_tokens, const int out_stride_head, -+ const int num_queries_per_kv, const int max_input_length, -+ const int batch_size, const int num_heads) { -+ static_assert(GS * HD * sizeof(scalar_t) * 2 < 64 * 1024); -+ -+ const size_t key_slm_offset = 0; -+ const size_t value_slm_offset = GS * HD * sizeof(scalar_t); -+ sycl::queue& queue = vllm::xpu::vllmGetQueue(); -+ -+ // Get the maximum seq_lens -+ sycl::range<3> global_size(batch_size, num_heads, -+ (max_input_length + GS - 1) / GS * GS); -+ sycl::range<3> local_size(1, 1, GS); -+ -+ auto cgf = [&](sycl::handler& handle) { -+ handle.parallel_for( -+ sycl::nd_range<3>(global_size, local_size), -+ [=](sycl::nd_item<3> item) SYCL_ESIMD_KERNEL { -+ slm_init(); -+ -+ const size_t bsz_idx = item.get_global_id(0); -+ const size_t head_idx = item.get_global_id(1); -+ // Assuming we have 32 query head and 8 kv_heads. Then -+ // num_queries_per_group should be 4 For head_idx 13, then -+ // kv_head_idx = 13 / 4 = 3, which is correct -+ const size_t kv_head_idx = head_idx / num_queries_per_kv; -+ const int32_t seq_idx = item.get_global_id(2); -+ const size_t gid = item.get_group(2); -+ const size_t tid = item.get_local_id(2); -+ -+ // const int64_t * seq_len = (const int64_t *) seq_lens; -+ const int32_t* seq_len = (const int32_t*)seq_lens; -+ int32_t seq_bound = seq_len[bsz_idx]; -+ -+ const int32_t* query_loc = (const int32_t*)query_start_loc; -+ // There is a possibility that the current token index pass -+ // over the seq_len, therefore: token_idx is the position in -+ // the query -+ int32_t token_idx = -+ query_loc[bsz_idx] + std::min(seq_idx, seq_bound - 1); -+ -+ const int32_t* context_len_pointer = (const int32_t*)context_lens; -+ -+ const int* block_tables_ptr = (const int*)block_tables; -+ const int* block_table = -+ block_tables_ptr + bsz_idx * block_table_stride_batch; -+ // I guess this context_len should be 0... 
-+ const int32_t context_len = context_len_pointer[bsz_idx]; -+ -+ // Position in the sequence -+ // context + seq_idx -+ // const int32_t token_position = -+ // context_len + std::min(seq_idx, seq_bound - 1); -+ const int32_t token_position = context_len + seq_idx; -+ -+ const scalar_t* query_head = (const scalar_t*)query + -+ token_idx * query_stride_bs + -+ head_idx * query_stride_head; -+ // Target output -+ scalar_t* out_head = -+ (scalar_t*)out + -+ (query_loc[bsz_idx] + seq_idx) * out_stride_tokens + -+ head_idx * out_stride_head; -+ -+ int32_t context_groups = context_len / GS; -+ -+ // Each token load its query_row -+ simd query_row = -+ block_load(query_head) * scale; -+ simd accv = 0; -+ simd softmaxv = 0; -+ scalar_t max_attn = -sycl::detail::max_v(); -+ -+ // ################# Handle n * GS context part ###################### -+ int32_t n = context_len / GS; -+ int32_t context_offset = context_len % GS; -+ -+ for (int32_t group = 0; group < n; ++group) { -+ size_t target_key_position = group * GS + tid; -+ int which_block = target_key_position / block_size; -+ int which_slot = target_key_position % block_size; -+ -+ int physical_block_number = block_table[which_block]; -+ // Now key shape is [num_blocks, num_heads, block_size, head_dim] -+ const scalar_t* key_head = -+ (const scalar_t*)key + -+ physical_block_number * k_cache_stride_tokens + -+ kv_head_idx * k_cache_stride_head + -+ which_slot * k_cache_stride_block_size; -+ simd key_row = block_load(key_head); -+ slm_block_store(key_slm_offset + tid * HD * sizeof(scalar_t), key_row); -+ -+ const scalar_t* value_head = -+ (const scalar_t*)value + -+ physical_block_number * v_cache_stride_tokens + -+ kv_head_idx * v_cache_stride_head + which_slot * v_cache_stride_block_size; -+ simd value_row = block_load(value_head); -+ slm_block_store(value_slm_offset + tid * HD * sizeof(scalar_t), -+ value_row); -+ barrier(); -+ -+ // Calculate QK^T for this group... -+ simd attnv; -+#pragma unroll -+ for (size_t r = 0; r < GS; ++r) { -+ simd key_row = slm_block_load( -+ key_slm_offset + r * HD * sizeof(scalar_t)); -+ scalar_t attn = -+ sycl::ext::intel::esimd::detail::sum( -+ query_row * key_row); -+ attnv[r] = attn; -+ } -+ scalar_t new_max_attn = -+ std::max(hmax(attnv), max_attn); -+ scalar_t attn_exp = exp(max_attn - new_max_attn); -+ accv = accv * attn_exp; -+ softmaxv = softmaxv * attn_exp; -+ max_attn = new_max_attn; -+ const simd attn_expv = exp(attnv - max_attn); -+#pragma unorll -+ for (size_t r = 0; r < GS; ++r) { -+ simd value_row = slm_block_load( -+ value_slm_offset + r * HD * sizeof(scalar_t)); -+ accv += value_row * attn_expv[r]; -+ } -+ softmaxv += attn_expv; -+ barrier(); -+ } -+ -+ // ########## End for handling context n * GS part ########### -+ -+ // ########## Handle n * GS ################ -+ for (size_t group = 0; group < gid; ++group) { -+ // 1. 
begins to load each position's key and value -+ size_t target_key_position = context_len + group * GS + tid; -+ int which_block = target_key_position / block_size; -+ int which_slot = target_key_position % block_size; -+ -+ int physical_block_number = block_table[which_block]; -+ const scalar_t* key_head = -+ (const scalar_t*)key + -+ physical_block_number * k_cache_stride_tokens + -+ kv_head_idx * k_cache_stride_head + -+ which_slot * k_cache_stride_block_size; -+ simd key_row = block_load(key_head); -+ slm_block_store(key_slm_offset + tid * HD * sizeof(scalar_t), -+ key_row); -+ const scalar_t* value_head = -+ (const scalar_t*)value + -+ physical_block_number * v_cache_stride_tokens + -+ kv_head_idx * v_cache_stride_head + which_slot * v_cache_stride_block_size; -+ simd value_row = block_load(value_head); -+ slm_block_store(value_slm_offset + tid * HD * sizeof(scalar_t), -+ value_row); -+ barrier(); -+ simd attnv; -+#pragma unroll -+ for (size_t r = 0; r < GS; ++r) { -+ simd key_row = slm_block_load( -+ key_slm_offset + r * HD * sizeof(scalar_t)); -+ scalar_t attn = -+ sycl::ext::intel::esimd::detail::sum( -+ query_row * key_row); -+ attnv[r] = attn; -+ } ++ prompts = [ ++ ( ++ "<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n" ++ f"<|im_start|>user\n<|vision_start|>{placeholder}<|vision_end|>" ++ f"{question}<|im_end|>\n" ++ "<|im_start|>assistant\n" ++ ) ++ for question in questions ++ ] + -+ scalar_t new_max_attn = -+ std::max(hmax(attnv), max_attn); -+ scalar_t attn_exp = exp(max_attn - new_max_attn); -+ accv = accv * attn_exp; -+ -+ softmaxv = softmaxv * attn_exp; -+ max_attn = new_max_attn; -+ const simd attn_expv = exp(attnv - max_attn); -+#pragma unroll -+ for (size_t r = 0; r < GS; ++r) { -+ simd value_row = slm_block_load( -+ value_slm_offset + r * HD * sizeof(scalar_t)); -+ accv += value_row * attn_expv[r]; -+ } -+ softmaxv += attn_expv; -+ barrier(); -+ } -+ -+ // ######### End of handle n * GS part ########## -+ -+ // ################ Handle offset part #################### -+ scalar_t softmax = -+ sycl::ext::intel::esimd::detail::sum( -+ softmaxv); -+ -+ // ########### handle context offset ############ -+ if (tid < context_offset) { -+ size_t target_key_position = n * GS + tid; -+ int which_block = target_key_position / block_size; -+ int which_slot = target_key_position % block_size; -+ -+ int physical_block_number = block_table[which_block]; -+ const scalar_t* key_head = -+ (const scalar_t*)key + -+ physical_block_number * k_cache_stride_tokens + -+ kv_head_idx * k_cache_stride_head + -+ which_slot * k_cache_stride_block_size; -+ simd key_row = block_load(key_head); -+ slm_block_store(key_slm_offset + tid * HD * sizeof(scalar_t), -+ key_row); -+ -+ const scalar_t* value_head = -+ (const scalar_t*)value + -+ physical_block_number * v_cache_stride_tokens + -+ kv_head_idx * v_cache_stride_head + -+ which_slot * v_cache_stride_block_size; -+ simd value_row = block_load(value_head); -+ slm_block_store(value_slm_offset + tid * HD * sizeof(scalar_t), -+ value_row); -+ } -+ -+ barrier(); -+ -+ if (token_position < seq_bound) { -+#pragma unroll -+ for (size_t r = 0; r < context_offset; ++r) { -+ simd key_row = slm_block_load( -+ key_slm_offset + r * HD * sizeof(scalar_t)); -+ simd value_row = slm_block_load( -+ value_slm_offset + r * HD * sizeof(scalar_t)); -+ scalar_t attn = -+ sycl::ext::intel::esimd::detail::sum( -+ query_row * key_row); -+ if (attn <= max_attn) { -+ scalar_t attn_exp = -+ sycl::ext::intel::esimd::exp(attn - max_attn); -+ accv += value_row * 
attn_exp; -+ softmax += attn_exp; -+ } else { -+ scalar_t attn_exp = -+ sycl::ext::intel::esimd::exp(max_attn - attn); -+ accv = accv * attn_exp + value_row; -+ softmax = softmax * attn_exp + 1; -+ max_attn = attn; -+ } -+ } -+ } -+ barrier(); -+ -+ // ############## handle seq offset ################# -+ if (token_position < seq_bound) { -+ const int64_t which_block = -+ static_cast(token_position / block_size); -+ const int64_t which_slot = -+ static_cast(token_position % block_size); -+ -+ const int64_t physical_block_number = -+ static_cast(block_table[which_block]); -+ -+ const scalar_t* key_head = -+ (const scalar_t*)key + -+ physical_block_number * k_cache_stride_tokens + -+ kv_head_idx * k_cache_stride_head + -+ which_slot * k_cache_stride_block_size; -+ simd key_row = block_load(key_head); -+ slm_block_store(key_slm_offset + tid * HD * sizeof(scalar_t), -+ key_row); -+ -+ // [num_blocks, num_kv_heads, head_size, block_size] -+ const scalar_t* value_head = -+ (const scalar_t*)value + -+ physical_block_number * v_cache_stride_tokens + -+ kv_head_idx * v_cache_stride_head + -+ which_slot * v_cache_stride_block_size; -+ simd value_row = block_load(value_head); -+ slm_block_store(value_slm_offset + tid * HD * sizeof(scalar_t), -+ value_row); -+ } -+ barrier(); -+ -+ if (token_position < seq_bound) { -+ for (size_t r = 0; r <= tid; ++r) { -+ simd key_row = slm_block_load( -+ key_slm_offset + r * HD * sizeof(scalar_t)); -+ simd value_row = slm_block_load( -+ value_slm_offset + r * HD * sizeof(scalar_t)); -+ scalar_t attn = -+ sycl::ext::intel::esimd::detail::sum( -+ query_row * key_row); -+ if (attn <= max_attn) { -+ scalar_t attn_exp = -+ sycl::ext::intel::esimd::exp(attn - max_attn); -+ accv += value_row * attn_exp; -+ softmax += attn_exp; -+ } else { -+ scalar_t attn_exp = -+ sycl::ext::intel::esimd::exp(max_attn - attn); -+ accv = accv * attn_exp + value_row; -+ softmax = softmax * attn_exp + 1; -+ max_attn = attn; -+ } -+ } ++ return ModelRequestData( ++ engine_args=engine_args, ++ prompts=prompts, ++ ) + -+ if (softmax > 0) { -+ simd result = accv / softmax; -+ block_store(out_head, result); -+ } else { -+ simd result = 0; -+ block_store(out_head, result); -+ } -+ } -+ // ######## Ending of handling seq offset ########## -+ }); -+ }; -+ queue.submit(cgf); -+} + -+// How about implement a first edition that can be used with non-chunked -+// prefill requests, so that we can make sure the reference for heads is -+// correct -+template -+void context_attention_kernel_v1( -+ void* query, void* key, void* value, const void* block_tables, -+ const float scale, const void* query_start_loc, const void* seq_lens, -+ const void* context_lens, const int block_size, -+ const int x, // x in kv_cache -+ void* out, // output -+ const int block_table_stride_batch, const int block_table_stride_seq, -+ const int query_stride_bs, const int query_stride_head, -+ const int query_stride_dim, const int k_cache_stride_tokens, -+ const int k_cache_stride_head, const int k_cache_stride_dim, -+ const int k_cache_stride_block_size, const int k_cache_stride_x, -+ const int v_cache_stride_tokens, const int v_cache_stride_head, -+ const int v_cache_stride_dim, const int v_cache_stride_block_size, -+ const int out_stride_tokens, const int out_stride_head, -+ const int num_queries_per_kv, const int max_input_length, -+ const int batch_size, const int num_heads) { -+ static_assert(GS * HD * sizeof(scalar_t) * 2 < 64 * 1024); -+ -+ const size_t key_slm_offset = 0; -+ const size_t value_slm_offset = GS * HD * 
sizeof(scalar_t); -+ sycl::queue& queue = vllm::xpu::vllmGetQueue(); -+ -+ // Get the maximum seq_lens -+ sycl::range<3> global_size(batch_size, num_heads, -+ (max_input_length + GS - 1) / GS * GS); -+ sycl::range<3> local_size(1, 1, GS); -+ -+ auto cgf = [&](sycl::handler& handle) { -+ handle.parallel_for( -+ sycl::nd_range<3>(global_size, local_size), -+ [=](sycl::nd_item<3> item) SYCL_ESIMD_KERNEL { -+ slm_init(); -+ -+ const size_t bsz_idx = item.get_global_id(0); -+ const size_t head_idx = item.get_global_id(1); -+ // Assuming we have 32 query head and 8 kv_heads. Then -+ // num_queries_per_group should be 4 For head_idx 13, then -+ // kv_head_idx = 13 / 4 = 3, which is correct -+ const size_t kv_head_idx = head_idx / num_queries_per_kv; -+ const int32_t seq_idx = item.get_global_id(2); -+ const size_t gid = item.get_group(2); -+ const size_t tid = item.get_local_id(2); -+ -+ // const int64_t * seq_len = (const int64_t *) seq_lens; -+ const int32_t* seq_len = (const int32_t*)seq_lens; -+ int32_t seq_bound = seq_len[bsz_idx]; -+ -+ const int32_t* query_loc = (const int32_t*)query_start_loc; -+ // There is a possibility that the current token index pass -+ // over the seq_len, therefore: token_idx is the position in -+ // the query -+ int32_t token_idx = -+ query_loc[bsz_idx] + std::min(seq_idx, seq_bound - 1); -+ -+ const int32_t* context_len_pointer = (const int32_t*)context_lens; -+ -+ const int* block_tables_ptr = (const int*)block_tables; -+ const int* block_table = -+ block_tables_ptr + bsz_idx * block_table_stride_batch; -+ // I guess this context_len should be 0... -+ const int32_t context_len = context_len_pointer[bsz_idx]; -+ -+ // Position in the sequence -+ // context + seq_idx -+ // const int32_t token_position = -+ // context_len + std::min(seq_idx, seq_bound - 1); -+ const int32_t token_position = context_len + seq_idx; -+ -+ // static const CONSTANT char FMT[] = -+ // "Invoke target function...\n "; -+ -+ // sycl::ext::oneapi::experimental::printf(FMT); -+ // static const CONSTANT char FMT[] = -+ // "GroupID = %6d bsz_idx = %6d seq_len = %6d seq_idx = -+ // %6d" "local_id = " -+ // "%6d " -+ // "token_idx = %6d " -+ // "context_len = %6d " -+ // "v_cache_stride_head_dim = %6d " -+ // "token_position = %6d\n"; -+ // sycl::ext::oneapi::experimental::printf( -+ // FMT, gid, bsz_idx, seq_bound, seq_idx, tid, -+ // token_idx, context_len, v_cache_stride_dim, -+ // token_position); -+ -+ const scalar_t* query_head = (const scalar_t*)query + -+ token_idx * query_stride_bs + -+ head_idx * query_stride_head; -+ // Target output -+ scalar_t* out_head = -+ (scalar_t*)out + -+ (query_loc[bsz_idx] + seq_idx) * out_stride_tokens + -+ head_idx * out_stride_head; -+ -+ int32_t context_groups = context_len / GS; -+ -+ // Each token load its query_row -+ simd query_row = -+ block_load(query_head) * scale; -+ simd accv = 0; -+ simd softmaxv = 0; -+ scalar_t max_attn = -sycl::detail::max_v(); -+ -+ // ################# Handle n * GS context part ###################### -+ int32_t n = context_len / GS; -+ int32_t context_offset = context_len % GS; -+ -+ for (int32_t group = 0; group < n; ++group) { -+ size_t target_key_position = group * GS + tid; -+ int which_block = target_key_position / block_size; -+ int which_slot = target_key_position % block_size; -+ -+ int physical_block_number = block_table[which_block]; -+ const scalar_t* key_head = -+ (const scalar_t*)key + -+ physical_block_number * k_cache_stride_tokens + -+ kv_head_idx * k_cache_stride_head + -+ which_slot * 
k_cache_stride_block_size; -+ for (int i = 0; i < HD / x; i++) { -+ // Load 8 elements, decided by x -+ simd key_row = -+ block_load(key_head + i * k_cache_stride_dim); -+ slm_block_store(key_slm_offset + tid * HD * sizeof(scalar_t) + -+ 8 * i * sizeof(scalar_t), -+ key_row); -+ } + # R-4B + def run_r_vl(questions: list[str], modality: str) -> ModelRequestData: + assert modality == "image" +@@ -1662,6 +1755,7 @@ model_example_map = { + "aya_vision": run_aya_vision, + "blip-2": run_blip2, + "chameleon": run_chameleon, ++ "dots_ocr": run_dots_ocr, + "command_a_vision": run_command_a_vision, + "deepseek_vl_v2": run_deepseek_vl2, + "ernie45_vl": run_ernie45_vl, +@@ -1707,6 +1801,8 @@ model_example_map = { + "qwen2_vl": run_qwen2_vl, + "qwen2_5_vl": run_qwen2_5_vl, + "qwen2_5_omni": run_qwen2_5_omni, ++ "qwen3_vl": run_qwen3_vl, ++ "qwen3_vl_moe": run_qwen3_vl_moe, + "rvl": run_r_vl, + "skywork_chat": run_skyworkr1v, + "smolvlm": run_smolvlm, +@@ -1716,6 +1812,15 @@ model_example_map = { + } + + ++MODELS_NEED_VIDEO_METADATA = [ ++ "glm4_1v", ++ "glm4_5v", ++ "glm4_5v_fp8", ++ "qwen3_vl", ++ "qwen3_vl_moe", ++] + -+ const scalar_t* value_head = -+ (const scalar_t*)value + -+ physical_block_number * v_cache_stride_tokens + -+ kv_head_idx * v_cache_stride_head + which_slot; -+ for (int i = 0; i < HD; i++) { -+ scalar_t temp_value = value_head[i * v_cache_stride_dim]; -+ slm_scalar_store(value_slm_offset + -+ tid * HD * sizeof(scalar_t) + -+ i * sizeof(scalar_t), -+ temp_value); -+ } -+ barrier(); -+ -+ // Calculate QK^T for this group... -+ simd attnv; -+#pragma unroll -+ for (size_t r = 0; r < GS; ++r) { -+ simd key_row = slm_block_load( -+ key_slm_offset + r * HD * sizeof(scalar_t)); -+ scalar_t attn = -+ sycl::ext::intel::esimd::detail::sum( -+ query_row * key_row); -+ attnv[r] = attn; -+ } -+ scalar_t new_max_attn = -+ std::max(hmax(attnv), max_attn); -+ scalar_t attn_exp = exp(max_attn - new_max_attn); -+ accv = accv * attn_exp; -+ softmaxv = softmaxv * attn_exp; -+ max_attn = new_max_attn; -+ const simd attn_expv = exp(attnv - max_attn); -+#pragma unorll -+ for (size_t r = 0; r < GS; ++r) { -+ simd value_row = slm_block_load( -+ value_slm_offset + r * HD * sizeof(scalar_t)); -+ accv += value_row * attn_expv[r]; -+ } -+ softmaxv += attn_expv; -+ barrier(); -+ } -+ -+ // ########## End for handling context n * GS part ########### -+ -+ // ########## Handle n * GS ################ -+ for (size_t group = 0; group < gid; ++group) { -+ // 1. 
begins to load each position's key and value -+ size_t target_key_position = context_len + group * GS + tid; -+ int which_block = target_key_position / block_size; -+ int which_slot = target_key_position % block_size; -+ -+ int physical_block_number = block_table[which_block]; -+ const scalar_t* key_head = -+ (const scalar_t*)key + -+ physical_block_number * k_cache_stride_tokens + -+ kv_head_idx * k_cache_stride_head + -+ which_slot * k_cache_stride_block_size; -+ for (int i = 0; i < HD / x; i++) { -+ // Load 8 elements -+ simd key_row = -+ block_load(key_head + i * k_cache_stride_dim); -+ slm_block_store(key_slm_offset + tid * HD * sizeof(scalar_t) + -+ 8 * i * sizeof(scalar_t), -+ key_row); -+ } + -+ const scalar_t* value_head = -+ (const scalar_t*)value + -+ physical_block_number * v_cache_stride_tokens + -+ kv_head_idx * v_cache_stride_head + which_slot; -+ for (int i = 0; i < HD; i++) { -+ scalar_t temp_value = value_head[i * v_cache_stride_dim]; -+ slm_scalar_store(value_slm_offset + -+ tid * HD * sizeof(scalar_t) + -+ i * sizeof(scalar_t), -+ temp_value); -+ } -+ barrier(); -+ simd attnv; -+#pragma unroll -+ for (size_t r = 0; r < GS; ++r) { -+ simd key_row = slm_block_load( -+ key_slm_offset + r * HD * sizeof(scalar_t)); -+ scalar_t attn = -+ sycl::ext::intel::esimd::detail::sum( -+ query_row * key_row); -+ attnv[r] = attn; -+ } + def get_multi_modal_input(args): + """ + return { +diff --git a/examples/offline_inference/vision_language_multi_image.py b/examples/offline_inference/vision_language_multi_image.py +index 01c2905cf..2649c992b 100644 +--- a/examples/offline_inference/vision_language_multi_image.py ++++ b/examples/offline_inference/vision_language_multi_image.py +@@ -982,12 +982,14 @@ def load_qwen2_5_vl(question: str, image_urls: list[str]) -> ModelRequestData: + ) + smart_resize = None + +- model_name = "Qwen/Qwen2.5-VL-3B-Instruct" ++ model_name = "Qwen/Qwen2.5-VL-7B-Instruct" + + engine_args = EngineArgs( + model=model_name, + max_model_len=32768 if smart_resize is None else 4096, +- max_num_seqs=5, ++ max_num_seqs=2, ++ enforce_eager=True, ++ gpu_memory_utilization=0.8, + limit_mm_per_prompt={"image": len(image_urls)}, + ) + +diff --git a/examples/online_serving/structured_outputs/structured_outputs.py b/examples/online_serving/structured_outputs/structured_outputs.py +index 2a8f46372..990b47f22 100644 +--- a/examples/online_serving/structured_outputs/structured_outputs.py ++++ b/examples/online_serving/structured_outputs/structured_outputs.py +@@ -225,7 +225,7 @@ async def cli(): + ) + args = parser.parse_args() + +- base_url = os.getenv("OPENAI_BASE_URL", "http://localhost:8000/v1") ++ base_url = os.getenv("OPENAI_BASE_URL", "http://0.0.0.0:8000/v1") + client = openai.AsyncOpenAI(base_url=base_url, api_key="EMPTY") + constraints = list(PARAMS) if "*" in args.constraint else list(set(args.constraint)) + model = (await client.models.list()).data[0].id +@@ -236,6 +236,7 @@ async def cli(): + client.chat.completions.create( + model=model, + max_tokens=1024, ++ temperature=0, + stream=True, + **PARAMS[name], + ) +@@ -250,6 +251,7 @@ async def cli(): + client.chat.completions.create( + model=model, + max_tokens=1024, ++ temperature=0, + stream=False, + **PARAMS[name], + ) +diff --git a/requirements/common.txt b/requirements/common.txt +index b8665104b..a52745f69 100644 +--- a/requirements/common.txt ++++ b/requirements/common.txt +@@ -24,7 +24,7 @@ outlines_core == 0.2.11 + # required for outlines backend disk cache + diskcache == 5.6.3 + lark == 1.2.2 +-xgrammar == 
0.1.23; platform_machine == "x86_64" or platform_machine == "aarch64" or platform_machine == "arm64" ++xgrammar == 0.1.25; platform_machine == "x86_64" or platform_machine == "aarch64" or platform_machine == "arm64" + typing_extensions >= 4.10 + filelock >= 3.16.1 # need to contain https://github.com/tox-dev/filelock/pull/317 + partial-json-parser # used for parsing partial JSON outputs +diff --git a/requirements/xpu.txt b/requirements/xpu.txt +index 74f5b05b2..c0203a754 100644 +--- a/requirements/xpu.txt ++++ b/requirements/xpu.txt +@@ -11,9 +11,10 @@ jinja2>=3.1.6 + datasets # for benchmark scripts + numba == 0.60.0 # v0.61 doesn't support Python 3.9. Required for N-gram speculative decoding + nixl==0.3.0 # for PD disaggregation + -+ scalar_t new_max_attn = -+ std::max(hmax(attnv), max_attn); -+ scalar_t attn_exp = exp(max_attn - new_max_attn); -+ accv = accv * attn_exp; -+ -+ softmaxv = softmaxv * attn_exp; -+ max_attn = new_max_attn; -+ const simd attn_expv = exp(attnv - max_attn); -+#pragma unroll -+ for (size_t r = 0; r < GS; ++r) { -+ simd value_row = slm_block_load( -+ value_slm_offset + r * HD * sizeof(scalar_t)); -+ accv += value_row * attn_expv[r]; -+ } -+ softmaxv += attn_expv; -+ barrier(); -+ } -+ -+ // ######### End of handle n * GS part ########## -+ -+ // ################ Handle offset part #################### -+ scalar_t softmax = -+ sycl::ext::intel::esimd::detail::sum( -+ softmaxv); -+ -+ // ########### handle context offset ############ -+ if (tid < context_offset) { -+ size_t target_key_position = n * GS + tid; -+ int which_block = target_key_position / block_size; -+ int which_slot = target_key_position % block_size; -+ -+ int physical_block_number = block_table[which_block]; -+ const scalar_t* key_head = -+ (const scalar_t*)key + -+ physical_block_number * k_cache_stride_tokens + -+ kv_head_idx * k_cache_stride_head + -+ which_slot * k_cache_stride_block_size; -+ for (int i = 0; i < HD / x; i++) { -+ // Load 8 elements -+ simd key_row = -+ block_load(key_head + i * k_cache_stride_dim); -+ slm_block_store(key_slm_offset + tid * HD * sizeof(scalar_t) + -+ 8 * i * sizeof(scalar_t), -+ key_row); -+ } + torch==2.8.0+xpu + torchaudio + torchvision + --extra-index-url=https://download.pytorch.org/whl/xpu + +-intel-extension-for-pytorch @ https://intel-extension-for-pytorch.s3.us-east-1.amazonaws.com/ipex_dev/xpu/intel_extension_for_pytorch-2.8.10.post0%2Bxpu-cp312-cp312-linux_x86_64.whl ++intel-extension-for-pytorch @ https://intel-extension-for-pytorch.s3.us-east-1.amazonaws.com/ipex_dev/xpu/intel_extension_for_pytorch-2.8.10.post1%2Bxpu-cp312-cp312-linux_x86_64.whl +diff --git a/setup.py b/setup.py +index 67f65d9b9..eb313b7d2 100644 +--- a/setup.py ++++ b/setup.py +@@ -56,6 +56,8 @@ elif (sys.platform.startswith("linux") and torch.version.cuda is None + # fallback to cpu + VLLM_TARGET_DEVICE = "cpu" + ++MAIN_CUDA_VERSION = "12.8" + -+ const scalar_t* value_head = -+ (const scalar_t*)value + -+ physical_block_number * v_cache_stride_tokens + -+ kv_head_idx * v_cache_stride_head + which_slot; -+ for (int i = 0; i < HD; i++) { -+ // Seems to have an error here -+ scalar_t temp_value = value_head[i * v_cache_stride_dim]; -+ slm_scalar_store(value_slm_offset + -+ tid * HD * sizeof(scalar_t) + -+ i * sizeof(scalar_t), -+ temp_value); -+ } -+ } -+ -+ barrier(); -+ -+ if (token_position < seq_bound) { -+#pragma unroll -+ for (size_t r = 0; r < context_offset; ++r) { -+ simd key_row = slm_block_load( -+ key_slm_offset + r * HD * sizeof(scalar_t)); -+ simd value_row = 
slm_block_load( -+ value_slm_offset + r * HD * sizeof(scalar_t)); -+ scalar_t attn = -+ sycl::ext::intel::esimd::detail::sum( -+ query_row * key_row); -+ if (attn <= max_attn) { -+ scalar_t attn_exp = -+ sycl::ext::intel::esimd::exp(attn - max_attn); -+ accv += value_row * attn_exp; -+ softmax += attn_exp; -+ } else { -+ scalar_t attn_exp = -+ sycl::ext::intel::esimd::exp(max_attn - attn); -+ accv = accv * attn_exp + value_row; -+ softmax = softmax * attn_exp + 1; -+ max_attn = attn; -+ } -+ } -+ } -+ barrier(); -+ -+ // ############## handle seq offset ################# -+ if (token_position < seq_bound) { -+ const int64_t which_block = -+ static_cast(token_position / block_size); -+ const int64_t which_slot = -+ static_cast(token_position % block_size); -+ -+ const int64_t physical_block_number = -+ static_cast(block_table[which_block]); -+ -+ const scalar_t* key_head = -+ (const scalar_t*)key + -+ physical_block_number * k_cache_stride_tokens + -+ kv_head_idx * k_cache_stride_head + -+ which_slot * k_cache_stride_block_size; -+ -+ for (int i = 0; i < HD / x; i++) { -+ // Load 8 elements -+ simd key_row = -+ block_load(key_head + i * k_cache_stride_dim); -+ slm_block_store(key_slm_offset + tid * HD * sizeof(scalar_t) + -+ 8 * i * sizeof(scalar_t), -+ key_row); -+ } + + def is_sccache_available() -> bool: + return which("sccache") is not None and \ +@@ -505,7 +507,7 @@ def get_vllm_version() -> str: + version += f"{sep}precompiled" + else: + cuda_version = str(get_nvcc_cuda_version()) +- if cuda_version != envs.VLLM_MAIN_CUDA_VERSION: ++ if cuda_version != MAIN_CUDA_VERSION: + cuda_version_str = cuda_version.replace(".", "")[:3] + # skip this for source tarball, required for pypi + if "sdist" not in sys.argv: +@@ -513,7 +515,7 @@ def get_vllm_version() -> str: + elif _is_hip(): + # Get the Rocm Version + rocm_version = get_rocm_version() or torch.version.hip +- if rocm_version and rocm_version != envs.VLLM_MAIN_CUDA_VERSION: ++ if rocm_version and rocm_version != MAIN_CUDA_VERSION: + version += f"{sep}rocm{rocm_version.replace('.', '')[:3]}" + elif _is_tpu(): + version += f"{sep}tpu" +diff --git a/tests/entrypoints/openai/test_vision.py b/tests/entrypoints/openai/test_vision.py +index 29a3b40d2..72819f31d 100644 +--- a/tests/entrypoints/openai/test_vision.py ++++ b/tests/entrypoints/openai/test_vision.py +@@ -34,11 +34,11 @@ EXPECTED_MM_BEAM_SEARCH_RES = [ + ], + [ + "The image shows a Venn diagram with three over", +- "The image shows a Venn diagram with three intersect", ++ "This image shows a Venn diagram with three over", + ], + [ + "This image displays a gradient of colors ranging from", +- "The image displays a gradient of colors ranging from", ++ "This image displays a gradient of colors forming a spectrum", + ], + ] + +diff --git a/tests/kernels/attention/test_mha_attn.py b/tests/kernels/attention/test_mha_attn.py +index c01ea3299..d37b968ed 100644 +--- a/tests/kernels/attention/test_mha_attn.py ++++ b/tests/kernels/attention/test_mha_attn.py +@@ -36,31 +36,52 @@ def test_mha_attn_platform(device: str): + torch.set_default_dtype(torch.float16) + + if device == "cpu": +- with patch("vllm.attention.selector.current_platform", +- CpuPlatform()), \ +- patch("vllm.platforms.current_platform", CpuPlatform()): ++ with patch("vllm.attention.layer.current_platform", CpuPlatform()), \ ++ patch("vllm.model_executor.models.vision.current_platform", ++ CpuPlatform()): + attn = MultiHeadAttention(16, 64, scale=1) +- assert attn.attn_backend == _Backend.TORCH_SDPA_VLLM_V1 ++ assert 
attn.attn_backend == _Backend.TORCH_SDPA + elif device == "hip": +- with patch("vllm.attention.selector.current_platform", +- RocmPlatform()), \ +- patch("vllm.platforms.current_platform", RocmPlatform()), \ +- patch("vllm.attention.layer.current_platform", RocmPlatform()): ++ with patch("vllm.attention.layer.current_platform", RocmPlatform()), \ ++ patch("vllm.model_executor.models.vision.current_platform", ++ RocmPlatform()): + attn = MultiHeadAttention(16, 64, scale=1) + assert attn.attn_backend == _Backend.TORCH_SDPA + else: +- with patch("vllm.attention.selector.current_platform", +- CudaPlatform()), \ +- patch("vllm.platforms.current_platform", CudaPlatform()): ++ # Test CUDA with head_size=64 (divisible by 32) ++ # - should use vLLM's FlashAttention ++ with patch("vllm.attention.layer.current_platform", CudaPlatform()), \ ++ patch("vllm.model_executor.models.vision.current_platform", ++ CudaPlatform()): + attn = MultiHeadAttention(16, 64, scale=1) +- assert attn.attn_backend == _Backend.XFORMERS ++ assert attn.attn_backend == _Backend.FLASH_ATTN + +- with patch("vllm.attention.selector.current_platform", ++ # Test CUDA with head_size=72 (not divisible by 32) ++ # - with upstream FA not available ++ # - should use xformers ++ with patch("vllm.attention.layer.current_platform", CudaPlatform()), \ ++ patch("vllm.model_executor.models.vision.current_platform", + CudaPlatform()), \ +- patch("vllm.platforms.current_platform", CudaPlatform()): ++ patch("vllm.attention.layer.check_upstream_fa_availability", ++ return_value=False): + attn = MultiHeadAttention(16, 72, scale=1) + assert attn.attn_backend == _Backend.XFORMERS + ++ # Test CUDA with head_size=72 (not divisible by 32) ++ # - with upstream FA available ++ # - should use upstream FA ++ with patch("vllm.attention.layer.current_platform", CudaPlatform()), \ ++ patch("vllm.model_executor.models.vision.current_platform", ++ CudaPlatform()), \ ++ patch("vllm.attention.layer.check_upstream_fa_availability", ++ return_value=True), \ ++ patch.dict('sys.modules', {'flash_attn': type('MockFlashAttn', (), ++ { ++ 'flash_attn_varlen_func': lambda *args, **kwargs: None ++ })()}): ++ attn = MultiHeadAttention(16, 72, scale=1) ++ assert attn.attn_backend == _Backend.FLASH_ATTN ++ + + def ref_attention( + query: torch.Tensor, +diff --git a/tests/models/multimodal/processing/test_common.py b/tests/models/multimodal/processing/test_common.py +index ced0ab337..404854f54 100644 +--- a/tests/models/multimodal/processing/test_common.py ++++ b/tests/models/multimodal/processing/test_common.py +@@ -31,6 +31,7 @@ def glm4_1v_patch_mm_data(mm_data: MultiModalDataDict) -> MultiModalDataDict: + """ + # Ensure video metadata is included + if "video" in mm_data: ++ # GLM4.1V doesn't support multiple videos + video = mm_data["video"] + mm_data["video"] = (video, { + "total_num_frames": len(video), +@@ -41,6 +42,34 @@ def glm4_1v_patch_mm_data(mm_data: MultiModalDataDict) -> MultiModalDataDict: + return mm_data + + ++def qwen3_vl_patch_mm_data(mm_data: MultiModalDataDict) -> MultiModalDataDict: ++ """ ++ Patch the multimodal data for Qwen3-VL model. 
++ """ + -+ // [num_blocks, num_kv_heads, head_size, block_size] -+ const scalar_t* value_head = -+ (const scalar_t*)value + -+ physical_block_number * v_cache_stride_tokens + -+ kv_head_idx * v_cache_stride_head + which_slot; -+ for (int i = 0; i < HD; i++) { -+ scalar_t temp_value = value_head[i * v_cache_stride_dim]; -+ slm_scalar_store(value_slm_offset + -+ tid * HD * sizeof(scalar_t) + -+ i * sizeof(scalar_t), -+ temp_value); -+ } -+ } -+ barrier(); -+ -+ if (token_position < seq_bound) { -+ for (size_t r = 0; r <= tid; ++r) { -+ simd key_row = slm_block_load( -+ key_slm_offset + r * HD * sizeof(scalar_t)); -+ simd value_row = slm_block_load( -+ value_slm_offset + r * HD * sizeof(scalar_t)); -+ scalar_t attn = -+ sycl::ext::intel::esimd::detail::sum( -+ query_row * key_row); -+ if (attn <= max_attn) { -+ scalar_t attn_exp = -+ sycl::ext::intel::esimd::exp(attn - max_attn); -+ accv += value_row * attn_exp; -+ softmax += attn_exp; -+ } else { -+ scalar_t attn_exp = -+ sycl::ext::intel::esimd::exp(max_attn - attn); -+ accv = accv * attn_exp + value_row; -+ softmax = softmax * attn_exp + 1; -+ max_attn = attn; -+ } -+ } ++ def create_metadata(frames: np.ndarray): ++ num_frames = len(frames) ++ return { ++ "total_num_frames": num_frames, ++ "fps": 2.0, ++ "duration": num_frames / 2.0, ++ "video_backend": "opencv", ++ "frames_indices": list(range(num_frames)), ++ "do_sample_frames": True, ++ } + -+ if (softmax > 0) { -+ simd result = accv / softmax; -+ block_store(out_head, result); -+ } else { -+ simd result = 0; -+ block_store(out_head, result); -+ } -+ } -+ // ######## Ending of handling seq offset ########## -+ }); -+ }; -+ queue.submit(cgf); -+} ++ # Ensure video metadata is included ++ if "video" in mm_data: ++ video = mm_data["video"] ++ if isinstance(video, list): ++ # multiple videos ++ mm_data["video"] = [(vid, create_metadata(vid)) for vid in video] ++ else: ++ # single video ++ mm_data["video"] = (video, create_metadata(video)) ++ return mm_data + -+template -+void context_attention_kernel_v2( -+ void* query, void* key, void* value, const void* block_tables, -+ const float scale, const void* query_start_loc, const void* seq_lens, -+ const void* context_lens, const int block_size, -+ const int x, // x in kv_cache -+ void* out, // output -+ const int block_table_stride_batch, const int block_table_stride_seq, -+ const int query_stride_bs, const int query_stride_head, -+ const int query_stride_dim, const int k_cache_stride_tokens, -+ const int k_cache_stride_head, const int k_cache_stride_dim, -+ const int k_cache_stride_block_size, const int k_cache_stride_x, -+ const int v_cache_stride_tokens, const int v_cache_stride_head, -+ const int v_cache_stride_dim, const int v_cache_stride_block_size, -+ const int out_stride_tokens, const int out_stride_head, -+ const int num_queries_per_kv, const int max_input_length, -+ const int batch_size, const int num_heads, const int num_tokens, -+ const int max_context_len, const int max_q_len) { -+ constexpr int BLOCK_SIZE = 8; -+ constexpr int NUM_THREADS = 128; -+ // Each wrap handles one context block, therefore, each thread_group_size is -+ // this. 
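// Illustrative sketch, not part of the patch: the sizing arithmetic used just
// below, assuming WARP_SIZE == 32 (the sub-group size requested elsewhere in
// this file). One warp covers one KV block of BLOCK_SIZE tokens, so each token
// is served by a thread group of WARP_SIZE / BLOCK_SIZE threads; the kWarpSize
// and kBlockSize names here are illustrative only.
#include <algorithm>

constexpr int kWarpSize = 32;   // assumed sub-group width
constexpr int kBlockSize = 8;   // matches "constexpr int BLOCK_SIZE = 8" above
constexpr int kThreadGroupSize = std::max(kWarpSize / kBlockSize, 1);
static_assert(kThreadGroupSize == 4,
              "8-token KV blocks -> 4 threads cooperate on each token");
// With fp16 elements each thread then loads 16 / (4 * 2) == 2 elements per
// 16-byte fetch, which is the "Assume TGS=4 then 16 / 4 / sizeof(half) = 2"
// comment that follows.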
-+ constexpr int THREAD_GROUP_SIZE = MAX(WARP_SIZE / BLOCK_SIZE, 1); -+ // Each query, and key thread_group loads 16 bytes -+ // Assume TGS=4 then 16 / 4 / sizeof(half) = 2 -+ constexpr int VEC_SIZE = MAX(16 / (THREAD_GROUP_SIZE * sizeof(T)), 1); -+ using sycl_t = vllm::xpu::SyclTypeTrait::Type; -+ using Q_Vec = typename Vec::Type; -+ -+ // Assuming HD = 128, TGS = 2, then 128 / 2 / 2 = 32 -+ int num_vecs_per_thread = HD / THREAD_GROUP_SIZE / VEC_SIZE; -+ sycl_t* out_p = reinterpret_cast(out); -+ sycl_t* query_ptr = reinterpret_cast(query); -+ sycl_t* key_cache_ptr = reinterpret_cast(key); -+ sycl_t* value_cache_ptr = reinterpret_cast(value); -+ const int* query_loc_ptr = reinterpret_cast(query_start_loc); -+ const int* block_tables_ptr = reinterpret_cast(block_tables); -+ const int* context_lens_ptr = reinterpret_cast(context_lens); -+ const int* seq_lens_ptr = reinterpret_cast(seq_lens); -+ -+ constexpr int NUM_WARPS = NUM_THREADS / WARP_SIZE; -+ int padded_max_context_len = -+ DIVIDE_ROUND_UP(max_context_len + 1 + max_q_len, BLOCK_SIZE) * BLOCK_SIZE; -+ int logits_size = padded_max_context_len * sizeof(float); -+ int outputs_size = (NUM_WARPS / 2) * HD * sizeof(float); -+ // Python-side check in -+ // vllm.worker.worker._check_if_can_support_max_seq_len Keep that in -+ // sync with the logic here! -+ int shared_mem_size = std::max(logits_size, outputs_size); -+ // WARN: we have changed this... -+ sycl::range<3> grid(batch_size, num_heads, max_q_len); -+ // One work-group that is executing on the device -+ sycl::range<3> block(1, 1, NUM_THREADS); -+ sycl::queue& queue = vllm::xpu::vllmGetQueue(); -+ -+ auto cgf = [&](sycl::handler& handle) { -+ sycl::local_accessor dpct_local_acc_ct1( -+ sycl::range<1>(shared_mem_size), handle); -+ sycl::local_accessor q_vecs_acc_ct1( -+ sycl::range<1>(THREAD_GROUP_SIZE * num_vecs_per_thread), handle); -+ sycl::local_accessor red_smem_acc_ct1( -+ sycl::range<1>(2 * NUM_WARPS), handle); -+ -+ handle.parallel_for( -+ sycl::nd_range<3>(grid * block, block), -+ [=](sycl::nd_item<3> item_ct1) [[intel::reqd_sub_group_size(32)]] { -+ const int bsz_idx = item_ct1.get_group(0); -+ const int seq_idx = item_ct1.get_group(2); -+ constexpr bool USE_PARTITIONING = false; -+ int context_len = context_lens_ptr[bsz_idx] + seq_idx; -+ const int seq_len = seq_lens_ptr[bsz_idx]; -+ uint8_t* dpct_local = dpct_local_acc_ct1.get_pointer(); -+ Q_Vec* q_vecs = q_vecs_acc_ct1.get_pointer(); -+ float* red_smem = red_smem_acc_ct1.get_pointer(); -+ -+ // output_stream << "Original context_len: " << -+ // context_lens_ptr[bsz_idx] << sycl::endl; output_stream << -+ // "Batch_idx: " << bsz_idx << " Seq_idx: " << seq_idx -+ // << " Context_len: " << context_len << " Original context_len: " -+ // << context_lens_ptr[bsz_idx] << " Seq_len: " << seq_len -+ // << " Max input length: " << max_input_length -+ // << sycl::endl; -+ if (context_len >= seq_len) { -+ return; -+ } -+ -+ context_len = context_len + 1; -+ -+ const int num_context_blocks = -+ DIVIDE_ROUND_UP(context_len, BLOCK_SIZE); -+ const int num_blocks_per_partition = num_context_blocks; -+ -+ const int start_block_idx = 0; -+ const int end_block_idx = -+ MIN(start_block_idx + num_context_blocks, num_context_blocks); -+ -+ const int num_blocks = end_block_idx - start_block_idx; -+ const int start_token_idx = start_block_idx * BLOCK_SIZE; -+ const int end_token_idx = -+ MIN(start_token_idx + num_blocks * BLOCK_SIZE, context_len); -+ const int num_tokens = end_token_idx - start_token_idx; -+ constexpr int THREAD_GROUP_SIZE = 
MAX(WARP_SIZE / BLOCK_SIZE, 1); -+ constexpr int NUM_THREAD_GROUPS = -+ NUM_THREADS / -+ THREAD_GROUP_SIZE; // Note: This assumes THREAD_GROUP_SIZE -+ constexpr int NUM_TOKENS_PER_THREAD_GROUP = -+ DIVIDE_ROUND_UP(BLOCK_SIZE, WARP_SIZE); -+ constexpr int NUM_WARPS = NUM_THREADS / WARP_SIZE; -+ const int thread_idx = item_ct1.get_local_id(2); -+ const int warp_idx = thread_idx / WARP_SIZE; -+ const int lane = thread_idx % WARP_SIZE; -+ const int head_idx = item_ct1.get_group(1); -+ const int num_heads = item_ct1.get_group_range(1); -+ const int kv_head_idx = head_idx / num_queries_per_kv; -+ // TODO: consider alibi_slope later -+ constexpr int NUM_ELEMS_PER_THREAD = HD / THREAD_GROUP_SIZE; -+ constexpr int NUM_VECS_PER_THREAD = NUM_ELEMS_PER_THREAD / VEC_SIZE; -+ const int thread_group_idx = thread_idx / THREAD_GROUP_SIZE; -+ const int thread_group_offset = thread_idx % THREAD_GROUP_SIZE; -+ const sycl_t* q_ptr = -+ query_ptr + (query_loc_ptr[bsz_idx] + seq_idx) * query_stride_bs + -+ head_idx * HD; -+ -+#pragma unroll -+ for (int i = thread_group_idx; i < NUM_VECS_PER_THREAD; -+ i += NUM_THREAD_GROUPS) { -+ const int vec_idx = thread_group_offset + i * THREAD_GROUP_SIZE; -+ q_vecs[thread_group_offset * NUM_VECS_PER_THREAD + i] = -+ *reinterpret_cast(q_ptr + vec_idx * VEC_SIZE); -+ } -+ // Loaded q_vecs -+ item_ct1.barrier(sycl::access::fence_space::local_space); -+ auto shared_mem = (char*)dpct_local; -+ float* logits = reinterpret_cast(shared_mem); -+ constexpr int x = 16 / sizeof(sycl_t); -+ float qk_max = -FLT_MAX; -+ const int* block_table = -+ block_tables_ptr + bsz_idx * block_table_stride_batch; -+ -+ // Loading key -+ for (int block_idx = start_block_idx + warp_idx; -+ block_idx < end_block_idx; block_idx += NUM_WARPS) { -+ const int64_t physical_block_number = -+ static_cast(block_table[block_idx]); -+ for (int i = 0; i < NUM_TOKENS_PER_THREAD_GROUP; i++) { -+ const int physical_block_offset = -+ (thread_group_idx + i * WARP_SIZE) % BLOCK_SIZE; -+ const int token_idx = -+ block_idx * BLOCK_SIZE + physical_block_offset; -+ -+ Q_Vec k_vecs[NUM_VECS_PER_THREAD]; -+ -+#pragma unroll -+ for (int j = 0; j < NUM_VECS_PER_THREAD; j++) { -+ const sycl_t* k_ptr = -+ key_cache_ptr + -+ physical_block_number * k_cache_stride_tokens + -+ kv_head_idx * k_cache_stride_head + -+ physical_block_offset * x; -+ -+ const int vec_idx = thread_group_offset + j * THREAD_GROUP_SIZE; -+ const int offset1 = (vec_idx * VEC_SIZE) / x; -+ const int offset2 = (vec_idx * VEC_SIZE) % x; -+ k_vecs[j] = *reinterpret_cast( -+ k_ptr + offset1 * BLOCK_SIZE * x + offset2); -+ } -+ -+ // Compute dot product. -+ // This includes a reduction across the threads in the -+ // same thread group. Q_Vec_t -+ // q_vec_[NUM_VECS_PER_THREAD] = q_vecs + -+ // thread_group_offset * THREAD_GROUP_SIZE; -+ float qk = scale * -+ Qk_dot::template dot< -+ Q_Vec, NUM_VECS_PER_THREAD>( -+ q_vecs + thread_group_offset * NUM_VECS_PER_THREAD, -+ k_vecs, item_ct1); -+ -+ if (thread_group_offset == 0) { -+ // Store the partial reductions to shared memory. -+ // NOTE(woosuk): It is required to zero out the -+ // masked logits. -+ const bool mask = token_idx > context_len; -+ logits[token_idx - start_token_idx] = mask ? 0.f : qk; -+ qk_max = mask ? qk_max : sycl::fmax(qk_max, qk); -+ } -+ } -+ } -+#pragma unroll -+ for (int mask = WARP_SIZE / 2; mask >= THREAD_GROUP_SIZE; mask /= 2) { -+ /* -+ DPCT1096:38: The right-most dimension of the work-group used -+ in the SYCL kernel that calls this function may be less than -+ "32". 
The function "dpct::permute_sub_group_by_xor" may -+ return an unexpected result on the CPU device. Modify the -+ size of the work-group to ensure that the value of the -+ right-most dimension is a multiple of "32". -+ */ -+ qk_max = -+ sycl::fmax(qk_max, dpct::permute_sub_group_by_xor( -+ item_ct1.get_sub_group(), qk_max, mask)); -+ } -+ if (lane == 0) { -+ red_smem[warp_idx] = qk_max; -+ } -+ item_ct1.barrier(sycl::access::fence_space::local_space); -+ // TODO(woosuk): Refactor this part. -+ // Get the max qk value for the sequence. -+ qk_max = lane < NUM_WARPS ? red_smem[lane] : -FLT_MAX; -+#pragma unroll -+ for (int mask = NUM_WARPS / 2; mask >= 1; mask /= 2) { -+ /* -+ DPCT1096:39: The right-most dimension of the work-group used -+ in the SYCL kernel that calls this function may be less than -+ "32". The function "dpct::permute_sub_group_by_xor" may -+ return an unexpected result on the CPU device. Modify the -+ size of the work-group to ensure that the value of the -+ right-most dimension is a multiple of "32". -+ */ -+ qk_max = -+ sycl::fmax(qk_max, dpct::permute_sub_group_by_xor( -+ item_ct1.get_sub_group(), qk_max, mask)); -+ } -+ qk_max = -+ dpct::select_from_sub_group(item_ct1.get_sub_group(), qk_max, 0); -+ -+ // Get the sum of the exp values. -+ float exp_sum = 0.f; -+ for (int i = thread_idx; i < num_tokens; i += NUM_THREADS) { -+ float val = sycl::exp(logits[i] - qk_max); -+ logits[i] = val; -+ exp_sum += val; -+ } -+ exp_sum = -+ block_sum(&red_smem[NUM_WARPS], exp_sum, item_ct1); -+ // Compute softmax. -+ const float inv_sum = 1.f / (exp_sum + 1e-6f); -+#pragma unroll -+ for (int i = thread_idx; i < num_tokens; i += NUM_THREADS) { -+ logits[i] *= inv_sum; -+ } -+ -+ item_ct1.barrier(sycl::access::fence_space::local_space); -+ constexpr int V_VEC_SIZE = MIN(16 / sizeof(sycl_t), BLOCK_SIZE); -+ using V_vec = typename Vec::Type; -+ using L_vec = typename Vec::Type; -+ using Float_L_vec = typename FloatVec::Type; -+ constexpr int NUM_V_VECS_PER_ROW = BLOCK_SIZE / V_VEC_SIZE; -+ constexpr int NUM_ROWS_PER_ITER = WARP_SIZE / NUM_V_VECS_PER_ROW; -+ constexpr int NUM_ROWS_PER_THREAD = -+ DIVIDE_ROUND_UP(HD, NUM_ROWS_PER_ITER); -+ // NOTE(woosuk): We use FP32 for the accumulator for better -+ // accuracy. -+ float accs[NUM_ROWS_PER_THREAD]; -+#pragma unroll -+ for (int i = 0; i < NUM_ROWS_PER_THREAD; i++) { -+ accs[i] = 0.f; -+ } -+ -+ sycl_t zero_value; -+ zero(zero_value); -+ for (int block_idx = start_block_idx + warp_idx; -+ block_idx < end_block_idx; block_idx += NUM_WARPS) { -+ // NOTE(woosuk): The block number is stored in int32. -+ // However, we cast it to int64 because int32 can lead to -+ // overflow when this variable is multiplied by large -+ // numbers (e.g., kv_block_stride). 
-+ const int64_t physical_block_number = -+ static_cast(block_table[block_idx]); -+ const int physical_block_offset = -+ (lane % NUM_V_VECS_PER_ROW) * V_VEC_SIZE; -+ const int token_idx = -+ block_idx * BLOCK_SIZE + physical_block_offset; -+ L_vec logits_vec; -+ vllm::from_float( -+ logits_vec, *reinterpret_cast(logits + token_idx - -+ start_token_idx)); -+ -+ const sycl_t* v_ptr = -+ value_cache_ptr + -+ physical_block_number * v_cache_stride_tokens + -+ kv_head_idx * v_cache_stride_head; -+#pragma unroll -+ for (int i = 0; i < NUM_ROWS_PER_THREAD; i++) { -+ const int row_idx = -+ lane / NUM_V_VECS_PER_ROW + i * NUM_ROWS_PER_ITER; -+ if (row_idx < HD) { -+ const int offset = row_idx * BLOCK_SIZE + physical_block_offset; -+ V_vec v_vec = *reinterpret_cast(v_ptr + offset); -+ if (block_idx == num_context_blocks - 1) { -+ // NOTE(woosuk): When v_vec contains the tokens -+ // that are out of the context, we should -+ // explicitly zero out the values since they may -+ // contain NaNs. See -+ // https://github.com/vllm-project/vllm/issues/641#issuecomment-1682544472 -+ sycl_t* v_vec_ptr = reinterpret_cast(&v_vec); -+#pragma unroll -+ for (int j = 0; j < V_VEC_SIZE; j++) { -+ v_vec_ptr[j] = -+ token_idx + j < context_len ? v_vec_ptr[j] : zero_value; -+ } -+ } -+ accs[i] += vllm::dot(logits_vec, v_vec); -+ } -+ } -+ } -+ // Perform reduction within each warp. -+#pragma unroll -+ for (int i = 0; i < NUM_ROWS_PER_THREAD; i++) { -+ float acc = accs[i]; -+#pragma unroll -+ for (int mask = NUM_V_VECS_PER_ROW / 2; mask >= 1; mask /= 2) { -+ /* -+ DPCT1096:41: The right-most dimension of the work-group -+ used in the SYCL kernel that calls this function may be -+ less than "32". The function -+ "dpct::permute_sub_group_by_xor" may return an -+ unexpected result on the CPU device. Modify the size of -+ the work-group to ensure that the value of the -+ right-most dimension is a multiple of "32". -+ */ -+ acc += dpct::permute_sub_group_by_xor(item_ct1.get_sub_group(), -+ acc, mask); -+ } -+ accs[i] = acc; -+ } -+ -+ // NOTE(woosuk): A barrier is required because the shared memory -+ // space for logits is reused for the output. -+ -+ item_ct1.barrier(sycl::access::fence_space::local_space); -+ -+ // Perform reduction across warps. -+ float* out_smem = reinterpret_cast(shared_mem); -+#pragma unroll -+ for (int i = NUM_WARPS; i > 1; i /= 2) { -+ int mid = i / 2; -+ // Upper warps write to shared memory. -+ if (warp_idx >= mid && warp_idx < i) { -+ float* dst = &out_smem[(warp_idx - mid) * HD]; -+#pragma unroll -+ for (int i = 0; i < NUM_ROWS_PER_THREAD; i++) { -+ const int row_idx = -+ lane / NUM_V_VECS_PER_ROW + i * NUM_ROWS_PER_ITER; -+ if (row_idx < HD && lane % NUM_V_VECS_PER_ROW == 0) { -+ dst[row_idx] = accs[i]; -+ } -+ } -+ } + -+ item_ct1.barrier(sycl::access::fence_space::local_space); -+ -+ // Lower warps update the output. -+ if (warp_idx < mid) { -+ const float* src = &out_smem[warp_idx * HD]; -+#pragma unroll -+ for (int i = 0; i < NUM_ROWS_PER_THREAD; i++) { -+ const int row_idx = -+ lane / NUM_V_VECS_PER_ROW + i * NUM_ROWS_PER_ITER; -+ if (row_idx < HD && lane % NUM_V_VECS_PER_ROW == 0) { -+ accs[i] += src[row_idx]; -+ } -+ } -+ } -+ -+ item_ct1.barrier(sycl::access::fence_space::local_space); -+ } -+ -+ // Write the final output. 
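// Illustrative sketch, not part of the patch: the xor-shuffle loops above act
// as a butterfly reduction inside one sub-group. Modelling a 32-lane sub-group
// as an array, each round adds the partner lane whose index differs in exactly
// one bit, so after log2(32) rounds every lane holds the full sum; the second
// stage above then combines the NUM_WARPS per-warp results through shared
// memory by repeated halving.
#include <array>

constexpr int kLanes = 32;  // assumed sub-group width

inline float butterfly_sum(std::array<float, kLanes> lanes) {
  for (int mask = kLanes / 2; mask >= 1; mask /= 2) {
    std::array<float, kLanes> next = lanes;
    for (int i = 0; i < kLanes; ++i) {
      next[i] = lanes[i] + lanes[i ^ mask];  // add the partner lane's value
    }
    lanes = next;
  }
  return lanes[0];  // identical in every lane at this point
}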
-+ if (warp_idx == 0) { -+ sycl_t* out_ptr = -+ out_p + (query_loc_ptr[bsz_idx] + seq_idx) * out_stride_tokens + -+ head_idx * out_stride_head; -+ -+#pragma unroll -+ for (int i = 0; i < NUM_ROWS_PER_THREAD; i++) { -+ const int row_idx = -+ lane / NUM_V_VECS_PER_ROW + i * NUM_ROWS_PER_ITER; -+ if (row_idx < HD && lane % NUM_V_VECS_PER_ROW == 0) { -+ vllm::from_float(*(out_ptr + row_idx), accs[i]); -+ } -+ } -+ } -+ }); -+ // Each thread_group handles one token -+ }; -+ queue.submit(cgf); -+} + def _test_processing_correctness( + model_id_or_arch: str, + hit_rate: float, +@@ -181,8 +210,10 @@ _IGNORE_MM_KEYS = { + } + + MM_DATA_PATCHES = { +- # GLM4.1V requires video metadata to be included in the input ++ # GLM4.1V and Qwen3-VL requires video metadata to be included in the input + "glm4v": glm4_1v_patch_mm_data, ++ "qwen3_vl": qwen3_vl_patch_mm_data, ++ "qwen3_vl_moe": qwen3_vl_patch_mm_data, + } + + +@@ -328,6 +359,8 @@ def _test_processing_correctness_one( + "Qwen/Qwen2.5-VL-3B-Instruct", + "Qwen/Qwen2-Audio-7B-Instruct", + "Qwen/Qwen2.5-Omni-3B", ++ "Qwen/Qwen3-VL-4B-Instruct", ++ "Qwen/Qwen3-VL-30B-A3B-Instruct", + "YannQi/R-4B", + "Skywork/Skywork-R1V-38B", + "HuggingFaceTB/SmolVLM2-2.2B-Instruct", +diff --git a/tests/models/registry.py b/tests/models/registry.py +index 0c77ec5ef..696aee3cc 100644 +--- a/tests/models/registry.py ++++ b/tests/models/registry.py +@@ -449,6 +449,8 @@ _MULTIMODAL_EXAMPLE_MODELS = { + max_transformers_version="4.48", # noqa: E501 + transformers_version_reason="HF model is not compatible.", # noqa: E501 + hf_overrides={"architectures": ["DeepseekVLV2ForCausalLM"]}), # noqa: E501 ++ "DotsOCRForCausalLM": _HfExamplesInfo("rednote-hilab/dots.ocr", ++ trust_remote_code=True), + "Emu3ForConditionalGeneration": _HfExamplesInfo("BAAI/Emu3-Chat-hf"), + "Ernie4_5_VLMoeForConditionalGeneration": _HfExamplesInfo("baidu/ERNIE-4.5-VL-28B-A3B-PT", # noqa: E501 + trust_remote_code=True), +@@ -559,6 +561,12 @@ _MULTIMODAL_EXAMPLE_MODELS = { + max_model_len=4096), + "Qwen2_5OmniModel": _HfExamplesInfo("Qwen/Qwen2.5-Omni-3B"), + "Qwen2_5OmniForConditionalGeneration": _HfExamplesInfo("Qwen/Qwen2.5-Omni-7B-AWQ"), # noqa: E501 ++ "Qwen3VLForConditionalGeneration": _HfExamplesInfo("Qwen/Qwen3-VL-4B-Instruct", # noqa: E501 ++ max_model_len=4096, ++ min_transformers_version="4.57"), # noqa: E501 ++ "Qwen3VLMoeForConditionalGeneration": _HfExamplesInfo("Qwen/Qwen3-VL-30B-A3B-Instruct", # noqa: E501 ++ max_model_len=4096, ++ min_transformers_version="4.57"), + "RForConditionalGeneration": _HfExamplesInfo("YannQi/R-4B", + trust_remote_code=True), + "SkyworkR1VChatModel": _HfExamplesInfo("Skywork/Skywork-R1V-38B", +diff --git a/tests/quantization/test_cpu_offload.py b/tests/quantization/test_cpu_offload.py +index 08d9573ec..82a0e0cd8 100644 +--- a/tests/quantization/test_cpu_offload.py ++++ b/tests/quantization/test_cpu_offload.py +@@ -1,4 +1,4 @@ +-# SPDX-License-Identifier: Apache-2.0 ++# SPDX-License-Identifier: Apache-2.0 + # SPDX-FileCopyrightText: Copyright contributors to the vLLM project + + # Expanded quantized model tests for CPU offloading +@@ -11,6 +11,16 @@ from tests.quantization.utils import is_quant_method_supported + from ..utils import compare_two_settings + + ++@pytest.mark.skipif(not is_quant_method_supported("fp8"), ++ reason="fp8 is not supported on this GPU type.") ++def test_offload_weights_before_quant_fp8(): ++ # Test quantization of an unquantized checkpoint ++ compare_two_settings("meta-llama/Llama-3.2-1B-Instruct", ++ ["--quantization", "fp8"], 
["--quantization", "fp8"], ++ {"VLLM_OFFLOAD_WEIGHTS_BEFORE_QUANT": "1"}, ++ max_wait_seconds=480) ++ ++ + @pytest.mark.skipif(not is_quant_method_supported("fp8"), + reason="fp8 is not supported on this GPU type.") + def test_cpu_offload_fp8(): +diff --git a/tests/quantization/test_ipex_quant.py b/tests/quantization/test_ipex_quant.py +index 34b1b6c2e..4c8082646 100644 +--- a/tests/quantization/test_ipex_quant.py ++++ b/tests/quantization/test_ipex_quant.py +@@ -25,7 +25,7 @@ DTYPE = ["bfloat16"] + @pytest.mark.parametrize("model", MODELS) + @pytest.mark.parametrize("dtype", DTYPE) + def test_ipex_quant(vllm_runner, model, dtype): +- with vllm_runner(model, dtype=dtype) as llm: ++ with vllm_runner(model, dtype=dtype, enforce_eager=True, block_size=64) as llm: + output = llm.generate_greedy(["The capital of France is"], + max_tokens=32) + assert output +diff --git a/tests/utils.py b/tests/utils.py +index 16e1e6039..514da44f4 100644 +--- a/tests/utils.py ++++ b/tests/utils.py +@@ -1140,6 +1140,8 @@ def get_attn_backend_list_based_on_platform() -> list[str]: + print("Skip FLASH_ATTN_VLLM_V1 on ROCm as aiter is not installed") + + return attn_backend_list ++ elif current_platform.is_xpu(): ++ return ["FLASH_ATTN"] + else: + raise ValueError("Unsupported platform") + +diff --git a/tests/v1/e2e/test_correctness_sliding_window.py b/tests/v1/e2e/test_correctness_sliding_window.py +index 4dfe1d3bb..56f102253 100644 +--- a/tests/v1/e2e/test_correctness_sliding_window.py ++++ b/tests/v1/e2e/test_correctness_sliding_window.py +@@ -18,7 +18,7 @@ class TestConfig: + + model_config = { + "bigcode/starcoder2-3b": TestConfig(4096, (800, 1100)), +- "google/gemma-3-1b-it": TestConfig(4096, (400, 800)), ++ #"google/gemma-3-1b-it": TestConfig(4096, (400, 800)), + } + + +@@ -26,7 +26,7 @@ model_config = { + "model", + [ + "bigcode/starcoder2-3b", # sliding window only +- "google/gemma-3-1b-it", # sliding window + full attention ++ #"google/gemma-3-1b-it", # sliding window + full attention + ]) + @pytest.mark.parametrize("batch_size", [5]) + @pytest.mark.parametrize("seed", [1]) +@@ -46,7 +46,9 @@ def test_sliding_window_retrieval(monkeypatch, model, batch_size, seed, + + llm = LLM( + model=model, +- disable_hybrid_kv_cache_manager=disable_hybrid_kv_cache_manager) ++ disable_hybrid_kv_cache_manager=disable_hybrid_kv_cache_manager, ++ enforce_eager=True, ++ block_size=64) + sampling_params = SamplingParams(temperature=0.0, max_tokens=100) + + prompts, answer, indices = prep_prompts(batch_size, +diff --git a/tests/v1/e2e/test_spec_decode.py b/tests/v1/e2e/test_spec_decode.py +index 0b240b7d4..1ebd4fde4 100644 +--- a/tests/v1/e2e/test_spec_decode.py ++++ b/tests/v1/e2e/test_spec_decode.py +@@ -90,7 +90,7 @@ def test_ngram_correctness( + m.setenv("VLLM_USE_V1", "1") + test_prompts = get_test_prompts(mm_enabled=False) + +- ref_llm = LLM(model=model_name, max_model_len=1024) ++ ref_llm = LLM(model=model_name, max_model_len=1024, enforce_eager=True, block_size=64, dtype="float16") + ref_outputs = ref_llm.chat(test_prompts, sampling_config) + del ref_llm + torch.cuda.empty_cache() +@@ -105,6 +105,10 @@ def test_ngram_correctness( + "num_speculative_tokens": 3, + }, + max_model_len=1024, ++ enforce_eager=True, ++ block_size=64, ++ dtype="float16", ++ gpu_memory_utilization=0.6, + ) + spec_outputs = spec_llm.chat(test_prompts, sampling_config) + matches = 0 +@@ -125,30 +129,22 @@ def test_ngram_correctness( + cleanup_dist_env_and_memory() + + +-@pytest.mark.parametrize(["model_setup", "mm_enabled"], [ +- (("eagle3", 
"Qwen/Qwen3-8B", "AngelSlim/Qwen3-8B_eagle3", 1), False), +- (("eagle", "meta-llama/Llama-3.1-8B-Instruct", +- "yuhuili/EAGLE-LLaMA3.1-Instruct-8B", 1), False), +- (("eagle3", "meta-llama/Llama-3.1-8B-Instruct", +- "yuhuili/EAGLE3-LLaMA3.1-Instruct-8B", 1), False), +- pytest.param( +- ("eagle", "meta-llama/Llama-4-Scout-17B-16E-Instruct", +- "morgendave/EAGLE-Llama-4-Scout-17B-16E-Instruct", 4), +- False, +- marks=pytest.mark.skip(reason="Skipping due to CI OOM issues")), +- pytest.param( +- ("eagle", "meta-llama/Llama-4-Scout-17B-16E-Instruct", +- "morgendave/EAGLE-Llama-4-Scout-17B-16E-Instruct", 4), +- True, +- marks=pytest.mark.skip(reason="Skipping due to CI OOM issues")), +- (("eagle", "eagle618/deepseek-v3-random", +- "eagle618/eagle-deepseek-v3-random", 1), False), +-], +- ids=[ +- "qwen3_eagle3", "llama3_eagle", "llama3_eagle3", +- "llama4_eagle", "llama4_eagle_mm", +- "deepseek_eagle" +- ]) ++@pytest.mark.parametrize( ++ ["model_setup", "mm_enabled"], ++ [ ++ # TODO: Re-enable this once tests/models/test_initialization.py is fixed, see PR #22333 #22611 # noqa: E501 ++ # (("eagle3", "Qwen/Qwen3-8B", "AngelSlim/Qwen3-8B_eagle3", 1), False), ++ (("eagle", "meta-llama/Llama-3.1-8B-Instruct", ++ "yuhuili/EAGLE-LLaMA3.1-Instruct-8B", 1), False), ++ (("eagle3", "meta-llama/Llama-3.1-8B-Instruct", ++ "yuhuili/EAGLE3-LLaMA3.1-Instruct-8B", 1), False), ++ ], ++ ids=[ ++ # TODO: Re-enable this once tests/models/test_initialization.py is fixed, see PR #22333 #22611 # noqa: E501 ++ # "qwen3_eagle3", ++ "llama3_eagle", ++ "llama3_eagle3", ++ ]) + @pytest.mark.parametrize("attn_backend", + get_attn_backend_list_based_on_platform()) + def test_eagle_correctness( +@@ -188,7 +184,12 @@ def test_eagle_correctness( + + ref_llm = LLM(model=model_name, + max_model_len=2048, +- tensor_parallel_size=tp_size) ++ tensor_parallel_size=tp_size, ++ enforce_eager=True, ++ block_size=64, ++ dtype="float16", ++ gpu_memory_utilization=0.6, ++ ) + ref_outputs = ref_llm.chat(test_prompts, sampling_config) + del ref_llm + torch.cuda.empty_cache() +@@ -204,6 +205,10 @@ def test_eagle_correctness( + "num_speculative_tokens": 3, + "max_model_len": 2048, + }, ++ enforce_eager=True, ++ block_size=64, ++ dtype="float16", ++ gpu_memory_utilization=0.6, + max_model_len=2048, + ) + spec_outputs = spec_llm.chat(test_prompts, sampling_config) +diff --git a/tests/v1/kv_connector/nixl_integration/run_xpu_disagg_accuracy_test.sh b/tests/v1/kv_connector/nixl_integration/run_xpu_disagg_accuracy_test.sh +new file mode 100644 +index 000000000..ae4909b29 +--- /dev/null ++++ b/tests/v1/kv_connector/nixl_integration/run_xpu_disagg_accuracy_test.sh +@@ -0,0 +1,156 @@ ++#!/bin/bash ++set -e + -+template < -+ typename scalar_t, -+ typename Q_Vec_t, -+ int HEAD_SIZE, -+ int BLOCK_SIZE, -+ int NUM_THREADS, -+ int VEC_SIZE, -+ int PARTITION_SIZE = 0> // Zero means no partitioning. 
-+void paged_attention_kernel( -+ float* __restrict__ exp_sums, // [num_seqs, num_heads, max_num_partitions] -+ float* __restrict__ max_logits, // [num_seqs, num_heads, max_num_partitions] -+ scalar_t* __restrict__ out, // [num_seqs, num_heads, max_num_partitions, -+ // head_size] -+ const scalar_t* __restrict__ q, // [num_seqs, num_heads, head_size] -+ const scalar_t* __restrict__ k_cache, // [num_blocks, num_kv_heads, -+ // head_size/x, block_size, x] -+ const scalar_t* __restrict__ v_cache, // [num_blocks, num_kv_heads, -+ // head_size, block_size] -+ const int num_kv_heads, // [num_heads] -+ const float scale, -+ const int* __restrict__ block_tables, // [num_seqs, max_num_blocks_per_seq] -+ const int* __restrict__ context_lens, // [num_seqs] -+ const int max_num_blocks_per_seq, -+ const float* __restrict__ alibi_slopes, // [num_heads] -+ const int q_stride, -+ const int kv_block_stride, -+ const int kv_head_stride, -+ const float attn_logit_softcapping, -+ const sycl::nd_item<3>& item_ct1, -+ uint8_t* dpct_local, -+ Q_Vec_t* q_vecs, -+ float* red_smem) { -+ const int seq_idx = item_ct1.get_group(1); -+ const int partition_idx = item_ct1.get_group(0); -+ const int max_num_partitions = item_ct1.get_group_range(0); -+ constexpr bool USE_PARTITIONING = PARTITION_SIZE > 0; -+ const int context_len = context_lens[seq_idx]; -+ if (USE_PARTITIONING && partition_idx * PARTITION_SIZE >= context_len) { -+ // No work to do. Terminate the thread block. -+ return; -+ } -+ -+ const int num_context_blocks = DIVIDE_ROUND_UP(context_len, BLOCK_SIZE); -+ const int num_blocks_per_partition = -+ USE_PARTITIONING ? PARTITION_SIZE / BLOCK_SIZE : num_context_blocks; -+ -+ // [start_block_idx, end_block_idx) is the range of blocks to process. -+ const int start_block_idx = -+ USE_PARTITIONING ? partition_idx * num_blocks_per_partition : 0; -+ const int end_block_idx = -+ MIN(start_block_idx + num_blocks_per_partition, num_context_blocks); -+ const int num_blocks = end_block_idx - start_block_idx; -+ -+ // [start_token_idx, end_token_idx) is the range of tokens to process. -+ const int start_token_idx = start_block_idx * BLOCK_SIZE; -+ const int end_token_idx = -+ MIN(start_token_idx + num_blocks * BLOCK_SIZE, context_len); -+ const int num_tokens = end_token_idx - start_token_idx; -+ -+ constexpr int THREAD_GROUP_SIZE = MAX(WARP_SIZE / BLOCK_SIZE, 1); -+ constexpr int NUM_THREAD_GROUPS = -+ NUM_THREADS / THREAD_GROUP_SIZE; // Note: This assumes THREAD_GROUP_SIZE -+ // divides NUM_THREADS -+ assert(NUM_THREADS % THREAD_GROUP_SIZE == 0); -+ constexpr int NUM_TOKENS_PER_THREAD_GROUP = -+ DIVIDE_ROUND_UP(BLOCK_SIZE, WARP_SIZE); -+ constexpr int NUM_WARPS = NUM_THREADS / WARP_SIZE; -+ const int thread_idx = item_ct1.get_local_id(2); -+ const int warp_idx = thread_idx / WARP_SIZE; -+ const int lane = thread_idx % WARP_SIZE; -+ -+ const int head_idx = item_ct1.get_group(2); -+ const int num_heads = item_ct1.get_group_range(2); -+ const int num_queries_per_kv = num_heads / num_kv_heads; -+ -+ const int kv_head_idx = head_idx / num_queries_per_kv; -+ ; -+ const float alibi_slope = -+ alibi_slopes == nullptr ? 0.f : alibi_slopes[head_idx]; -+ -+ // A vector type to store a part of a key or a query. -+ // The vector size is configured in such a way that the threads in a thread -+ // group fetch or compute 16 bytes at a time. For example, if the size of a -+ // thread group is 4 and the data type is half, then the vector size is 16 / -+ // (4 * sizeof(half)) == 2. 
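// Worked check, not part of the patch, of the 16-byte rule described above: a
// thread group always fetches 16 bytes of a key per step, so the per-thread
// vector length depends on the group size and element width. vec_size() here
// is illustrative, not the kernel's own helper.
constexpr int vec_size(int thread_group_size, int elem_bytes) {
  int v = 16 / (thread_group_size * elem_bytes);
  return v > 0 ? v : 1;
}
static_assert(vec_size(4, 2) == 2, "group of 4, half  -> 2 elements per thread");
static_assert(vec_size(4, 4) == 1, "group of 4, float -> 1 element per thread");
static_assert(vec_size(2, 2) == 4, "group of 2, half  -> 4 elements per thread");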
-+ -+ // constexpr int VEC_SIZE = MAX(16 / (THREAD_GROUP_SIZE * sizeof(scalar_t)), -+ // 1); -+ -+ constexpr int NUM_ELEMS_PER_THREAD = HEAD_SIZE / THREAD_GROUP_SIZE; -+ constexpr int NUM_VECS_PER_THREAD = NUM_ELEMS_PER_THREAD / VEC_SIZE; -+ -+ const int thread_group_idx = thread_idx / THREAD_GROUP_SIZE; -+ const int thread_group_offset = thread_idx % THREAD_GROUP_SIZE; -+ -+ // Load the query to registers. -+ // Each thread in a thread group has a different part of the query. -+ // For example, if the the thread group size is 4, then the first thread in -+ // the group has 0, 4, 8, ... th vectors of the query, and the second thread -+ // has 1, 5, 9, ... th vectors of the query, and so on. NOTE(woosuk): Because -+ // q is split from a qkv tensor, it may not be contiguous. -+ const scalar_t* q_ptr = q + seq_idx * q_stride + head_idx * HEAD_SIZE; -+ -+#pragma unroll -+ for (int i = thread_group_idx; i < NUM_VECS_PER_THREAD; -+ i += NUM_THREAD_GROUPS) { -+ const int vec_idx = thread_group_offset + i * THREAD_GROUP_SIZE; -+ q_vecs[thread_group_offset * NUM_VECS_PER_THREAD + i] = -+ *reinterpret_cast(q_ptr + vec_idx * VEC_SIZE); -+ } -+ /* -+ DPCT1065:5: Consider replacing sycl::nd_item::barrier() with -+ sycl::nd_item::barrier(sycl::access::fence_space::local_space) for better -+ performance if there is no access to global memory. -+ */ -+ item_ct1.barrier(sycl::access::fence_space::local_space); // TODO(naed90): possible speedup if this is replaced with -+ // a memory wall right before we use q_vecs -+ -+ // Memory planning. -+ auto shared_mem = (char*)dpct_local; -+ // NOTE(woosuk): We use FP32 for the softmax logits for better accuracy. -+ float* logits = reinterpret_cast(shared_mem); -+ // Workspace for reduction. -+ -+ // x == THREAD_GROUP_SIZE * VEC_SIZE -+ // Each thread group fetches x elements from the key at a time. -+ constexpr int x = 16 / sizeof(scalar_t); -+ float qk_max = -FLT_MAX; -+ -+ // Iterate over the key blocks. -+ // Each warp fetches a block of keys for each iteration. -+ // Each thread group in a warp fetches a key from the block, and computes -+ // dot product with the query. -+ const int* block_table = block_tables + seq_idx * max_num_blocks_per_seq; -+ -+ for (int block_idx = start_block_idx + warp_idx; block_idx < end_block_idx; -+ block_idx += NUM_WARPS) { -+ // NOTE(woosuk): The block number is stored in int32. However, we cast it to -+ // int64 because int32 can lead to overflow when this variable is multiplied -+ // by large numbers (e.g., kv_block_stride). -+ const int64_t physical_block_number = -+ static_cast(block_table[block_idx]); -+ -+ // Load a key to registers. -+ // Each thread in a thread group has a different part of the key. -+ // For example, if the the thread group size is 4, then the first thread in -+ // the group has 0, 4, 8, ... th vectors of the key, and the second thread -+ // has 1, 5, 9, ... th vectors of the key, and so on. 
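// Illustrative sketch, not part of the patch: the round-robin ownership
// described above. Thread t of a G-thread group owns vectors t, t+G, t+2G,
// ..., which is exactly the vec_idx = thread_group_offset + j * THREAD_GROUP_SIZE
// formula in the loop below; owned_vec() is an illustrative name.
constexpr int owned_vec(int thread_group_offset, int j, int thread_group_size) {
  return thread_group_offset + j * thread_group_size;
}
// With a group of 4 threads: thread 0 reads vectors 0, 4, 8, ... and
// thread 1 reads vectors 1, 5, 9, ... as in the comment above.
static_assert(owned_vec(0, 2, 4) == 8, "thread 0, third vector");
static_assert(owned_vec(1, 2, 4) == 9, "thread 1, third vector");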
-+ -+ for (int i = 0; i < NUM_TOKENS_PER_THREAD_GROUP; i++) { -+ const int physical_block_offset = -+ (thread_group_idx + i * WARP_SIZE) % BLOCK_SIZE; -+ const int token_idx = block_idx * BLOCK_SIZE + physical_block_offset; -+ -+ Q_Vec_t k_vecs[NUM_VECS_PER_THREAD]; -+ -+#pragma unroll -+ for (int j = 0; j < NUM_VECS_PER_THREAD; j++) { -+ const scalar_t* k_ptr = k_cache + -+ physical_block_number * kv_block_stride + -+ kv_head_idx * kv_head_stride + physical_block_offset * x; -+ -+ const int vec_idx = thread_group_offset + j * THREAD_GROUP_SIZE; -+ const int offset1 = (vec_idx * VEC_SIZE) / x; -+ const int offset2 = (vec_idx * VEC_SIZE) % x; -+ k_vecs[j] = *reinterpret_cast( -+ k_ptr + offset1 * BLOCK_SIZE * x + offset2); -+ } -+ -+ // Compute dot product. -+ // This includes a reduction across the threads in the same thread group. -+ // Q_Vec_t q_vec_[NUM_VECS_PER_THREAD] = q_vecs + thread_group_offset * -+ // THREAD_GROUP_SIZE; -+ float qk = scale * -+ Qk_dot:: -+ template dot( -+ q_vecs + thread_group_offset * NUM_VECS_PER_THREAD, -+ k_vecs, -+ item_ct1); -+ // Add the ALiBi bias if slopes are given. -+ qk += -+ (alibi_slope != 0) ? alibi_slope * (token_idx - context_len + 1) : 0; -+ -+ // Add the attn_logit_softcapp if given. -+ if (attn_logit_softcapping != 0.0) { -+ qk = attn_softcapping(qk, attn_logit_softcapping); -+ } -+ if (thread_group_offset == 0) { -+ // Store the partial reductions to shared memory. -+ // NOTE(woosuk): It is required to zero out the masked logits. -+ const bool mask = token_idx >= context_len; -+ logits[token_idx - start_token_idx] = mask ? 0.f : qk; -+ // Update the max value. -+ qk_max = mask ? qk_max : sycl::fmax(qk_max, qk); -+ } -+ } -+ } ++# Hosts / ports ++PREFILL_HOST=${PREFILL_HOST:-"localhost"} ++PREFILL_PORT=${PREFILL_PORT:-8100} ++PREFILL_NIXL_SIDE_PORT=${PREFILL_NIXL_SIDE_PORT:-5577} ++DECODE_HOST=${DECODE_HOST:-"localhost"} ++DECODE_PORT=${DECODE_PORT:-8200} ++PROXY_HOST=${PROXY_HOST:-"localhost"} ++PROXY_PORT=${PROXY_PORT:-8192} ++BASELINE_HOST=${BASELINE_HOST:-"localhost"} ++BASELINE_PORT=${BASELINE_PORT:-9290} + -+ // Perform reduction across the threads in the same warp to get the -+ // max qk value for each "warp" (not across the thread block yet). -+ // The 0-th thread of each thread group already has its max qk value. -+#pragma unroll -+ for (int mask = WARP_SIZE / 2; mask >= THREAD_GROUP_SIZE; mask /= 2) { -+ -+ /* -+ DPCT1096:38: The right-most dimension of the work-group used in the SYCL -+ kernel that calls this function may be less than "32". The function -+ "dpct::permute_sub_group_by_xor" may return an unexpected result on the CPU -+ device. Modify the size of the work-group to ensure that the value of the -+ right-most dimension is a multiple of "32". -+ */ -+ qk_max = sycl::fmax( -+ qk_max, -+ dpct::permute_sub_group_by_xor( -+ item_ct1.get_sub_group(), qk_max, mask)); -+ } -+ if (lane == 0) { -+ red_smem[warp_idx] = qk_max; -+ } -+ -+ item_ct1.barrier(sycl::access::fence_space::local_space); + -+ // TODO(woosuk): Refactor this part. -+ // Get the max qk value for the sequence. -+ qk_max = lane < NUM_WARPS ? red_smem[lane] : -FLT_MAX; -+#pragma unroll -+ for (int mask = NUM_WARPS / 2; mask >= 1; mask /= 2) { -+ -+ /* -+ DPCT1096:39: The right-most dimension of the work-group used in the SYCL -+ kernel that calls this function may be less than "32". The function -+ "dpct::permute_sub_group_by_xor" may return an unexpected result on the CPU -+ device. 
Modify the size of the work-group to ensure that the value of the -+ right-most dimension is a multiple of "32". -+ */ -+ qk_max = sycl::fmax( -+ qk_max, -+ dpct::permute_sub_group_by_xor( -+ item_ct1.get_sub_group(), qk_max, mask)); -+ } -+ // Broadcast the max qk value to all threads. -+ -+ /* -+ DPCT1096:40: The right-most dimension of the work-group used in the SYCL -+ kernel that calls this function may be less than "32". The function -+ "dpct::select_from_sub_group" may return an unexpected result on the CPU -+ device. Modify the size of the work-group to ensure that the value of the -+ right-most dimension is a multiple of "32". -+ */ -+ qk_max = dpct::select_from_sub_group( -+ item_ct1.get_sub_group(), qk_max, 0); -+ -+ // Get the sum of the exp values. -+ float exp_sum = 0.f; -+ for (int i = thread_idx; i < num_tokens; i += NUM_THREADS) { -+ float val = sycl::exp(logits[i] - qk_max); -+ logits[i] = val; -+ exp_sum += val; -+ } -+ exp_sum = block_sum(&red_smem[NUM_WARPS], exp_sum, item_ct1); -+ -+ // Compute softmax. -+ const float inv_sum = 1.f / (exp_sum + 1e-6f); -+ for (int i = thread_idx; i < num_tokens; i += NUM_THREADS) { -+ logits[i] *= inv_sum; -+ } -+ -+ item_ct1.barrier(sycl::access::fence_space::local_space); -+ -+ // If partitioning is enabled, store the max logit and exp_sum. -+ if (USE_PARTITIONING && thread_idx == 0) { -+ float* max_logits_ptr = max_logits + -+ seq_idx * num_heads * max_num_partitions + -+ head_idx * max_num_partitions + partition_idx; -+ *max_logits_ptr = qk_max; -+ float* exp_sums_ptr = exp_sums + seq_idx * num_heads * max_num_partitions + -+ head_idx * max_num_partitions + partition_idx; -+ *exp_sums_ptr = exp_sum; -+ } -+ -+ // Each thread will fetch 16 bytes from the value cache at a time. -+ constexpr int V_VEC_SIZE = MIN(16 / sizeof(scalar_t), BLOCK_SIZE); -+ using V_vec = typename Vec::Type; -+ using L_vec = typename Vec::Type; -+ using Float_L_vec = typename FloatVec::Type; -+ -+ constexpr int NUM_V_VECS_PER_ROW = BLOCK_SIZE / V_VEC_SIZE; -+ constexpr int NUM_ROWS_PER_ITER = WARP_SIZE / NUM_V_VECS_PER_ROW; -+ constexpr int NUM_ROWS_PER_THREAD = -+ DIVIDE_ROUND_UP(HEAD_SIZE, NUM_ROWS_PER_ITER); -+ -+ // NOTE(woosuk): We use FP32 for the accumulator for better accuracy. -+ float accs[NUM_ROWS_PER_THREAD]; -+#pragma unroll -+ for (int i = 0; i < NUM_ROWS_PER_THREAD; i++) { -+ accs[i] = 0.f; -+ } -+ -+ scalar_t zero_value; -+ zero(zero_value); -+ for (int block_idx = start_block_idx + warp_idx; block_idx < end_block_idx; -+ block_idx += NUM_WARPS) { -+ // NOTE(woosuk): The block number is stored in int32. However, we cast it to -+ // int64 because int32 can lead to overflow when this variable is multiplied -+ // by large numbers (e.g., kv_block_stride). 
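// Illustrative sketch, not part of the patch: the flat element offset implied
// by the [num_blocks, num_kv_heads, head_size, block_size] value-cache layout
// that the v_ptr / row_idx / physical_block_offset arithmetic below walks.
// value_elem_offset() and its parameter names are illustrative only.
#include <cstdint>

constexpr int64_t value_elem_offset(int64_t block, int64_t kv_head, int64_t row,
                                    int64_t slot, int64_t num_kv_heads,
                                    int64_t head_size, int64_t block_size) {
  return ((block * num_kv_heads + kv_head) * head_size + row) * block_size + slot;
}
// e.g. 8 KV heads, head_size 128, block_size 8: one block spans 8*128*8
// elements, and the 8 slots of a given row are contiguous.
static_assert(value_elem_offset(1, 0, 0, 0, 8, 128, 8) == 8 * 128 * 8,
              "start of the second block");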
-+ const int64_t physical_block_number = -+ static_cast(block_table[block_idx]); -+ const int physical_block_offset = (lane % NUM_V_VECS_PER_ROW) * V_VEC_SIZE; -+ const int token_idx = block_idx * BLOCK_SIZE + physical_block_offset; -+ L_vec logits_vec; -+ vllm::from_float( -+ logits_vec, -+ *reinterpret_cast(logits + token_idx - start_token_idx)); -+ -+ const scalar_t* v_ptr = v_cache + physical_block_number * kv_block_stride + -+ kv_head_idx * kv_head_stride; -+#pragma unroll -+ for (int i = 0; i < NUM_ROWS_PER_THREAD; i++) { -+ const int row_idx = lane / NUM_V_VECS_PER_ROW + i * NUM_ROWS_PER_ITER; -+ if (row_idx < HEAD_SIZE) { -+ const int offset = row_idx * BLOCK_SIZE + physical_block_offset; -+ V_vec v_vec = *reinterpret_cast(v_ptr + offset); -+ if (block_idx == num_context_blocks - 1) { -+ // NOTE(woosuk): When v_vec contains the tokens that are out of the -+ // context, we should explicitly zero out the values since they may -+ // contain NaNs. See -+ // https://github.com/vllm-project/vllm/issues/641#issuecomment-1682544472 -+ scalar_t* v_vec_ptr = reinterpret_cast(&v_vec); -+#pragma unroll -+ for (int j = 0; j < V_VEC_SIZE; j++) { -+ v_vec_ptr[j] = -+ token_idx + j < context_len ? v_vec_ptr[j] : zero_value; -+ } -+ } -+ accs[i] += vllm::dot(logits_vec, v_vec); -+ } -+ } -+ } -+ -+ // Perform reduction within each warp. -+#pragma unroll -+ for (int i = 0; i < NUM_ROWS_PER_THREAD; i++) { -+ float acc = accs[i]; -+#pragma unroll -+ for (int mask = NUM_V_VECS_PER_ROW / 2; mask >= 1; mask /= 2) { -+ -+ /* -+ DPCT1096:41: The right-most dimension of the work-group used in the SYCL -+ kernel that calls this function may be less than "32". The function -+ "dpct::permute_sub_group_by_xor" may return an unexpected result on the -+ CPU device. Modify the size of the work-group to ensure that the value of -+ the right-most dimension is a multiple of "32". -+ */ -+ acc += dpct::permute_sub_group_by_xor( -+ item_ct1.get_sub_group(), acc, mask); -+ } -+ accs[i] = acc; -+ } -+ -+ // NOTE(woosuk): A barrier is required because the shared memory space for -+ // logits is reused for the output. -+ -+ item_ct1.barrier(sycl::access::fence_space::local_space); -+ -+ // Perform reduction across warps. -+ float* out_smem = reinterpret_cast(shared_mem); -+#pragma unroll -+ for (int i = NUM_WARPS; i > 1; i /= 2) { -+ int mid = i / 2; -+ // Upper warps write to shared memory. -+ if (warp_idx >= mid && warp_idx < i) { -+ float* dst = &out_smem[(warp_idx - mid) * HEAD_SIZE]; -+#pragma unroll -+ for (int i = 0; i < NUM_ROWS_PER_THREAD; i++) { -+ const int row_idx = lane / NUM_V_VECS_PER_ROW + i * NUM_ROWS_PER_ITER; -+ if (row_idx < HEAD_SIZE && lane % NUM_V_VECS_PER_ROW == 0) { -+ dst[row_idx] = accs[i]; -+ } -+ } -+ } -+ -+ item_ct1.barrier(sycl::access::fence_space::local_space); -+ -+ // Lower warps update the output. -+ if (warp_idx < mid) { -+ const float* src = &out_smem[warp_idx * HEAD_SIZE]; -+#pragma unroll -+ for (int i = 0; i < NUM_ROWS_PER_THREAD; i++) { -+ const int row_idx = lane / NUM_V_VECS_PER_ROW + i * NUM_ROWS_PER_ITER; -+ if (row_idx < HEAD_SIZE && lane % NUM_V_VECS_PER_ROW == 0) { -+ accs[i] += src[row_idx]; -+ } -+ } -+ } -+ -+ item_ct1.barrier(sycl::access::fence_space::local_space); -+ } -+ -+ // Write the final output. 
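// Illustrative sketch, not part of the patch: the out_ptr computed just below
// addresses a [num_seqs, num_heads, max_num_partitions, head_size] tensor;
// warp 0 of each work-group writes one head_size-long row of it.
// out_row_offset() is an illustrative name.
#include <cstdint>

constexpr int64_t out_row_offset(int64_t seq, int64_t head, int64_t partition,
                                 int64_t num_heads, int64_t max_partitions,
                                 int64_t head_size) {
  return ((seq * num_heads + head) * max_partitions + partition) * head_size;
}
static_assert(out_row_offset(2, 3, 0, /*num_heads=*/32, /*max_partitions=*/1,
                             /*head_size=*/128) == 2 * 32 * 128 + 3 * 128,
              "row start for sequence 2, head 3, single partition");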
-+ if (warp_idx == 0) { -+ scalar_t* out_ptr = out + -+ seq_idx * num_heads * max_num_partitions * HEAD_SIZE + -+ head_idx * max_num_partitions * HEAD_SIZE + partition_idx * HEAD_SIZE; -+#pragma unroll -+ for (int i = 0; i < NUM_ROWS_PER_THREAD; i++) { -+ const int row_idx = lane / NUM_V_VECS_PER_ROW + i * NUM_ROWS_PER_ITER; -+ if (row_idx < HEAD_SIZE && lane % NUM_V_VECS_PER_ROW == 0) { -+ vllm::from_float(*(out_ptr + row_idx), accs[i]); -+ } -+ } -+ } -+} ++# Model to run. ++MODEL_NAME=${MODEL_NAME:-"Qwen/Qwen3-0.6B"} ++MAX_MODEL_LEN=${MAX_MODEL_LEN:-1024} ++BLOCK_SIZE=${BLOCK_SIZE:-16} + -+// Grid: (num_heads, num_seqs, 1). -+template < -+ typename scalar_t, -+ typename Q_Vec_t, -+ int HEAD_SIZE, -+ int BLOCK_SIZE, -+ int NUM_THREADS, -+ int VEC_SIZE> -+void paged_attention_v1_kernel( -+ scalar_t* __restrict__ out, // [num_seqs, num_heads, head_size] -+ const scalar_t* __restrict__ q, // [num_seqs, num_heads, head_size] -+ const scalar_t* __restrict__ k_cache, // [num_blocks, num_kv_heads, -+ // head_size/x, block_size, x] -+ const scalar_t* __restrict__ v_cache, // [num_blocks, num_kv_heads, -+ // head_size, block_size] -+ const int num_kv_heads, // [num_heads] -+ const float scale, -+ const int* __restrict__ block_tables, // [num_seqs, max_num_blocks_per_seq] -+ const int* __restrict__ context_lens, // [num_seqs] -+ const int max_num_blocks_per_seq, -+ const float* __restrict__ alibi_slopes, // [num_heads] -+ const int q_stride, -+ const int kv_block_stride, -+ const int kv_head_stride, -+ const float attn_logit_softcapping, -+ const sycl::nd_item<3>& item_ct1, -+ uint8_t* dpct_local, -+ Q_Vec_t* q_vecs, -+ float* red_smem) { -+ paged_attention_kernel< -+ scalar_t, -+ Q_Vec_t, -+ HEAD_SIZE, -+ BLOCK_SIZE, -+ NUM_THREADS, -+ VEC_SIZE>( -+ /* exp_sums */ nullptr, -+ /* max_logits */ nullptr, -+ out, -+ q, -+ k_cache, -+ v_cache, -+ num_kv_heads, -+ scale, -+ block_tables, -+ context_lens, -+ max_num_blocks_per_seq, -+ alibi_slopes, -+ q_stride, -+ kv_block_stride, -+ kv_head_stride, -+ attn_logit_softcapping, -+ item_ct1, -+ dpct_local, -+ q_vecs, -+ red_smem); -+} + -+#define LAUNCH_ATTENTION_KERNEL(T, HEAD_SIZE, BLOCK_SIZE) \ -+ paged_attention_xpu_v1_impl::call( \ -+ out_ptr, \ -+ query_ptr, \ -+ key_cache_ptr, \ -+ value_cache_ptr, \ -+ num_kv_heads, \ -+ scale, \ -+ block_tables_ptr, \ -+ context_lens_ptr, \ -+ max_num_blocks_per_seq, \ -+ alibi_slopes_ptr, \ -+ q_stride, \ -+ kv_block_stride, \ -+ kv_head_stride, \ -+ num_seqs, \ -+ num_heads, \ -+ num_blocks); -+ -+#define LAUNCH_PAGED_ATTENTION_V1(HEAD_SIZE) \ -+ event = queue.submit([&](sycl::handler& cgh) { \ -+ sycl::local_accessor dpct_local_acc_ct1( \ -+ sycl::range<1>(shared_mem_size), cgh); \ -+ sycl::local_accessor q_vecs_acc_ct1( \ -+ sycl::range<1>(THREAD_GROUP_SIZE * num_vecs_per_thread), cgh); \ -+ sycl::local_accessor red_smem_acc_ct1( \ -+ sycl::range<1>(2 * NUM_WARPS), cgh); \ -+ \ -+ auto out_ptr_ct0 = out_ptr; \ -+ auto query_ptr_ct1 = query_ptr; \ -+ auto key_cache_ptr_ct2 = key_cache_ptr; \ -+ auto value_cache_ptr_ct3 = value_cache_ptr; \ -+ auto scale_ct5 = scale; \ -+ auto block_tables_ptr_ct6 = block_tables_ptr; \ -+ auto context_lens_ptr_ct7 = context_lens_ptr; \ -+ auto max_num_blocks_per_seq_ct8 = max_num_blocks_per_seq; \ -+ auto alibi_slopes_ptr_ct9 = alibi_slopes_ptr; \ -+ auto q_stride_ct10 = q_stride; \ -+ auto kv_block_stride_ct11 = kv_block_stride; \ -+ auto kv_head_stride_ct12 = kv_head_stride; \ -+ auto attn_logit_softcapping_ct13 = attn_logit_softcapping; \ -+ \ -+ cgh.parallel_for( \ -+ 
sycl::nd_range<3>(grid * block, block), \ -+ [=](sycl::nd_item<3> item_ct1) [[intel::reqd_sub_group_size(32)]] { \ -+ paged_attention_v1_kernel< \ -+ sycl_t, \ -+ Q_Vec, \ -+ HEAD_SIZE, \ -+ BLOCK_SIZE, \ -+ NUM_THREADS, \ -+ VEC_SIZE>( \ -+ out_ptr_ct0, \ -+ query_ptr_ct1, \ -+ key_cache_ptr_ct2, \ -+ value_cache_ptr_ct3, \ -+ num_kv_heads, \ -+ scale_ct5, \ -+ block_tables_ptr_ct6, \ -+ context_lens_ptr_ct7, \ -+ max_num_blocks_per_seq_ct8, \ -+ alibi_slopes_ptr_ct9, \ -+ q_stride_ct10, \ -+ kv_block_stride_ct11, \ -+ kv_head_stride_ct12, \ -+ attn_logit_softcapping_ct13, \ -+ item_ct1, \ -+ dpct_local_acc_ct1.get_pointer(), \ -+ q_vecs_acc_ct1.get_pointer(), \ -+ red_smem_acc_ct1.get_pointer()); \ -+ }); \ -+ }); -+ -+template -+void paged_attention_xpu_v1_impl_launcher( -+ torch::Tensor& out, -+ torch::Tensor& query, -+ torch::Tensor& key_cache, -+ torch::Tensor& value_cache, -+ int num_kv_heads, -+ float scale, -+ torch::Tensor& block_tables, -+ torch::Tensor& context_lens, -+ int max_context_len, -+ const c10::optional& alibi_slopes, -+ const float attn_logit_softcapping) { -+ int num_seqs = query.size(0); -+ int num_heads = query.size(1); -+ int head_size = query.size(2); -+ int max_num_blocks_per_seq = block_tables.size(1); -+ int q_stride = query.stride(0); -+ int kv_block_stride = key_cache.stride(0); -+ int kv_head_stride = key_cache.stride(1); -+ -+ constexpr int THREAD_GROUP_SIZE = MAX(WARP_SIZE / BLOCK_SIZE, 1); -+ constexpr int VEC_SIZE = MAX(16 / (THREAD_GROUP_SIZE * sizeof(T)), 1); -+ using sycl_t = vllm::xpu::SyclTypeTrait::Type; -+ using Q_Vec = typename Vec::Type; -+ -+ int num_vecs_per_thread = head_size / THREAD_GROUP_SIZE / VEC_SIZE; -+ assert(head_size % THREAD_GROUP_SIZE == 0); -+ -+ // NOTE: alibi_slopes is optional. -+ const float* alibi_slopes_ptr = alibi_slopes -+ ? reinterpret_cast(alibi_slopes.value().data_ptr()) -+ : nullptr; -+ -+ sycl_t* out_ptr = reinterpret_cast(out.data_ptr()); -+ sycl_t* query_ptr = reinterpret_cast(query.data_ptr()); -+ sycl_t* key_cache_ptr = reinterpret_cast(key_cache.data_ptr()); -+ sycl_t* value_cache_ptr = reinterpret_cast(value_cache.data_ptr()); -+ int* block_tables_ptr = block_tables.data_ptr(); -+ int* context_lens_ptr = context_lens.data_ptr(); -+ -+ constexpr int NUM_WARPS = NUM_THREADS / WARP_SIZE; -+ int padded_max_context_len = -+ DIVIDE_ROUND_UP(max_context_len, BLOCK_SIZE) * BLOCK_SIZE; -+ -+ int logits_size = padded_max_context_len * sizeof(float); -+ int outputs_size = (NUM_WARPS / 2) * head_size * sizeof(float); -+ // Python-side check in vllm.worker.worker._check_if_can_support_max_seq_len -+ // Keep that in sync with the logic here! -+ int shared_mem_size = std::max(logits_size, outputs_size); -+ -+ sycl::range<3> grid(1, num_seqs, num_heads); -+ sycl::range<3> block(1, 1, NUM_THREADS); -+ sycl::queue& queue = vllm::xpu::vllmGetQueue(); -+ sycl::event event; -+ -+ switch (head_size) { -+ // NOTE(woosuk): To reduce the compilation time, we only compile for the -+ // head sizes that we use in the model. However, we can easily extend this -+ // to support any head size which is a multiple of 16. 
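    // [Editor's annotation - not a patch hunk line; illustrative numbers only.]
    // Worked example of the local-memory sizing above: the kernels request a
    // sub-group size of 32, so with, say, NUM_THREADS = 128 we get
    // NUM_WARPS = 4. For max_context_len = 2048 and BLOCK_SIZE = 16,
    // padded_max_context_len = 2048 and logits_size = 2048 * 4 = 8192 bytes,
    // while outputs_size for head_size = 128 is (4 / 2) * 128 * 4 = 1024
    // bytes, so shared_mem_size = max(8192, 1024) = 8192 bytes.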
-+ case 64: -+ LAUNCH_PAGED_ATTENTION_V1(64); -+#if TORCH_VERSION_MAJOR >= 2 && TORCH_VERSION_MINOR >= 3 -+ // xpu::profiler_record(event_desc, event); // Uncomment when needed -+#else -+ ::xpu::profiler_record("paged attn v1", event); -+#endif -+ break; -+ case 80: -+ LAUNCH_PAGED_ATTENTION_V1(80); -+#if TORCH_VERSION_MAJOR >= 2 && TORCH_VERSION_MINOR >= 3 -+ // xpu::profiler_record(event_desc, event); // Uncomment when needed -+#else -+ ::xpu::profiler_record("paged attn v1", event); -+#endif -+ break; -+ case 96: -+ LAUNCH_PAGED_ATTENTION_V1(96); -+#if TORCH_VERSION_MAJOR >= 2 && TORCH_VERSION_MINOR >= 3 -+ // xpu::profiler_record(event_desc, event); // Uncomment when needed -+#else -+ ::xpu::profiler_record("paged attn v1", event); -+#endif -+ break; -+ case 112: -+ LAUNCH_PAGED_ATTENTION_V1(112); -+#if TORCH_VERSION_MAJOR >= 2 && TORCH_VERSION_MINOR >= 3 -+ // xpu::profiler_record(event_desc, event); // Uncomment when needed -+#else -+ ::xpu::profiler_record("paged attn v1", event); -+#endif -+ break; -+ case 128: -+ LAUNCH_PAGED_ATTENTION_V1(128); -+#if TORCH_VERSION_MAJOR >= 2 && TORCH_VERSION_MINOR >= 3 -+ // xpu::profiler_record(event_desc, event); // Uncomment when needed -+#else -+ ::xpu::profiler_record("paged attn v1", event); -+#endif -+ break; -+ case 256: -+ LAUNCH_PAGED_ATTENTION_V1(256); -+#if TORCH_VERSION_MAJOR >= 2 && TORCH_VERSION_MINOR >= 3 -+ // xpu::profiler_record(event_desc, event); // Uncomment when needed -+#else -+ ::xpu::profiler_record("paged attn v1", event); -+#endif -+ break; -+ default: -+ TORCH_CHECK(false, "Unsupported head size: ", head_size); -+ break; -+ } -+ // queue.wait(); -+} ++# execution env ++GIT_ROOT=$(git rev-parse --show-toplevel) ++EXP_ROOT="${GIT_ROOT}/tests/v1/kv_connector/nixl_integration" + -+#define CALL_KERNEL_LAUNCHER(T, BLOCK_SIZE) \ -+ vllm::paged_attention_xpu_v1_impl_launcher( \ -+ out, \ -+ query, \ -+ key_cache, \ -+ value_cache, \ -+ num_kv_heads, \ -+ scale, \ -+ block_tables, \ -+ context_lens, \ -+ max_context_len, \ -+ alibi_slopes, \ -+ attn_logit_softcapping); -+ -+#define CALL_KERNEL_LAUNCHER_BLOCK_SIZE(T) \ -+ switch (block_size) { \ -+ case 8: \ -+ CALL_KERNEL_LAUNCHER(T, 8); \ -+ break; \ -+ case 16: \ -+ CALL_KERNEL_LAUNCHER(T, 16); \ -+ break; \ -+ case 32: \ -+ CALL_KERNEL_LAUNCHER(T, 32); \ -+ break; \ -+ case 64: \ -+ CALL_KERNEL_LAUNCHER(T, 64); \ -+ break; \ -+ default: \ -+ TORCH_CHECK(false, "Unsupported block size: ", block_size); \ -+ break; \ -+ } -+ -+// Grid: (num_heads, num_seqs). -+template < -+ typename scalar_t, -+ int HEAD_SIZE, -+ int NUM_THREADS, -+ int PARTITION_SIZE> -+void paged_attention_v2_reduce_kernel( -+ scalar_t* __restrict__ out, // [num_seqs, num_heads, head_size] -+ const float* __restrict__ exp_sums, // [num_seqs, num_heads, -+ // max_num_partitions] -+ const float* __restrict__ max_logits, // [num_seqs, num_heads, -+ // max_num_partitions] -+ const scalar_t* __restrict__ tmp_out, // [num_seqs, num_heads, -+ // max_num_partitions, head_size] -+ const int* __restrict__ context_lens, // [num_seqs] -+ const int max_num_partitions, -+ const sycl::nd_item<3>& item_ct1, -+ uint8_t* dpct_local, -+ float* red_smem) { -+ const int num_heads = item_ct1.get_group_range(2); -+ const int head_idx = item_ct1.get_group(2); -+ const int seq_idx = item_ct1.get_group(1); -+ const int context_len = context_lens[seq_idx]; -+ const int num_partitions = DIVIDE_ROUND_UP(context_len, PARTITION_SIZE); -+ if (num_partitions == 1) { -+ // No need to reduce. Only copy tmp_out to out. 
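  // [Editor's annotation - not a patch hunk line.]
  // With a single partition the main v2 kernel has already normalized its
  // logits by the partition-local exp sum, so tmp_out holds the final
  // attention output for this (seq, head); a plain copy into out suffices and
  // the exp_sums / max_logits buffers are not consulted.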
-+ scalar_t* out_ptr = -+ out + seq_idx * num_heads * HEAD_SIZE + head_idx * HEAD_SIZE; -+ const scalar_t* tmp_out_ptr = tmp_out + -+ seq_idx * num_heads * max_num_partitions * HEAD_SIZE + -+ head_idx * max_num_partitions * HEAD_SIZE; -+ for (int i = item_ct1.get_local_id(2); i < HEAD_SIZE; -+ i += item_ct1.get_local_range(2)) { -+ out_ptr[i] = tmp_out_ptr[i]; -+ } -+ // Terminate the thread block. -+ return; -+ } -+ -+ constexpr int NUM_WARPS = NUM_THREADS / WARP_SIZE; -+ const int warp_idx = item_ct1.get_local_id(2) / WARP_SIZE; -+ const int lane = item_ct1.get_local_id(2) % WARP_SIZE; -+ -+ // Size: 2 * num_partitions. -+ auto shared_mem = (char*)dpct_local; -+ // Workspace for reduction. -+ -+ // Load max logits to shared memory. -+ float* shared_max_logits = reinterpret_cast(shared_mem); -+ const float* max_logits_ptr = max_logits + -+ seq_idx * num_heads * max_num_partitions + head_idx * max_num_partitions; -+ float max_logit = -FLT_MAX; -+ for (int i = item_ct1.get_local_id(2); i < num_partitions; -+ i += item_ct1.get_local_range(2)) { -+ const float l = max_logits_ptr[i]; -+ shared_max_logits[i] = l; -+ max_logit = sycl::fmax(max_logit, (float)l); -+ } -+ -+ item_ct1.barrier(sycl::access::fence_space::local_space); ++OUTPUT_FILE=${OUTPUT_FILE:-"${EXP_ROOT}/.xpu_accuracy_test_outputs.txt"} + -+ // Get the global max logit. -+ // Reduce within the warp. -+#pragma unroll -+ for (int mask = WARP_SIZE / 2; mask >= 1; mask /= 2) { -+ -+ /* -+ DPCT1096:45: The right-most dimension of the work-group used in the SYCL -+ kernel that calls this function may be less than "32". The function -+ "dpct::permute_sub_group_by_xor" may return an unexpected result on the CPU -+ device. Modify the size of the work-group to ensure that the value of the -+ right-most dimension is a multiple of "32". -+ */ -+ max_logit = sycl::fmax( -+ max_logit, -+ dpct::permute_sub_group_by_xor( -+ item_ct1.get_sub_group(), max_logit, mask)); -+ } -+ if (lane == 0) { -+ red_smem[warp_idx] = max_logit; -+ } -+ -+ item_ct1.barrier(sycl::access::fence_space::local_space); -+ // Reduce across warps. -+ max_logit = lane < NUM_WARPS ? red_smem[lane] : -FLT_MAX; -+#pragma unroll -+ for (int mask = NUM_WARPS / 2; mask >= 1; mask /= 2) { -+ -+ /* -+ DPCT1096:46: The right-most dimension of the work-group used in the SYCL -+ kernel that calls this function may be less than "32". The function -+ "dpct::permute_sub_group_by_xor" may return an unexpected result on the CPU -+ device. Modify the size of the work-group to ensure that the value of the -+ right-most dimension is a multiple of "32". -+ */ -+ max_logit = sycl::fmax( -+ max_logit, -+ dpct::permute_sub_group_by_xor( -+ item_ct1.get_sub_group(), max_logit, mask)); -+ } -+ // Broadcast the max value to all threads. -+ -+ /* -+ DPCT1096:47: The right-most dimension of the work-group used in the SYCL -+ kernel that calls this function may be less than "32". The function -+ "dpct::select_from_sub_group" may return an unexpected result on the CPU -+ device. Modify the size of the work-group to ensure that the value of the -+ right-most dimension is a multiple of "32". -+ */ -+ max_logit = dpct::select_from_sub_group( -+ item_ct1.get_sub_group(), max_logit, 0); -+ -+ // Load rescaled exp sums to shared memory. 
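  // [Editor's annotation - not a patch hunk line.]
  // The rescaling below is the standard log-sum-exp merge across partitions:
  // each partition i stored (m_i, s_i) with s_i = sum_j exp(l_j - m_i); given
  // the global max M computed above, s_i is rescaled to s_i * exp(m_i - M) so
  // all partial sums share one reference point, and the final output becomes
  // sum_i tmp_out_i * s_i * exp(m_i - M) / S, where S is the rescaled total.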
-+ float* shared_exp_sums = -+ reinterpret_cast(shared_mem + sizeof(float) * num_partitions); -+ const float* exp_sums_ptr = exp_sums + -+ seq_idx * num_heads * max_num_partitions + head_idx * max_num_partitions; -+ float global_exp_sum = 0.0f; -+ for (int i = item_ct1.get_local_id(2); i < num_partitions; -+ i += item_ct1.get_local_range(2)) { -+ float l = shared_max_logits[i]; -+ float rescaled_exp_sum = exp_sums_ptr[i] * sycl::exp(l - max_logit); -+ global_exp_sum += rescaled_exp_sum; -+ shared_exp_sums[i] = rescaled_exp_sum; -+ } -+ -+ item_ct1.barrier(sycl::access::fence_space::local_space); -+ global_exp_sum = -+ block_sum(&red_smem[NUM_WARPS], global_exp_sum, item_ct1); -+ const float inv_global_exp_sum = 1.0f / (global_exp_sum + 1e-6f); -+ -+ // Aggregate tmp_out to out. -+ const scalar_t* tmp_out_ptr = tmp_out + -+ seq_idx * num_heads * max_num_partitions * HEAD_SIZE + -+ head_idx * max_num_partitions * HEAD_SIZE; -+ scalar_t* out_ptr = -+ out + seq_idx * num_heads * HEAD_SIZE + head_idx * HEAD_SIZE; -+#pragma unroll -+ for (int i = item_ct1.get_local_id(2); i < HEAD_SIZE; i += NUM_THREADS) { -+ float acc = 0.0f; -+ for (int j = 0; j < num_partitions; ++j) { -+ acc += to_float(tmp_out_ptr[j * HEAD_SIZE + i]) * shared_exp_sums[j] * -+ inv_global_exp_sum; -+ } -+ from_float(out_ptr[i], acc); -+ } -+} ++# Trap the SIGINT signal (triggered by Ctrl+C) ++trap 'kill $(jobs -pr)' SIGINT SIGTERM EXIT + -+// Grid: (num_heads, num_seqs, max_num_partitions). -+template < -+ typename scalar_t, -+ typename Q_Vec_t, -+ int HEAD_SIZE, -+ int BLOCK_SIZE, -+ int NUM_THREADS, -+ int VEC_SIZE, -+ int PARTITION_SIZE> -+void paged_attention_v2_kernel( -+ float* __restrict__ exp_sums, // [num_seqs, num_heads, max_num_partitions] -+ float* __restrict__ max_logits, // [num_seqs, num_heads, max_num_partitions] -+ scalar_t* __restrict__ tmp_out, // [num_seqs, num_heads, max_num_partitions, -+ // head_size] -+ const scalar_t* __restrict__ q, // [num_seqs, num_heads, head_size] -+ const scalar_t* __restrict__ k_cache, // [num_blocks, num_kv_heads, -+ // head_size/x, block_size, x] -+ const scalar_t* __restrict__ v_cache, // [num_blocks, num_kv_heads, -+ // head_size, block_size] -+ const int num_kv_heads, // [num_heads] -+ const float scale, -+ const int* __restrict__ block_tables, // [num_seqs, max_num_blocks_per_seq] -+ const int* __restrict__ context_lens, // [num_seqs] -+ const int max_num_blocks_per_seq, -+ const float* __restrict__ alibi_slopes, // [num_heads] -+ const int q_stride, -+ const int kv_block_stride, -+ const int kv_head_stride, -+ const float attn_logit_softcapping, -+ const sycl::nd_item<3>& item_ct1, -+ uint8_t* dpct_local, -+ Q_Vec_t* q_vecs, -+ float* red_smem) { -+ paged_attention_kernel< -+ scalar_t, -+ Q_Vec_t, -+ HEAD_SIZE, -+ BLOCK_SIZE, -+ NUM_THREADS, -+ VEC_SIZE, -+ PARTITION_SIZE>( -+ exp_sums, -+ max_logits, -+ tmp_out, -+ q, -+ k_cache, -+ v_cache, -+ num_kv_heads, -+ scale, -+ block_tables, -+ context_lens, -+ max_num_blocks_per_seq, -+ alibi_slopes, -+ q_stride, -+ kv_block_stride, -+ kv_head_stride, -+ attn_logit_softcapping, -+ item_ct1, -+ dpct_local, -+ q_vecs, -+ red_smem); ++cleanup() { ++ echo "Cleaning up any running vLLM instances..." 
++ pkill -f "vllm serve" || true ++ sleep 2 +} + -+#define LAUNCH_PAGED_ATTENTION_V2_FIRST_HALF(HEAD_SIZE) \ -+ event = queue.submit([&](sycl::handler& cgh) { \ -+ sycl::local_accessor dpct_local_acc_ct1( \ -+ sycl::range<1>(shared_mem_size), cgh); \ -+ sycl::local_accessor q_vecs_acc_ct1( \ -+ sycl::range<1>(THREAD_GROUP_SIZE * num_vecs_per_thread), cgh); \ -+ sycl::local_accessor red_smem_acc_ct1( \ -+ sycl::range<1>(2 * NUM_WARPS), cgh); \ -+ \ -+ auto exp_sums_ptr_ct0 = exp_sums_ptr; \ -+ auto max_logits_ptr_ct1 = max_logits_ptr; \ -+ auto tmp_out_ptr_ct2 = tmp_out_ptr; \ -+ auto query_ptr_ct3 = query_ptr; \ -+ auto key_cache_ptr_ct4 = key_cache_ptr; \ -+ auto value_cache_ptr_ct5 = value_cache_ptr; \ -+ auto scale_ct7 = scale; \ -+ auto block_tables_ptr_ct8 = block_tables_ptr; \ -+ auto context_lens_ptr_ct9 = context_lens_ptr; \ -+ auto max_num_blocks_per_seq_ct10 = max_num_blocks_per_seq; \ -+ auto alibi_slopes_ptr_ct11 = alibi_slopes_ptr; \ -+ auto q_stride_ct12 = q_stride; \ -+ auto kv_block_stride_ct13 = kv_block_stride; \ -+ auto kv_head_stride_ct14 = kv_head_stride; \ -+ auto attn_logit_softcapping_ct15 = attn_logit_softcapping; \ -+ \ -+ cgh.parallel_for( \ -+ sycl::nd_range<3>(grid * block, block), \ -+ [=](sycl::nd_item<3> item_ct1) [[intel::reqd_sub_group_size(32)]] { \ -+ vllm::paged_attention_v2_kernel< \ -+ sycl_t, \ -+ Q_Vec, \ -+ HEAD_SIZE, \ -+ BLOCK_SIZE, \ -+ NUM_THREADS, \ -+ VEC_SIZE, \ -+ PARTITION_SIZE>( \ -+ exp_sums_ptr_ct0, \ -+ max_logits_ptr_ct1, \ -+ tmp_out_ptr_ct2, \ -+ query_ptr_ct3, \ -+ key_cache_ptr_ct4, \ -+ value_cache_ptr_ct5, \ -+ num_kv_heads, \ -+ scale_ct7, \ -+ block_tables_ptr_ct8, \ -+ context_lens_ptr_ct9, \ -+ max_num_blocks_per_seq_ct10, \ -+ alibi_slopes_ptr_ct11, \ -+ q_stride_ct12, \ -+ kv_block_stride_ct13, \ -+ kv_head_stride_ct14, \ -+ attn_logit_softcapping_ct15, \ -+ item_ct1, \ -+ dpct_local_acc_ct1.get_pointer(), \ -+ q_vecs_acc_ct1.get_pointer(), \ -+ red_smem_acc_ct1.get_pointer()); \ -+ }); \ -+ }); -+ -+#define LAUNCH_PAGED_ATTENTION_V2_SECOND_HALF(HEAD_SIZE) \ -+ event2 = queue.submit([&](sycl::handler& cgh) { \ -+ sycl::local_accessor dpct_local_acc_ct1( \ -+ sycl::range<1>(reduce_shared_mem_size), cgh); \ -+ sycl::local_accessor red_smem_acc_ct1( \ -+ sycl::range<1>(2 * NUM_WARPS), cgh); \ -+ \ -+ auto out_ptr_ct0 = out_ptr; \ -+ auto exp_sums_ptr_ct1 = exp_sums_ptr; \ -+ auto max_logits_ptr_ct2 = max_logits_ptr; \ -+ auto tmp_out_ptr_ct3 = tmp_out_ptr; \ -+ auto context_lens_ptr_ct4 = context_lens_ptr; \ -+ auto max_num_partitions_ct5 = max_num_partitions; \ -+ \ -+ cgh.parallel_for( \ -+ sycl::nd_range<3>(reduce_grid * block, block), \ -+ [=](sycl::nd_item<3> item_ct1) [[intel::reqd_sub_group_size(32)]] { \ -+ vllm::paged_attention_v2_reduce_kernel< \ -+ sycl_t, \ -+ HEAD_SIZE, \ -+ NUM_THREADS, \ -+ PARTITION_SIZE>( \ -+ out_ptr_ct0, \ -+ exp_sums_ptr_ct1, \ -+ max_logits_ptr_ct2, \ -+ tmp_out_ptr_ct3, \ -+ context_lens_ptr_ct4, \ -+ max_num_partitions_ct5, \ -+ item_ct1, \ -+ dpct_local_acc_ct1.get_pointer(), \ -+ red_smem_acc_ct1.get_pointer()); \ -+ }); \ -+ }); -+ -+template < -+ typename T, -+ int BLOCK_SIZE, -+ int NUM_THREADS = 512, -+ int PARTITION_SIZE = 512> -+void paged_attention_v2_launcher( -+ torch::Tensor& out, -+ torch::Tensor& exp_sums, -+ torch::Tensor& max_logits, -+ torch::Tensor& tmp_out, -+ torch::Tensor& query, -+ torch::Tensor& key_cache, -+ torch::Tensor& value_cache, -+ int num_kv_heads, -+ float scale, -+ torch::Tensor& block_tables, -+ torch::Tensor& context_lens, -+ int max_context_len, -+ 
const c10::optional& alibi_slopes, -+ const float attn_logit_softcapping) { -+ int num_seqs = query.size(0); -+ int num_heads = query.size(1); -+ int head_size = query.size(2); -+ int max_num_blocks_per_seq = block_tables.size(1); -+ int q_stride = query.stride(0); -+ int kv_block_stride = key_cache.stride(0); -+ int kv_head_stride = key_cache.stride(1); -+ -+ constexpr int THREAD_GROUP_SIZE = MAX(WARP_SIZE / BLOCK_SIZE, 1); -+ assert(head_size % THREAD_GROUP_SIZE == 0); -+ constexpr int VEC_SIZE = MAX(16 / (THREAD_GROUP_SIZE * sizeof(T)), 1); -+ using sycl_t = vllm::xpu::SyclTypeTrait::Type; -+ using Q_Vec = typename Vec::Type; -+ -+ int num_vecs_per_thread = head_size / THREAD_GROUP_SIZE / VEC_SIZE; -+ assert(head_size % THREAD_GROUP_SIZE == 0); -+ -+ // NOTE: alibi_slopes is optional. -+ const float* alibi_slopes_ptr = alibi_slopes -+ ? reinterpret_cast(alibi_slopes.value().data_ptr()) -+ : nullptr; -+ -+ sycl_t* out_ptr = reinterpret_cast(out.data_ptr()); -+ float* exp_sums_ptr = reinterpret_cast(exp_sums.data_ptr()); -+ float* max_logits_ptr = reinterpret_cast(max_logits.data_ptr()); -+ sycl_t* tmp_out_ptr = reinterpret_cast(tmp_out.data_ptr()); -+ sycl_t* query_ptr = reinterpret_cast(query.data_ptr()); -+ sycl_t* key_cache_ptr = reinterpret_cast(key_cache.data_ptr()); -+ sycl_t* value_cache_ptr = reinterpret_cast(value_cache.data_ptr()); -+ int* block_tables_ptr = block_tables.data_ptr(); -+ int* context_lens_ptr = context_lens.data_ptr(); -+ -+ constexpr int NUM_WARPS = NUM_THREADS / WARP_SIZE; -+ int max_num_partitions = DIVIDE_ROUND_UP(max_context_len, PARTITION_SIZE); -+ -+ int logits_size = PARTITION_SIZE * sizeof(float); -+ int outputs_size = (NUM_WARPS / 2) * head_size * sizeof(float); -+ -+ // For paged attention v2 kernel. -+ sycl::range<3> grid(max_num_partitions, num_seqs, num_heads); -+ int shared_mem_size = std::max(logits_size, outputs_size); -+ // For paged attention v2 reduce kernel. -+ sycl::range<3> reduce_grid(1, num_seqs, num_heads); -+ -+ int reduce_shared_mem_size = 2 * max_num_partitions * sizeof(float); -+ -+ sycl::range<3> block(1, 1, NUM_THREADS); -+ sycl::queue& queue = vllm::xpu::vllmGetQueue(); -+ sycl::event event; -+ sycl::event event2; -+ switch (head_size) { -+ // NOTE(woosuk): To reduce the compilation time, we only compile for the -+ // head sizes that we use in the model. However, we can easily extend this -+ // to support any head size which is a multiple of 16. 
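    // [Editor's annotation - not a patch hunk line; illustrative numbers only.]
    // Sizing sketch for the two launches configured above: with
    // PARTITION_SIZE = 512 (the template default), max_context_len = 4096
    // gives max_num_partitions = 8, so the main kernel runs on an
    // (8, num_seqs, num_heads) grid and the reduce kernel needs
    // reduce_shared_mem_size = 2 * 8 * sizeof(float) = 64 bytes of local
    // memory (one max-logit / exp-sum pair per partition).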
-+ case 64: -+ LAUNCH_PAGED_ATTENTION_V2_FIRST_HALF(64); -+#if TORCH_VERSION_MAJOR >= 2 && TORCH_VERSION_MINOR >= 3 -+ // xpu::profiler_record(event_desc, event); // Uncomment when needed -+#else -+ ::xpu::profiler_record("paged attn v2", event); -+#endif -+ LAUNCH_PAGED_ATTENTION_V2_SECOND_HALF(64); -+#if TORCH_VERSION_MAJOR >= 2 && TORCH_VERSION_MINOR >= 3 -+ // xpu::profiler_record(event_desc, event); // Uncomment when needed -+#else -+ ::xpu::profiler_record("paged attn v2", event2); -+#endif -+ break; -+ case 80: -+ LAUNCH_PAGED_ATTENTION_V2_FIRST_HALF(80); -+#if TORCH_VERSION_MAJOR >= 2 && TORCH_VERSION_MINOR >= 3 -+ // xpu::profiler_record(event_desc, event); // Uncomment when needed -+#else -+ ::xpu::profiler_record("paged attn v2", event); -+#endif -+ LAUNCH_PAGED_ATTENTION_V2_SECOND_HALF(80); -+#if TORCH_VERSION_MAJOR >= 2 && TORCH_VERSION_MINOR >= 3 -+ // xpu::profiler_record(event_desc, event); // Uncomment when needed -+#else -+ ::xpu::profiler_record("paged attn v2", event2); -+#endif -+ break; -+ case 96: -+ LAUNCH_PAGED_ATTENTION_V2_FIRST_HALF(96); -+#if TORCH_VERSION_MAJOR >= 2 && TORCH_VERSION_MINOR >= 3 -+ // xpu::profiler_record(event_desc, event); // Uncomment when needed -+#else -+ ::xpu::profiler_record("paged attn v2", event); -+#endif -+ LAUNCH_PAGED_ATTENTION_V2_SECOND_HALF(96); -+#if TORCH_VERSION_MAJOR >= 2 && TORCH_VERSION_MINOR >= 3 -+ // xpu::profiler_record(event_desc, event); // Uncomment when needed -+#else -+ ::xpu::profiler_record("paged attn v2", event2); -+#endif -+ break; -+ case 112: -+ LAUNCH_PAGED_ATTENTION_V2_FIRST_HALF(112); -+#if TORCH_VERSION_MAJOR >= 2 && TORCH_VERSION_MINOR >= 3 -+ // xpu::profiler_record(event_desc, event); // Uncomment when needed -+#else -+ ::xpu::profiler_record("paged attn v2", event); -+#endif -+ LAUNCH_PAGED_ATTENTION_V2_SECOND_HALF(112); -+#if TORCH_VERSION_MAJOR >= 2 && TORCH_VERSION_MINOR >= 3 -+ // xpu::profiler_record(event_desc, event); // Uncomment when needed -+#else -+ ::xpu::profiler_record("paged attn v2", event2); -+#endif -+ break; -+ case 128: -+ LAUNCH_PAGED_ATTENTION_V2_FIRST_HALF(128); -+#if TORCH_VERSION_MAJOR >= 2 && TORCH_VERSION_MINOR >= 3 -+ // xpu::profiler_record(event_desc, event); // Uncomment when needed -+#else -+ ::xpu::profiler_record("paged attn v2", event); -+#endif -+ LAUNCH_PAGED_ATTENTION_V2_SECOND_HALF(128); -+#if TORCH_VERSION_MAJOR >= 2 && TORCH_VERSION_MINOR >= 3 -+ // xpu::profiler_record(event_desc, event); // Uncomment when needed -+#else -+ ::xpu::profiler_record("paged attn v2", event2); -+#endif -+ break; -+ case 256: -+ LAUNCH_PAGED_ATTENTION_V2_FIRST_HALF(256); -+#if TORCH_VERSION_MAJOR >= 2 && TORCH_VERSION_MINOR >= 3 -+ // xpu::profiler_record(event_desc, event); // Uncomment when needed -+#else -+ ::xpu::profiler_record("paged attn v2", event); -+#endif -+ LAUNCH_PAGED_ATTENTION_V2_SECOND_HALF(256); -+#if TORCH_VERSION_MAJOR >= 2 && TORCH_VERSION_MINOR >= 3 -+ // xpu::profiler_record(event_desc, event); // Uncomment when needed -+#else -+ ::xpu::profiler_record("paged attn v2", event2); -+#endif -+ break; -+ default: -+ TORCH_CHECK(false, "Unsupported head size: ", head_size); -+ break; -+ } ++wait_for_server() { ++ local host=$1 ++ local port=$2 ++ timeout 1200 bash -c " ++ until curl -s ${host}:${port}/v1/completions > /dev/null; do ++ sleep 1 ++ done" && return 0 || return 1 +} + -+#define CALL_V2_LAUNCHER(T, BLOCK_SIZE) \ -+ vllm::paged_attention_v2_launcher( \ -+ out, \ -+ exp_sums, \ -+ max_logits, \ -+ tmp_out, \ -+ query, \ -+ key_cache, \ -+ value_cache, \ -+ 
num_kv_heads, \ -+ scale, \ -+ block_tables, \ -+ context_lens, \ -+ max_context_len, \ -+ alibi_slopes, \ -+ attn_logit_softcapping); -+ -+#define CALL_V2_LAUNCHER_BLOCK_SIZE(T) \ -+ switch (block_size) { \ -+ case 8: \ -+ CALL_V2_LAUNCHER(T, 8); \ -+ break; \ -+ case 16: \ -+ CALL_V2_LAUNCHER(T, 16); \ -+ break; \ -+ case 32: \ -+ CALL_V2_LAUNCHER(T, 32); \ -+ break; \ -+ case 64: \ -+ CALL_V2_LAUNCHER(T, 64); \ -+ break; \ -+ default: \ -+ TORCH_CHECK(false, "Unsupported block size: ", block_size); \ -+ break; \ -+ } -+ -+} // namespace vllm -+ -+void paged_attention_v1( -+ torch::Tensor& out, -+ torch::Tensor& query, -+ torch::Tensor& key_cache, -+ torch::Tensor& value_cache, -+ int num_kv_heads, -+ float scale, -+ torch::Tensor& block_tables, -+ torch::Tensor& context_lens, -+ int block_size, -+ int max_context_len, -+ const c10::optional& alibi_slopes, -+ const std::string& kv_cache_dtype, -+ const float kv_scale, -+ const float attn_logit_softcapping) { -+ VLLM_XPU_DISPATCH_FLOATING_TYPES_FLOAT_ONLY( -+ query.scalar_type(), "paged_attention_xpu_v1_impl", [&] { -+ CALL_KERNEL_LAUNCHER_BLOCK_SIZE(scalar_t); -+ }); ++launch_baseline() { ++ BASELINE_BASE_CMD=" ++ ONEAPI_DEVICE_SELECTOR=level_zero:0 \ ++ VLLM_USE_V1=1 \ ++ VLLM_WORKER_MULTIPROC_METHOD=spawn \ ++ VLLM_ENABLE_V1_MULTIPROCESSING=1 vllm serve $MODEL_NAME \ ++ --host ${BASELINE_HOST} \ ++ --port ${BASELINE_PORT} \ ++ --max-model-len ${MAX_MODEL_LEN}\ ++ --seed 42 \ ++ -tp 1 \ ++ --block-size ${BLOCK_SIZE} \ ++ --gpu-memory-utilization 0.8 \ ++ --disable-log-requests \ ++ --dtype float16 \ ++ --enforce-eager" ++ echo ${BASELINE_BASE_CMD} ++ bash -c "${BASELINE_BASE_CMD}" & ++ sleep 10 ++ wait_for_server ${BASELINE_HOST} ${BASELINE_PORT} +} + -+void paged_attention_v2( -+ torch::Tensor& out, -+ torch::Tensor& exp_sums, -+ torch::Tensor& max_logits, -+ torch::Tensor& tmp_out, -+ torch::Tensor& query, -+ torch::Tensor& key_cache, -+ torch::Tensor& value_cache, -+ int num_kv_heads, -+ float scale, -+ torch::Tensor& block_tables, -+ torch::Tensor& context_lens, -+ int block_size, -+ int max_context_len, -+ const c10::optional& alibi_slopes, -+ const std::string& kv_cache_dtype, -+ const float kv_scale, -+ const float attn_logit_softcapping) { -+ VLLM_XPU_DISPATCH_FLOATING_TYPES_FLOAT_ONLY( -+ query.scalar_type(), "paged_attention_xpu_v2_impl", [&] { -+ CALL_V2_LAUNCHER_BLOCK_SIZE(scalar_t); -+ }); -+} ++launch_pd() { ++ PREFILL_BASE_CMD=" ++ ONEAPI_DEVICE_SELECTOR=level_zero:0 \ ++ VLLM_MULTIPROC_EXECUTE_MODEL_TIMEOUT_S=200 \ ++ VLLM_USE_V1=1 \ ++ VLLM_NIXL_SIDE_CHANNEL_HOST=${PREFILL_HOST} \ ++ VLLM_NIXL_SIDE_CHANNEL_PORT=${PREFILL_NIXL_SIDE_PORT} \ ++ VLLM_WORKER_MULTIPROC_METHOD=spawn \ ++ VLLM_ENABLE_V1_MULTIPROCESSING=1 vllm serve $MODEL_NAME \ ++ --host ${PREFILL_HOST} \ ++ --port ${PREFILL_PORT} \ ++ --max-model-len ${MAX_MODEL_LEN}\ ++ --seed 42 \ ++ --block-size ${BLOCK_SIZE} \ ++ --enforce-eager \ ++ --dtype float16 \ ++ -tp 1 \ ++ --gpu-memory-utilization 0.8 \ ++ --disable-log-requests \ ++ --kv-transfer-config '{\"kv_connector\":\"NixlConnector\",\"kv_role\":\"kv_both\",\"kv_buffer_device\":\"cpu\"}'" + -+torch::Tensor context_attention_forward_v2( -+ torch::Tensor query, // [num_tokens, num_kv_head, head_dim] -+ torch::Tensor key, // [num_tokens, num_kv_heads * head_size] -+ torch::Tensor value, // [num_tokens, num_kv_heads * head_size] -+ torch::Tensor block_tables, torch::Tensor query_start_loc, -+ torch::Tensor seq_lens, torch::Tensor context_lens, int max_input_length, -+ int max_context_length, int max_q_length) 
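// [Editor's annotation - not a patch hunk line.]
// Preconditions enforced by the asserts in the body below: fp16 inputs with
// matching dtypes, a 5-D key cache laid out as
// [num_blocks, num_kv_heads, head_size/x, block_size, x], a value cache of
// [num_blocks, num_kv_heads, head_size, block_size], and block_size == 16.
// Note the assert pins head_dim to 128 even though the switch also lists
// 64, 80 and 96.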
{ -+ // Currently, only support fp16 here -+ int64_t num_tokens = query.size(0); -+ int64_t num_heads = query.size(1); -+ int64_t head_dim = query.size(2); -+ int64_t batch_size = seq_lens.size(0); -+ int num_kv_heads = value.size(1); -+ -+ int key_dimension = key.dim(); -+ auto output = at::empty({query.size(0), query.size(1), query.size(2)}, -+ at::device(query.device()).dtype(query.dtype())); -+ -+ assert(key_dimension == 5); -+ assert(query.scalar_type() == key.scalar_type() && -+ query.scalar_type() == value.scalar_type()); -+ assert(head_dim == 128); -+ assert(query.scalar_type() == at::ScalarType::Half); -+ -+ int query_stride_token = query.stride(0); -+ int query_stride_head = query.stride(1); -+ int query_stride_dim = query.stride(2); -+ const float attn_scale = 1 / std::sqrt((float)head_dim); -+ -+ assert(num_heads % num_kv_heads == 0); -+ int num_queries_per_kv = num_heads / num_kv_heads; -+ -+ -+ // key: num_blocks, num_kv_heads, head_size // x, num_blocks, x) -+ // value: [num_blocks, num_kv_heads, head_size, block_dim] -+ int block_size = value.size(3); -+ // Currently, only block_size 16 is supported... -+ assert(block_size == 16); -+ int x = key.size(4); -+ int block_table_stride_bsz = block_tables.stride(0); -+ int block_table_stride_seq = block_tables.stride(1); -+ int k_cache_stride_token = key.stride(0); -+ int k_cache_stride_head = key.stride(1); -+ int k_cache_stride_head_dim = key.stride(2); -+ int k_cache_stride_block = key.stride(3); -+ int k_cache_stride_x = key.stride(4); -+ -+ int v_cache_stride_token = value.stride(0); -+ int v_cache_stride_head = value.stride(1); -+ int v_cache_stride_head_dim = value.stride(2); -+ int v_cache_stride_block = value.stride(3); -+ switch(head_dim) { -+ case 128: -+ vllm::context_attention_kernel_v2( -+ query.data_ptr(), key.data_ptr(), value.data_ptr(), -+ block_tables.data_ptr(), attn_scale, query_start_loc.data_ptr(), -+ seq_lens.data_ptr(), context_lens.data_ptr(), block_size, x, -+ output.data_ptr(), block_table_stride_bsz, block_table_stride_seq, -+ query_stride_token, query_stride_head, query_stride_dim, -+ k_cache_stride_token, k_cache_stride_head, k_cache_stride_head_dim, -+ k_cache_stride_block, k_cache_stride_x, v_cache_stride_token, -+ v_cache_stride_head, v_cache_stride_head_dim, v_cache_stride_block, -+ output.stride(0), output.stride(1), num_queries_per_kv, -+ max_input_length, batch_size, num_heads, query.size(0), -+ max_context_length, max_q_length); -+ break; -+ case 64: -+ vllm::context_attention_kernel_v2( -+ query.data_ptr(), key.data_ptr(), value.data_ptr(), -+ block_tables.data_ptr(), attn_scale, query_start_loc.data_ptr(), -+ seq_lens.data_ptr(), context_lens.data_ptr(), block_size, x, -+ output.data_ptr(), block_table_stride_bsz, block_table_stride_seq, -+ query_stride_token, query_stride_head, query_stride_dim, -+ k_cache_stride_token, k_cache_stride_head, k_cache_stride_head_dim, -+ k_cache_stride_block, k_cache_stride_x, v_cache_stride_token, -+ v_cache_stride_head, v_cache_stride_head_dim, v_cache_stride_block, -+ output.stride(0), output.stride(1), num_queries_per_kv, -+ max_input_length, batch_size, num_heads, query.size(0), -+ max_context_length, max_q_length); -+ break; -+ case 80: -+ vllm::context_attention_kernel_v2( -+ query.data_ptr(), key.data_ptr(), value.data_ptr(), -+ block_tables.data_ptr(), attn_scale, query_start_loc.data_ptr(), -+ seq_lens.data_ptr(), context_lens.data_ptr(), block_size, x, -+ output.data_ptr(), block_table_stride_bsz, block_table_stride_seq, -+ query_stride_token, 
query_stride_head, query_stride_dim, -+ k_cache_stride_token, k_cache_stride_head, k_cache_stride_head_dim, -+ k_cache_stride_block, k_cache_stride_x, v_cache_stride_token, -+ v_cache_stride_head, v_cache_stride_head_dim, v_cache_stride_block, -+ output.stride(0), output.stride(1), num_queries_per_kv, -+ max_input_length, batch_size, num_heads, query.size(0), -+ max_context_length, max_q_length); -+ break; -+ case 96: -+ vllm::context_attention_kernel_v2( -+ query.data_ptr(), key.data_ptr(), value.data_ptr(), -+ block_tables.data_ptr(), attn_scale, query_start_loc.data_ptr(), -+ seq_lens.data_ptr(), context_lens.data_ptr(), block_size, x, -+ output.data_ptr(), block_table_stride_bsz, block_table_stride_seq, -+ query_stride_token, query_stride_head, query_stride_dim, -+ k_cache_stride_token, k_cache_stride_head, k_cache_stride_head_dim, -+ k_cache_stride_block, k_cache_stride_x, v_cache_stride_token, -+ v_cache_stride_head, v_cache_stride_head_dim, v_cache_stride_block, -+ output.stride(0), output.stride(1), num_queries_per_kv, -+ max_input_length, batch_size, num_heads, query.size(0), -+ max_context_length, max_q_length); -+ break; -+ default: throw std::runtime_error("unsupported head_dim"); -+ } -+ return output; -+} + -+torch::Tensor context_attention_forward_v1( -+ torch::Tensor query, // [num_tokens, num_kv_head, head_dim] -+ torch::Tensor key, // [num_tokens, num_kv_heads * head_size] -+ torch::Tensor value, // [num_tokens, num_kv_heads * head_size] -+ torch::Tensor block_tables, torch::Tensor query_start_loc, -+ torch::Tensor seq_lens, torch::Tensor context_lens, int max_input_length, -+ int max_context_length) { -+ // Currently, only support fp16 -+ int64_t num_tokens = query.size(0); -+ int64_t num_heads = query.size(1); -+ int64_t head_dim = query.size(2); -+ int64_t batch_size = seq_lens.size(0); -+ int num_kv_heads = value.size(1); -+ -+ int key_dimension = key.dim(); -+ auto output = at::empty({query.size(0), query.size(1), query.size(2)}, -+ at::device(query.device()).dtype(query.dtype())); -+ -+ // key should be in shape: -+ // 1. [num_blocks, num_heads, block_size, head_dim] -+ // 2. 
[num_blocks, num_heads, head_dim / x, block_size, x] -+ assert(key_dimension == 4 or key_dimension == 5); -+ assert(query.scalar_type() == key.scalar_type() && -+ query.scalar_type() == value.scalar_type()); -+ assert(query.scalar_type() == at::ScalarType::Half); -+ -+ int query_stride_token = query.stride(0); -+ int query_stride_head = query.stride(1); -+ int query_stride_dim = query.stride(2); -+ const float attn_scale = 1 / std::sqrt((float)head_dim); -+ -+ assert(num_heads % num_kv_heads == 0); -+ int num_queries_per_kv = num_heads / num_kv_heads; -+ int block_table_stride_bsz = block_tables.stride(0); -+ int block_table_stride_seq = block_tables.stride(1); -+ if (key_dimension == 4) { -+ // key/value: num_blocks, num_kv_heads, num_blocks, head_dim) -+ int block_size = value.size(2); -+ int k_cache_stride_0 = key.stride(0); -+ int k_cache_stride_1 = key.stride(1); -+ int k_cache_stride_2 = key.stride(2); -+ int k_cache_stride_3 = key.stride(3); -+ -+ int v_cache_stride_0 = value.stride(0); -+ int v_cache_stride_1 = value.stride(1); -+ int v_cache_stride_2 = value.stride(2); -+ int v_cache_stride_3 = value.stride(3); -+ switch (head_dim) { -+ case 128: -+ vllm::context_attention_kernel_v1_reshaped( -+ query.data_ptr(), key.data_ptr(), value.data_ptr(), -+ block_tables.data_ptr(), attn_scale, query_start_loc.data_ptr(), -+ seq_lens.data_ptr(), context_lens.data_ptr(), block_size, -+ output.data_ptr(), block_table_stride_bsz, block_table_stride_seq, -+ query_stride_token, query_stride_head, query_stride_dim, -+ k_cache_stride_0, k_cache_stride_1, k_cache_stride_2, -+ k_cache_stride_3, v_cache_stride_0, v_cache_stride_1, -+ v_cache_stride_2, v_cache_stride_3, output.stride(0), -+ output.stride(1), num_queries_per_kv, max_input_length, batch_size, -+ num_heads); -+ break; -+ case 64: -+ vllm::context_attention_kernel_v1_reshaped( -+ query.data_ptr(), key.data_ptr(), value.data_ptr(), -+ block_tables.data_ptr(), attn_scale, query_start_loc.data_ptr(), -+ seq_lens.data_ptr(), context_lens.data_ptr(), block_size, -+ output.data_ptr(), block_table_stride_bsz, block_table_stride_seq, -+ query_stride_token, query_stride_head, query_stride_dim, -+ k_cache_stride_0, k_cache_stride_1, k_cache_stride_2, -+ k_cache_stride_3, v_cache_stride_0, v_cache_stride_1, -+ v_cache_stride_2, v_cache_stride_3, output.stride(0), -+ output.stride(1), num_queries_per_kv, max_input_length, batch_size, -+ num_heads); -+ break; -+ default: -+ throw std::runtime_error("unsupported head_dim"); -+ } -+ } else { -+ int x = key.size(4); -+ int block_size = value.size(3); -+ int k_cache_stride_token = key.stride(0); -+ int k_cache_stride_head = key.stride(1); -+ int k_cache_stride_head_dim = key.stride(2); -+ int k_cache_stride_block = key.stride(3); -+ int k_cache_stride_x = key.stride(4); -+ -+ int v_cache_stride_token = value.stride(0); -+ int v_cache_stride_head = value.stride(1); -+ int v_cache_stride_head_dim = value.stride(2); -+ int v_cache_stride_block = value.stride(3); -+ switch (head_dim) { -+ case 128: -+ vllm::context_attention_kernel_v1( -+ query.data_ptr(), key.data_ptr(), value.data_ptr(), -+ block_tables.data_ptr(), attn_scale, query_start_loc.data_ptr(), -+ seq_lens.data_ptr(), context_lens.data_ptr(), block_size, x, -+ output.data_ptr(), block_table_stride_bsz, block_table_stride_seq, -+ query_stride_token, query_stride_head, query_stride_dim, -+ k_cache_stride_token, k_cache_stride_head, k_cache_stride_head_dim, -+ k_cache_stride_block, k_cache_stride_x, v_cache_stride_token, -+ v_cache_stride_head, 
v_cache_stride_head_dim, v_cache_stride_block, -+ output.stride(0), output.stride(1), num_queries_per_kv, -+ max_input_length, batch_size, num_heads); -+ break; -+ case 64: -+ vllm::context_attention_kernel_v1( -+ query.data_ptr(), key.data_ptr(), value.data_ptr(), -+ block_tables.data_ptr(), attn_scale, query_start_loc.data_ptr(), -+ seq_lens.data_ptr(), context_lens.data_ptr(), block_size, x, -+ output.data_ptr(), block_table_stride_bsz, block_table_stride_seq, -+ query_stride_token, query_stride_head, query_stride_dim, -+ k_cache_stride_token, k_cache_stride_head, k_cache_stride_head_dim, -+ k_cache_stride_block, k_cache_stride_x, v_cache_stride_token, -+ v_cache_stride_head, v_cache_stride_head_dim, v_cache_stride_block, -+ output.stride(0), output.stride(1), num_queries_per_kv, -+ max_input_length, batch_size, num_heads); -+ break; -+ default: -+ throw std::runtime_error("unsupported head_dim"); -+ } -+ } -+ return output; -+} ++ DECODE_BASE_CMD=" ++ ONEAPI_DEVICE_SELECTOR=level_zero:1 \ ++ VLLM_MULTIPROC_EXECUTE_MODEL_TIMEOUT_S=200 \ ++ VLLM_USE_V1=1 \ ++ VLLM_WORKER_MULTIPROC_METHOD=spawn \ ++ VLLM_ENABLE_V1_MULTIPROCESSING=1 vllm serve $MODEL_NAME \ ++ --host ${DECODE_HOST} \ ++ --port ${DECODE_PORT} \ ++ --max-model-len ${MAX_MODEL_LEN}\ ++ --seed 42 \ ++ --block-size ${BLOCK_SIZE} \ ++ --enforce-eager \ ++ -tp 1 \ ++ --dtype float16 \ ++ --gpu-memory-utilization 0.8 \ ++ --disable-log-requests \ ++ --kv-transfer-config '{\"kv_connector\":\"NixlConnector\",\"kv_role\":\"kv_both\",\"kv_buffer_device\":\"cpu\"}'" + -+template -+void gqa_1_kernel( -+ const void * query, // [num_seqs, num_heads, head_size] -+ const void * key, // [num_blocks, num_kv_heads, head_size, block_size] -+ const void * value, // [num_blocks, num_kv_heads, head_size, block_size] -+ const void* block_tables, // [num_seqs, max_num_blocks_per_seq] -+ const void* context_lens, // [num_seqs] -+ void * o_a_s, -+ void * o_accs, -+ const int64_t query_bsz_stride, -+ const int64_t query_head_stride, -+ const int64_t kv_token_stride, -+ const int64_t kv_head_stride, -+ const int64_t kv_block_stride, -+ const int64_t block_table_stride_batch, -+ const int64_t o_a_s_bsz_stride, -+ const int64_t o_a_s_head_stride, -+ const int64_t o_accs_bsz_stride, -+ const int64_t o_accs_head_stride, -+ const float scale, -+ const int block_size, -+ const int bsz, -+ const int num_heads, -+ const int num_kv_heads, -+ const int block_num, -+ const at::Device & device -+) { -+ const int group_size = num_heads / num_kv_heads; -+ const int sub_rows = VS / group_size; -+ const int rem_rows = VS % group_size; -+ -+ const float attn_scale = scale; -+ -+ sycl::range<3> global_size(bsz, num_heads, block_num); -+ sycl::range<3> local_size(1, group_size, 1); -+ -+ auto cgf = [&](sycl::handler& handle) { -+ handle.parallel_for( -+ sycl::nd_range<3>(global_size, local_size), -+ [=](sycl::nd_item<3> item) SYCL_ESIMD_KERNEL { -+ slm_init(); -+ -+ const int bsz_idx = item.get_global_id(0); -+ const int head_idx = item.get_global_id(1); -+ const int kv_head_idx = item.get_group(1); -+ const int tid = item.get_local_id(1); -+ const int vid = item.get_global_id(2); -+ -+ const IT * query_head = (const IT *)query + bsz_idx * query_bsz_stride -+ + head_idx * query_head_stride; -+ -+ IT * o_accs_head = (IT *)o_accs + bsz_idx * o_accs_bsz_stride -+ + head_idx * o_accs_head_stride; -+ float * o_a_s_head = (float *)o_a_s + bsz_idx * o_a_s_bsz_stride -+ + head_idx * o_a_s_head_stride; -+ -+ const int* block_tables_ptr = (const int*)block_tables; -+ const int* 
block_table = -+ block_tables_ptr + bsz_idx * block_table_stride_batch; -+ -+ const int* context_lens_ptr = (const int*)context_lens; -+ const int context_length = context_lens_ptr[bsz_idx]; -+ -+ simd query_row = block_load(query_head) * attn_scale; -+ -+ // copy k_cache to slm -+ int start_row = std::min(vid * VS + tid * sub_rows + std::min(tid, rem_rows), context_length); -+ int end_row = std::min(start_row + sub_rows + (tid < rem_rows), context_length); -+ for (int r = start_row; r < end_row; ++r) { -+ int which_block = r / block_size; -+ int which_slot = r % block_size; -+ int physical_block_number = block_table[which_block]; -+ -+ const IT * key_head = (const IT *)key + physical_block_number * kv_token_stride + -+ kv_head_idx * kv_head_stride + -+ which_slot * kv_block_stride; -+ -+ simd key_row = block_load(key_head); -+ slm_block_store((r - vid * VS) * HD * sizeof(IT), key_row); -+ } -+ barrier(); -+ -+ simd attns = -sycl::detail::max_v(); -+ int row_num = (vid + 1) * VS > context_length ? context_length % VS : VS; -+ // q @ k -+ for (int r = 0; r < row_num; ++r) { -+ simd key_row = slm_block_load(r * HD * sizeof(IT)); -+ float attn = sycl::ext::intel::esimd::detail::sum(query_row * key_row); -+ attns[r] = attn; -+ } -+ -+ float max_attn = hmax(attns); -+ const simd attn_exp = exp(attns - max_attn); -+ barrier(); -+ -+ // copy v_cache to slm -+ for (int r = start_row; r < end_row; ++r) { -+ int which_block = r / block_size; -+ int which_slot = r % block_size; -+ int physical_block_number = block_table[which_block]; -+ -+ const IT * value_head = (const IT *)value + physical_block_number * kv_token_stride + -+ kv_head_idx * kv_head_stride + -+ which_slot * kv_block_stride; -+ -+ simd value_row = block_load(value_head); -+ slm_block_store((r - vid * VS) * HD * sizeof(IT), value_row); -+ } -+ barrier(); -+ -+ // attn @ v -+ simd accs = 0; -+ for (int r = 0; r < row_num; ++r) { -+ simd value_row = slm_block_load(r * HD * sizeof(IT)); -+ accs = accs + value_row * attn_exp[r]; -+ } -+ -+ float softmax = sycl::ext::intel::esimd::detail::sum(attn_exp); -+ -+ block_store(o_accs_head + vid * HD, accs); -+ block_store(o_a_s_head + vid * 2, max_attn); -+ block_store(o_a_s_head + vid * 2 + 1, softmax); -+ } -+ ); -+ }; ++ echo ${PREFILL_BASE_CMD} ++ echo ${DECODE_BASE_CMD} ++ sleep 2 + -+ utils::submit_kernel(cgf, device, "gqa kernel 1/2"); ++ # execute on hosts ++ bash -c "${PREFILL_BASE_CMD}" & ++ bash -c "${DECODE_BASE_CMD}" & ++ sleep 1 ++ wait_for_server ${PREFILL_HOST} ${PREFILL_PORT} ++ sleep 1 ++ wait_for_server ${DECODE_HOST} ${DECODE_PORT} ++ sleep 1 +} + -+template -+void gqa_2_kernel( -+ void * o_a_s, -+ void * o_accs, -+ void * output, -+ const void* context_lens, // [num_seqs] -+ const int64_t o_a_s_bsz_stride, -+ const int64_t o_a_s_head_stride, -+ const int64_t o_accs_bsz_stride, -+ const int64_t o_accs_head_stride, -+ const int64_t output_bsz_stride, -+ const int64_t output_head_stride, -+ const int bsz, -+ const int num_heads, -+ const int row_block_num, -+ const at::Device & device -+) { -+ constexpr int SUB_HD = 8; -+ static_assert(HD % SUB_HD == 0); -+ static_assert(HD / SUB_HD <= GS); -+ -+ const int sub_rows = row_block_num / GS; -+ const int rem_rows = row_block_num % GS; -+ -+ constexpr int accs_slm_offset = 0; -+ constexpr int attn_slm_offset = GS * HD * sizeof(float); -+ constexpr int softmax_slm_offset = attn_slm_offset + GS * sizeof(float); -+ -+ sycl::range<3> global_size(bsz, num_heads, GS); -+ sycl::range<3> local_size(1, 1, GS); -+ -+ auto cgf = 
[&](sycl::handler& handle) { -+ handle.parallel_for( -+ sycl::nd_range<3>(global_size, local_size), -+ [=](sycl::nd_item<3> item) SYCL_ESIMD_KERNEL { -+ slm_init(); -+ -+ const int bsz_idx = item.get_global_id(0); -+ const int head_idx = item.get_global_id(1); -+ const int tid = item.get_global_id(2); -+ -+ const int* context_lens_ptr = (const int*)context_lens; -+ const int context_length = context_lens_ptr[bsz_idx]; -+ constexpr int VS = 32; -+ const int cur_row_block_num = (context_length + VS - 1) / VS; -+ const int cur_sub_rows = cur_row_block_num / GS; -+ const int cur_rem_rows = cur_row_block_num % GS; -+ -+ const float * o_a_s_head = (const float *)o_a_s + bsz_idx * o_a_s_bsz_stride -+ + head_idx * o_a_s_head_stride; -+ const IT * o_accs_head = (const IT *)o_accs + bsz_idx * o_accs_bsz_stride -+ + head_idx * o_accs_head_stride; -+ IT * output_head = (IT *)output + bsz_idx * output_bsz_stride -+ + head_idx * output_head_stride; -+ -+ int start_row = std::min(tid * cur_sub_rows + std::min(tid, cur_rem_rows), cur_row_block_num); -+ int end_row = std::min(start_row + cur_sub_rows + (tid < cur_rem_rows), cur_row_block_num); -+ -+ float max_attn = -sycl::detail::max_v(); -+ float softmax = 0; -+ simd accs = 0; -+ for (int r = start_row; r < end_row; ++r) { -+ float sub_attn = o_a_s_head[2 * r]; -+ float sub_softmax = o_a_s_head[2 * r + 1]; -+ simd sub_accs = block_load(o_accs_head + r * HD); -+ float new_max_attn = std::max(max_attn, sub_attn); -+ float exp1 = exp(max_attn - new_max_attn); -+ float exp2 = exp(sub_attn - new_max_attn); -+ accs = accs * exp1 + sub_accs * exp2; -+ softmax = softmax * exp1 + sub_softmax * exp2; -+ max_attn = new_max_attn; -+ } -+ -+ slm_block_store(accs_slm_offset + tid * HD * sizeof(float), accs); -+ slm_block_store(attn_slm_offset + tid * sizeof(float), max_attn); -+ slm_block_store(softmax_slm_offset + tid * sizeof(float), softmax); -+ barrier(); -+ -+ if (tid < HD / SUB_HD) { -+ simd max_attns = slm_block_load(attn_slm_offset); -+ const simd scales = exp(max_attns - hmax(max_attns)); -+ simd softmaxs = slm_block_load(softmax_slm_offset); -+ float softmax_sum = sycl::ext::intel::esimd::detail::sum(softmaxs * scales); -+ -+ simd result = 0; -+ #pragma unroll -+ for (int r = 0; r < GS; ++r) { -+ simd sub_accs = slm_block_load( -+ accs_slm_offset + (r * HD + tid * SUB_HD) * sizeof(float) -+ ); -+ result = result + sub_accs * scales[r]; -+ } -+ result = result / softmax_sum; -+ block_store(output_head + tid * SUB_HD, result); -+ } -+ } -+ ); -+ }; -+ -+ utils::submit_kernel(cgf, device, "gqa kernel 2/2"); ++launch_pd_proxy(){ ++ PROXY_BASE_CMD=" ++ python3 ${EXP_ROOT}/toy_proxy_server.py \ ++ --prefiller-host ${PREFILL_HOST} --prefiller-port ${PREFILL_PORT} \ ++ --decoder-host ${DECODE_HOST} --decoder-port ${DECODE_PORT} \ ++ --host=${PROXY_HOST} --port ${PROXY_PORT}" ++ echo ${PROXY_BASE_CMD} ++ bash -c "${PROXY_BASE_CMD}" & ++ sleep 2 +} + -+using AT = at::ScalarType; -+using fp16 = sycl::half; -+template -+auto dispatch_gqa_kernel(AT it) { -+ switch (it) { -+ case AT::Float: return std::make_tuple(gqa_1_kernel, gqa_2_kernel); -+ case AT::Half: return std::make_tuple(gqa_1_kernel, gqa_2_kernel); -+ default: throw std::runtime_error("unsupported dtype, only fp32 and fp16 are supported"); -+ } ++run_tests(){ ++ local service_url=$1 ++ local mode=$2 ++ python3 ${EXP_ROOT}/test_disagg_accuracy.py --service_url=${service_url} --model_name=${MODEL_NAME} --mode=${mode} --file_name=${OUTPUT_FILE} +} + -+void paged_attention_gqa( -+ torch::Tensor output, -+ 
torch::Tensor query, -+ torch::Tensor key_cache, -+ torch::Tensor value_cache, -+ int64_t bsz, -+ int64_t num_heads, -+ int64_t num_kv_heads, -+ float scale, -+ torch::Tensor& block_tables, -+ torch::Tensor& context_lens, -+ int block_size, -+ int64_t head_dim, -+ int max_seq_len -+) { -+ constexpr int VS = 32; -+ constexpr int GS = 32; -+ -+ const int row_block_num = (max_seq_len + VS - 1) / VS; -+ auto o_a_s = torch::empty({bsz, num_heads, 1, row_block_num * 2}, -+ torch::device(query.device()).dtype(torch::kFloat32)); -+ auto o_accs = torch::empty({bsz, num_heads, 1, row_block_num * head_dim}, -+ torch::device(query.device()).dtype(query.dtype())); -+ -+ auto [func1, func2] = [&](){ -+ switch (head_dim) { -+ case 128: return dispatch_gqa_kernel(query.scalar_type()); -+ case 96: return dispatch_gqa_kernel(query.scalar_type()); -+ case 80: return dispatch_gqa_kernel(query.scalar_type()); -+ case 64: return dispatch_gqa_kernel(query.scalar_type()); -+ default: throw std::runtime_error("unsupported head_dim, only 128, 96, 80 and 64 are supported"); -+ } -+ }(); -+ -+ func1( -+ query.data_ptr(), key_cache.data_ptr(), value_cache.data_ptr(), -+ block_tables.data_ptr(), context_lens.data_ptr(), o_a_s.data_ptr(), o_accs.data_ptr(), -+ query.stride(0), query.stride(1), key_cache.stride(0), key_cache.stride(1), key_cache.stride(2), block_tables.stride(0), -+ o_a_s.stride(0), o_a_s.stride(1), o_accs.stride(0), o_accs.stride(1), -+ scale, block_size, bsz, num_heads, num_kv_heads, row_block_num, -+ query.device() -+ ); -+ -+ func2( -+ o_a_s.data_ptr(), o_accs.data_ptr(), output.data_ptr(), context_lens.data_ptr(), -+ o_a_s.stride(0), o_a_s.stride(1), -+ o_accs.stride(0), o_accs.stride(1), -+ output.stride(0), output.stride(1), -+ bsz, num_heads, row_block_num, -+ query.device() -+ ); -+} -diff --git a/csrc/xpu/attention_xpu_fp8.cpp b/csrc/xpu/attention_xpu_fp8.cpp -new file mode 100644 -index 000000000..a2ea5819b ---- /dev/null -+++ b/csrc/xpu/attention_xpu_fp8.cpp -@@ -0,0 +1,324 @@ -+// clang-format off -+#ifdef VLLM_DEV -+#undef __SYCL_DEVICE_ONLY__ -+#endif -+#include -+#include -+#include -+#include "kv.h" -+ -+// clang-format on -+#include -+#include -+#include -+#include "utils.h" -+#include "xpu_types.h" -+// #include "dtype_bfloat16.dp.hpp" -+#include "dtype_float16.h" -+#include "dtype_float32.h" -+#if TORCH_VERSION_MAJOR >= 2 && TORCH_VERSION_MINOR >= 3 -+#include -+#endif -+ -+#include -+// #include -+ -+using namespace sycl::ext::intel::esimd; -+using AT = at::ScalarType; -+ -+template -+void gqa_1_kernel_fp8( -+ const void* query, // [num_seqs, num_heads, head_size] -+ const void* key, // [num_blocks, num_kv_heads, head_size, block_size] -+ const void* value, // [num_blocks, num_kv_heads, head_size, block_size] -+ const void* block_tables, // [num_seqs, max_num_blocks_per_seq] -+ const void* context_lens, // [num_seqs] -+ void* o_a_s, void* o_accs, const int64_t query_bsz_stride, -+ const int64_t query_head_stride, const int64_t kv_token_stride, -+ const int64_t kv_head_stride, const int64_t kv_block_stride, -+ const int64_t block_table_stride_batch, const int64_t o_a_s_bsz_stride, -+ const int64_t o_a_s_head_stride, const int64_t o_accs_bsz_stride, -+ const int64_t o_accs_head_stride, const float scale, const int block_size, -+ const int bsz, const int num_heads, const int num_kv_heads, -+ const int block_num, const at::Device& device) { -+ const int group_size = num_heads / num_kv_heads; -+ const int sub_rows = VS / group_size; -+ const int rem_rows = VS % group_size; -+ -+ const 
float attn_scale = scale; -+ -+ sycl::range<3> global_size(bsz, num_heads, block_num); -+ sycl::range<3> local_size(1, group_size, 1); -+ -+ auto cgf = [&](sycl::handler& handle) { -+ handle.parallel_for( -+ sycl::nd_range<3>(global_size, local_size), -+ [=](sycl::nd_item<3> item) SYCL_ESIMD_KERNEL { -+ slm_init(); -+ -+ const int bsz_idx = item.get_global_id(0); -+ const int head_idx = item.get_global_id(1); -+ const int kv_head_idx = item.get_group(1); -+ const int tid = item.get_local_id(1); -+ const int vid = item.get_global_id(2); -+ -+ const IT* query_head = (const IT*)query + bsz_idx * query_bsz_stride + -+ head_idx * query_head_stride; -+ -+ IT* o_accs_head = (IT*)o_accs + bsz_idx * o_accs_bsz_stride + -+ head_idx * o_accs_head_stride; -+ float* o_a_s_head = (float*)o_a_s + bsz_idx * o_a_s_bsz_stride + -+ head_idx * o_a_s_head_stride; -+ -+ const int* block_tables_ptr = (const int*)block_tables; -+ const int* block_table = -+ block_tables_ptr + bsz_idx * block_table_stride_batch; -+ -+ const int* context_lens_ptr = (const int*)context_lens; -+ const int context_length = context_lens_ptr[bsz_idx]; -+ -+ simd query_row = block_load(query_head) * attn_scale; -+ -+ // copy k_cache to slm -+ int start_row = -+ std::min(vid * VS + tid * sub_rows + std::min(tid, rem_rows), -+ context_length); -+ int end_row = -+ std::min(start_row + sub_rows + (tid < rem_rows), context_length); -+ for (int r = start_row; r < end_row; ++r) { -+ int which_block = r / block_size; -+ int which_slot = r % block_size; -+ int physical_block_number = block_table[which_block]; -+ -+ // Load elements in uint8_t -+ const uint8_t* key_head = -+ (const uint8_t*)key + physical_block_number * kv_token_stride + -+ kv_head_idx * kv_head_stride + which_slot * kv_block_stride; -+ -+ simd key_row = block_load(key_head); -+ simd key_dequantized = dequantize_key_row(key_row); -+ slm_block_store((r - vid * VS) * HD * sizeof(IT), key_dequantized); -+ } -+ barrier(); -+ -+ simd attns = -sycl::detail::max_v(); -+ int row_num = -+ (vid + 1) * VS > context_length ? 
context_length % VS : VS; -+ // q @ k -+ for (int r = 0; r < row_num; ++r) { -+ simd key_row = slm_block_load(r * HD * sizeof(IT)); -+ float attn = sycl::ext::intel::esimd::detail::sum( -+ query_row * key_row); -+ attns[r] = attn; -+ } -+ -+ float max_attn = hmax(attns); -+ const simd attn_exp = exp(attns - max_attn); -+ barrier(); -+ -+ // copy v_cache to slm -+ for (int r = start_row; r < end_row; ++r) { -+ int which_block = r / block_size; -+ int which_slot = r % block_size; -+ int physical_block_number = block_table[which_block]; -+ -+ const uint8_t* value_head = -+ (const uint8_t*)value + physical_block_number * kv_token_stride + -+ kv_head_idx * kv_head_stride + which_slot * kv_block_stride; -+ -+ simd value_row = block_load(value_head); -+ simd value_dequantized = dequantize_value_row(value_row); -+ slm_block_store((r - vid * VS) * HD * sizeof(IT), -+ value_dequantized); -+ } -+ barrier(); -+ -+ // attn @ v -+ simd accs = 0; -+ for (int r = 0; r < row_num; ++r) { -+ simd value_row = -+ slm_block_load(r * HD * sizeof(IT)); -+ accs = accs + value_row * attn_exp[r]; -+ } -+ -+ float softmax = -+ sycl::ext::intel::esimd::detail::sum(attn_exp); -+ -+ block_store(o_accs_head + vid * HD, accs); -+ block_store(o_a_s_head + vid * 2, max_attn); -+ block_store(o_a_s_head + vid * 2 + 1, softmax); -+ }); -+ }; -+ -+ utils::submit_kernel(cgf, device, "gqa kernel 1/2"); -+} + -+template -+void gqa_2_kernel_fp8(void* o_a_s, void* o_accs, void* output, -+ const void* context_lens, // [num_seqs] -+ const int64_t o_a_s_bsz_stride, -+ const int64_t o_a_s_head_stride, -+ const int64_t o_accs_bsz_stride, -+ const int64_t o_accs_head_stride, -+ const int64_t output_bsz_stride, -+ const int64_t output_head_stride, const int bsz, -+ const int num_heads, const int row_block_num, -+ const at::Device& device) { -+ constexpr int SUB_HD = 8; -+ static_assert(HD % SUB_HD == 0); -+ static_assert(HD / SUB_HD <= GS); -+ -+ const int sub_rows = row_block_num / GS; -+ const int rem_rows = row_block_num % GS; -+ -+ constexpr int accs_slm_offset = 0; -+ constexpr int attn_slm_offset = GS * HD * sizeof(float); -+ constexpr int softmax_slm_offset = attn_slm_offset + GS * sizeof(float); -+ -+ sycl::range<3> global_size(bsz, num_heads, GS); -+ sycl::range<3> local_size(1, 1, GS); -+ -+ auto cgf = [&](sycl::handler& handle) { -+ handle.parallel_for( -+ sycl::nd_range<3>(global_size, local_size), -+ [=](sycl::nd_item<3> item) SYCL_ESIMD_KERNEL { -+ slm_init(); -+ -+ const int bsz_idx = item.get_global_id(0); -+ const int head_idx = item.get_global_id(1); -+ const int tid = item.get_global_id(2); -+ -+ const int* context_lens_ptr = (const int*)context_lens; -+ const int context_length = context_lens_ptr[bsz_idx]; -+ constexpr int VS = 32; -+ const int cur_row_block_num = (context_length + VS - 1) / VS; -+ const int cur_sub_rows = cur_row_block_num / GS; -+ const int cur_rem_rows = cur_row_block_num % GS; -+ -+ const float* o_a_s_head = (const float*)o_a_s + -+ bsz_idx * o_a_s_bsz_stride + -+ head_idx * o_a_s_head_stride; -+ const IT* o_accs_head = (const IT*)o_accs + -+ bsz_idx * o_accs_bsz_stride + -+ head_idx * o_accs_head_stride; -+ IT* output_head = (IT*)output + bsz_idx * output_bsz_stride + -+ head_idx * output_head_stride; -+ -+ int start_row = -+ std::min(tid * cur_sub_rows + std::min(tid, cur_rem_rows), -+ cur_row_block_num); -+ int end_row = -+ std::min(start_row + cur_sub_rows + (tid < cur_rem_rows), -+ cur_row_block_num); -+ -+ float max_attn = -sycl::detail::max_v(); -+ float softmax = 0; -+ simd accs = 0; -+ for 
(int r = start_row; r < end_row; ++r) { -+ float sub_attn = o_a_s_head[2 * r]; -+ float sub_softmax = o_a_s_head[2 * r + 1]; -+ simd sub_accs = block_load(o_accs_head + r * HD); -+ float new_max_attn = std::max(max_attn, sub_attn); -+ float exp1 = exp(max_attn - new_max_attn); -+ float exp2 = exp(sub_attn - new_max_attn); -+ accs = accs * exp1 + sub_accs * exp2; -+ softmax = softmax * exp1 + sub_softmax * exp2; -+ max_attn = new_max_attn; -+ } -+ -+ slm_block_store(accs_slm_offset + tid * HD * sizeof(float), -+ accs); -+ slm_block_store(attn_slm_offset + tid * sizeof(float), -+ max_attn); -+ slm_block_store(softmax_slm_offset + tid * sizeof(float), -+ softmax); -+ barrier(); -+ -+ if (tid < HD / SUB_HD) { -+ simd max_attns = -+ slm_block_load(attn_slm_offset); -+ const simd scales = -+ exp(max_attns - hmax(max_attns)); -+ simd softmaxs = -+ slm_block_load(softmax_slm_offset); -+ float softmax_sum = -+ sycl::ext::intel::esimd::detail::sum( -+ softmaxs * scales); -+ -+ simd result = 0; -+#pragma unroll -+ for (int r = 0; r < GS; ++r) { -+ simd sub_accs = slm_block_load( -+ accs_slm_offset + (r * HD + tid * SUB_HD) * sizeof(float)); -+ result = result + sub_accs * scales[r]; -+ } -+ result = result / softmax_sum; -+ block_store(output_head + tid * SUB_HD, result); -+ } -+ }); -+ }; -+ -+ utils::submit_kernel(cgf, device, "gqa kernel 2/2"); -+} -+ -+template -+auto dispatch_gqa_kernel_fp8(AT it) { -+ switch (it) { -+ case AT::Float: -+ return std::make_tuple(gqa_1_kernel_fp8, -+ gqa_2_kernel_fp8); -+ case AT::Half: -+ return std::make_tuple(gqa_1_kernel_fp8, -+ gqa_2_kernel_fp8); -+ default: -+ throw std::runtime_error( -+ "unsupported dtype, only fp32 and fp16 are supported"); -+ } -+} -+ -+void paged_attention_gqa_fp8(torch::Tensor output, torch::Tensor query, -+ torch::Tensor key_cache, torch::Tensor value_cache, -+ int64_t bsz, int64_t num_heads, int64_t num_kv_heads, -+ float scale, torch::Tensor& block_tables, -+ torch::Tensor& context_lens, int block_size, -+ int64_t head_dim, int max_seq_len) { -+ constexpr int VS = 32; -+ constexpr int GS = 32; -+ -+ const int row_block_num = (max_seq_len + VS - 1) / VS; -+ auto o_a_s = -+ torch::empty({bsz, num_heads, 1, row_block_num * 2}, -+ torch::device(query.device()).dtype(torch::kFloat32)); -+ auto o_accs = -+ torch::empty({bsz, num_heads, 1, row_block_num * head_dim}, -+ torch::device(query.device()).dtype(query.dtype())); -+ -+ auto [func1, func2] = [&]() { -+ switch (head_dim) { -+ case 128: -+ return dispatch_gqa_kernel_fp8(query.scalar_type()); -+ case 96: -+ return dispatch_gqa_kernel_fp8(query.scalar_type()); -+ case 80: -+ return dispatch_gqa_kernel_fp8(query.scalar_type()); -+ case 64: -+ return dispatch_gqa_kernel_fp8(query.scalar_type()); -+ default: -+ throw std::runtime_error( -+ "unsupported head_dim, only 128, 96, 80 and 64 are supported"); -+ } -+ }(); -+ -+ func1(query.data_ptr(), key_cache.data_ptr(), value_cache.data_ptr(), -+ block_tables.data_ptr(), context_lens.data_ptr(), o_a_s.data_ptr(), -+ o_accs.data_ptr(), query.stride(0), query.stride(1), -+ key_cache.stride(0), key_cache.stride(1), key_cache.stride(2), -+ block_tables.stride(0), o_a_s.stride(0), o_a_s.stride(1), -+ o_accs.stride(0), o_accs.stride(1), scale, block_size, bsz, num_heads, -+ num_kv_heads, row_block_num, query.device()); -+ -+ func2(o_a_s.data_ptr(), o_accs.data_ptr(), output.data_ptr(), -+ context_lens.data_ptr(), o_a_s.stride(0), o_a_s.stride(1), -+ o_accs.stride(0), o_accs.stride(1), output.stride(0), output.stride(1), -+ bsz, num_heads, 
row_block_num, query.device()); -+} -diff --git a/csrc/xpu/base.hpp b/csrc/xpu/base.hpp -new file mode 100644 -index 000000000..c364c62e6 ---- /dev/null -+++ b/csrc/xpu/base.hpp -@@ -0,0 +1,118 @@ -+#pragma once -+ -+#include -+#include -+ -+#include "common.h" -+ -+using namespace sycl::ext::intel::esimd; -+using fp16 = sycl::half; -+ -+constexpr int QK = 64; -+constexpr int SBS = 4; -+ -+constexpr int BLOCK_SIZES[GGML_TYPE_COUNT] = { -+ [GGML_TYPE_Q4_0] = QK / 2, -+ [GGML_TYPE_Q4_0_WOQ] = QK / 2, -+ [GGML_TYPE_FP8E5] = QK, -+}; -+ -+constexpr int SCALE_SIZES[GGML_TYPE_COUNT] = { -+ [GGML_TYPE_Q4_0] = sizeof(fp16), -+ [GGML_TYPE_Q4_0_WOQ] = sizeof(fp16), -+ [GGML_TYPE_FP8E5] = 0, -+}; -+ -+template -+ESIMD_INLINE auto load_qblocks(const uint8_t * weight, const uint8_t * scale); -+ -+template<> -+ESIMD_INLINE auto load_qblocks(const uint8_t * weight, const uint8_t * scale) { -+ constexpr int BLOCK_SIZE = BLOCK_SIZES[GGML_TYPE_Q4_0]; -+ simd ybytes = block_load(weight); -+ const simd scales = block_load((const fp16 *)scale); -+ -+ simd yvs; -+ #pragma unroll -+ for (int i = 0; i < SBS; ++i) { -+ simd uyv; -+ uyv.select(0) = ybytes.template select(i * QK / 2) & (uint8_t)0xF; -+ uyv.select(QK / 2) = ybytes.template select(i * QK / 2) >> (uint8_t)4; -+ yvs.template select(i * QK) = (uyv.bit_cast_view() - (int8_t)8) * scales[i]; -+ } -+ return yvs; -+} -+ -+template<> -+ESIMD_INLINE auto load_qblocks(const uint8_t * weight, const uint8_t * scale) { -+ constexpr int BLOCK_SIZE = BLOCK_SIZES[GGML_TYPE_Q4_0_WOQ]; -+ simd ybytes = block_load(weight); -+ const simd scales = block_load((const fp16 *)scale); -+ -+ simd yvs; -+ #pragma unroll -+ for (int i = 0; i < SBS; ++i) { -+ simd uyv; -+ uyv.select(0) = ybytes.template select(i * QK / 2) & (uint8_t)0xF; -+ uyv.select(1) = ybytes.template select(i * QK / 2) >> (uint8_t)4; -+ yvs.template select(i * QK) = (uyv.bit_cast_view() - (int8_t)8) * scales[i]; -+ } -+ return yvs; -+} ++# run non-disagg. baseline & save outputs ++launch_baseline ++run_tests "http://${BASELINE_HOST}:${BASELINE_PORT}" "baseline" ++cleanup ++sleep 10 + + -+template<> -+ESIMD_INLINE auto load_qblocks(const uint8_t * weight, const uint8_t * scale) { -+ constexpr int BLOCK_SIZE = BLOCK_SIZES[GGML_TYPE_FP8E5]; -+ simd ybytes = block_load(weight); ++# run disagg. 
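The removed gqa_1/gqa_2 pair above is a split-KV decode scheme: the first pass emits, per 32-token KV chunk, a partial score maximum, exp-sum and value accumulator, and the second pass merges those partials by rescaling with exp(m_i - m). A minimal NumPy sketch of that merge for a single query head (hypothetical shapes, not the ESIMD kernels themselves):

import numpy as np

def split_kv_attention(q, k, v, chunk=32):
    # pass 1: per-chunk partials (max score, exp-sum, weighted value sum),
    # mirroring the (max_attn, softmax, accs) triple stored by the first kernel
    partials = []
    for s in range(0, k.shape[0], chunk):
        scores = k[s:s + chunk] @ q
        m = scores.max()
        e = np.exp(scores - m)
        partials.append((m, e.sum(), e @ v[s:s + chunk]))

    # pass 2: running merge with exp(old_max - new_max) rescaling,
    # as in the (exp1, exp2) update of the second kernel
    m_run, s_run, acc = -np.inf, 0.0, np.zeros_like(v[0])
    for m_i, s_i, acc_i in partials:
        m_new = max(m_run, m_i)
        e1, e2 = np.exp(m_run - m_new), np.exp(m_i - m_new)
        acc = acc * e1 + acc_i * e2
        s_run = s_run * e1 + s_i * e2
        m_run = m_new
    return acc / s_run

rng = np.random.default_rng(0)
q, k, v = rng.normal(size=64), rng.normal(size=(100, 64)), rng.normal(size=(100, 64))
scores = k @ q
weights = np.exp(scores - scores.max()) / np.exp(scores - scores.max()).sum()
assert np.allclose(split_kv_attention(q, k, v), weights @ v)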
& do exact-match with the outputs from baseline ++launch_pd ++launch_pd_proxy ++run_tests "http://${PROXY_HOST}:${PROXY_PORT}" "disagg" ++echo "-----P/D success----" + -+ simd yvs; -+ yvs.template bit_cast_view().template select(0) = 0x80; -+ yvs.template bit_cast_view().template select(1) = ybytes; -+ return yvs; -+} ++rm ${OUTPUT_FILE} ++cleanup + ++exit 0 +diff --git a/vllm/_ipex_ops.py b/vllm/_ipex_ops.py +index c2868c040..f6627808c 100644 +--- a/vllm/_ipex_ops.py ++++ b/vllm/_ipex_ops.py +@@ -207,6 +207,12 @@ class ipex_ops: + is_causal, return_softmax, + gen_) + else: # XPU build ++ if max_seqlen_q is None: ++ assert seqlen_q is not None ++ max_seqlen_q = int((seqlen_q[1:] - seqlen_q[:-1]).max().item()) ++ if max_seqlen_k is None: ++ assert seqlen_k is not None ++ max_seqlen_k = int((seqlen_k[1:] - seqlen_k[:-1]).max().item()) + ipex.llm.functional.varlen_attention( + query.contiguous(), key.contiguous(), value.contiguous(), out, + seqlen_q.int(), seqlen_k.int(), alibi_slopes, max_seqlen_q, +@@ -300,6 +306,7 @@ class ipex_ops: + causal, + block_table, + alibi_slopes, ++ sink=s_aux, + softcap=softcap, + window_size_left=real_window_size[0], + window_size_right=real_window_size[1], +diff --git a/vllm/attention/layer.py b/vllm/attention/layer.py +index bb05b468f..f1d657315 100644 +--- a/vllm/attention/layer.py ++++ b/vllm/attention/layer.py +@@ -23,6 +23,7 @@ from vllm.model_executor.layers.linear import UnquantizedLinearMethod + from vllm.model_executor.layers.quantization.base_config import ( + QuantizationConfig) + from vllm.model_executor.layers.quantization.kv_cache import BaseKVCacheMethod ++from vllm.model_executor.models.vision import get_vit_attn_backend + from vllm.platforms import _Backend, current_platform + from vllm.utils import direct_register_custom_op + +@@ -30,6 +31,15 @@ logger = init_logger(__name__) + USE_XFORMERS_OPS = None + + + -+// C++ doesn't support function template partial specialization, so write a new version for SBS=1 -+template -+ESIMD_INLINE auto load_qblock(const uint8_t * weight, const uint8_t * scale); ++def check_upstream_fa_availability(dtype: torch.dtype): ++ if dtype in (torch.float16, torch.bfloat16) and current_platform.is_cuda( ++ ) and current_platform.has_device_capability(80): ++ from transformers.utils import is_flash_attn_2_available ++ return is_flash_attn_2_available() ++ return False + -+template<> -+ESIMD_INLINE auto load_qblock(const uint8_t * weight, const uint8_t * scale) { -+ constexpr int BLOCK_SIZE = BLOCK_SIZES[GGML_TYPE_Q4_0]; -+ simd ybytes = block_load(weight); -+ fp16 scales = *(const fp16 *)scale; + -+ simd uyv; -+ uyv.select(0) = ybytes & (uint8_t)0xF; -+ uyv.select(QK / 2) = ybytes >> (uint8_t)4; -+ simd yv = (uyv.bit_cast_view() - (int8_t)8) * scales; + def check_xformers_availability(): + global USE_XFORMERS_OPS + if USE_XFORMERS_OPS is not None: +@@ -349,29 +359,55 @@ class MultiHeadAttention(nn.Module): + f"divisible by num_kv_heads ({self.num_kv_heads})" + self.num_queries_per_kv = self.num_heads // self.num_kv_heads + ++ # During model initialization, the default dtype is set as the model ++ # weight and activation dtype. 
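The _ipex_ops fallback above derives the missing max_seqlen values from the cumulative sequence-length tensors, and the flash-attention path added further down builds those same cumulative tensors for an equal-length batch. A short PyTorch illustration of both conversions, with hypothetical sizes:

import torch

bsz, q_len = 3, 5

# cu_seqlens for bsz sequences of identical length q_len: [0, 5, 10, 15]
cu_seqlens_q = torch.arange(0, (bsz + 1) * q_len, step=q_len, dtype=torch.int32)

# recovering max_seqlen when the caller did not pass it: adjacent
# differences of the cumulative tensor are the per-sequence lengths
lengths = cu_seqlens_q[1:] - cu_seqlens_q[:-1]
max_seqlen_q = int(lengths.max().item())
assert max_seqlen_q == q_len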
+ dtype = torch.get_default_dtype() +- attn_backend = get_attn_backend(head_size, +- dtype, +- kv_cache_dtype=None, +- block_size=16, +- is_attention_free=False) +- backend = backend_name_to_enum(attn_backend.get_name()) ++ ++ # Determine the attention backend ++ backend = get_vit_attn_backend(head_size=head_size, dtype=dtype) ++ ++ # Some auto-selected backends can be upgraded ++ # to upstream flash attention if available. ++ # If vllm native fa is selected, we use it directly. ++ use_upstream_fa = False ++ if backend != _Backend.FLASH_ATTN and check_upstream_fa_availability( ++ dtype): ++ backend = _Backend.FLASH_ATTN ++ use_upstream_fa = True ++ + if current_platform.is_rocm(): + # currently, only torch_sdpa is supported on rocm + self.attn_backend = _Backend.TORCH_SDPA + else: + -+ return yv; -+} + self.attn_backend = backend if backend in { + _Backend.TORCH_SDPA, + _Backend.TORCH_SDPA_VLLM_V1, + _Backend.XFORMERS, + _Backend.PALLAS_VLLM_V1, + _Backend.ROCM_AITER_FA, +- } else current_platform.get_vit_attn_backend() ++ _Backend.FLASH_ATTN, ++ _Backend.FLASH_ATTN_VLLM_V1, ++ } else _Backend.TORCH_SDPA + + if (self.attn_backend == _Backend.XFORMERS + and not check_xformers_availability()): + self.attn_backend = _Backend.TORCH_SDPA + ++ if self.attn_backend in { ++ _Backend.FLASH_ATTN, _Backend.FLASH_ATTN_VLLM_V1 ++ }: ++ if use_upstream_fa: ++ from flash_attn import flash_attn_varlen_func ++ self._flash_attn_varlen_func = flash_attn_varlen_func ++ else: ++ from vllm.vllm_flash_attn import flash_attn_varlen_func ++ self._flash_attn_varlen_func = flash_attn_varlen_func + -+template<> -+ESIMD_INLINE auto load_qblock(const uint8_t * weight, const uint8_t * scale) { -+ constexpr int BLOCK_SIZE = BLOCK_SIZES[GGML_TYPE_Q4_0_WOQ]; -+ simd ybytes = block_load(weight); -+ fp16 scales = *(const fp16 *)scale; ++ logger.info_once( ++ f"MultiHeadAttention attn_backend: {self.attn_backend}, " ++ f"use_upstream_fa: {use_upstream_fa}") + -+ simd uyv; -+ uyv.select(0) = ybytes & (uint8_t)0xF; -+ uyv.select(1) = ybytes >> (uint8_t)4; -+ simd yv = (uyv.bit_cast_view() - (int8_t)8) * scales; + def forward( + self, + query: torch.Tensor, +@@ -380,7 +416,7 @@ class MultiHeadAttention(nn.Module): + ) -> torch.Tensor: + """Input shape: batch_size x seq_len x hidden_size""" + # TODO(Isotr0py): Use existing backend implementations and support FA3 +- bsz, q_len, _ = query.size() ++ bsz, q_len = query.size()[:2] + kv_len = key.size(1) + + query = query.view(bsz, q_len, self.num_heads, self.head_size) +@@ -392,7 +428,31 @@ class MultiHeadAttention(nn.Module): + key = torch.repeat_interleave(key, num_repeat, dim=2) + value = torch.repeat_interleave(value, num_repeat, dim=2) + +- if self.attn_backend == _Backend.XFORMERS: ++ if self.attn_backend in { ++ _Backend.FLASH_ATTN, ++ _Backend.FLASH_ATTN_VLLM_V1, ++ }: ++ ++ cu_seqlens_q = torch.arange(0, (bsz + 1) * q_len, ++ step=q_len, ++ dtype=torch.int32, ++ device=query.device) ++ cu_seqlens_k = torch.arange(0, (bsz + 1) * kv_len, ++ step=kv_len, ++ dtype=torch.int32, ++ device=key.device) + -+ return yv; -+} ++ out = self._flash_attn_varlen_func( ++ query.flatten(0, 1), ++ key.flatten(0, 1), ++ value.flatten(0, 1), ++ cu_seqlens_q=cu_seqlens_q, ++ cu_seqlens_k=cu_seqlens_k, ++ max_seqlen_q=q_len, ++ max_seqlen_k=kv_len, ++ softmax_scale=self.scale, ++ ) ++ elif self.attn_backend == _Backend.XFORMERS: + from xformers import ops as xops + + out = xops.memory_efficient_attention_forward(query, +@@ -400,7 +460,8 @@ class MultiHeadAttention(nn.Module): + value, + 
scale=self.scale) + elif (self.attn_backend == _Backend.TORCH_SDPA +- or self.attn_backend == _Backend.TORCH_SDPA_VLLM_V1): ++ or self.attn_backend == _Backend.TORCH_SDPA_VLLM_V1 ++ or self.attn_backend == _Backend.IPEX): + query, key, value = (x.transpose(1, 2) + for x in (query, key, value)) + out = F.scaled_dot_product_attention(query, +diff --git a/vllm/benchmarks/serve.py b/vllm/benchmarks/serve.py +index a98eb2a78..14095ca4d 100644 +--- a/vllm/benchmarks/serve.py ++++ b/vllm/benchmarks/serve.py +@@ -430,7 +430,8 @@ async def benchmark( + test_prompt, test_prompt_len, test_output_len, test_mm_content = ( + input_requests[0].prompt, + input_requests[0].prompt_len, +- input_requests[0].expected_output_len, ++ #input_requests[0].expected_output_len, ++ 10, + input_requests[0].multi_modal_data, + ) + +diff --git a/vllm/distributed/device_communicators/xpu_communicator.py b/vllm/distributed/device_communicators/xpu_communicator.py +index 067315deb..b236bae26 100644 +--- a/vllm/distributed/device_communicators/xpu_communicator.py ++++ b/vllm/distributed/device_communicators/xpu_communicator.py +@@ -25,6 +25,12 @@ class XpuCommunicator(DeviceCommunicatorBase): + super().__init__(cpu_group, device, device_group, unique_name) + if self.use_all2all: + all2all_backend = envs.VLLM_ALL2ALL_BACKEND ++ if all2all_backend != "naive": ++ logger.warning( ++ "`%s` all2all manager is not supported on XPU." ++ "Falling back to `naive` all2all manager for XPU.", ++ all2all_backend) ++ all2all_backend = "naive" + if all2all_backend == "naive": + from .all2all import NaiveAll2AllManager + self.all2all_manager = NaiveAll2AllManager(self.cpu_group) +@@ -67,3 +73,16 @@ class XpuCommunicator(DeviceCommunicatorBase): + + def broadcast(self, input_: torch.Tensor, src: int = 0) -> None: + dist.broadcast(input_, src=src, group=self.device_group) ++ ++ def dispatch( ++ self, hidden_states: torch.Tensor, ++ router_logits: torch.Tensor) -> tuple[torch.Tensor, torch.Tensor]: ++ assert self.all2all_manager is not None ++ hidden_states, router_logits = self.all2all_manager.dispatch( ++ hidden_states, router_logits) ++ return hidden_states, router_logits ++ ++ def combine(self, hidden_states: torch.Tensor) -> torch.Tensor: ++ assert self.all2all_manager is not None ++ hidden_states = self.all2all_manager.combine(hidden_states) ++ return hidden_states +diff --git a/vllm/entrypoints/chat_utils.py b/vllm/entrypoints/chat_utils.py +index b53dbfb3a..48d205856 100644 +--- a/vllm/entrypoints/chat_utils.py ++++ b/vllm/entrypoints/chat_utils.py +@@ -431,6 +431,51 @@ def resolve_mistral_chat_template( + return None + + ++_PROCESSOR_CHAT_TEMPLATES = dict[tuple[str, bool], Optional[str]]() ++""" ++Used in `_try_get_processor_chat_template` to avoid calling ++`cached_get_processor` again if the processor fails to be loaded. + ++This is needed because `lru_cache` does not cache when an exception happens. 
++""" + -+template<> -+ESIMD_INLINE auto load_qblock(const uint8_t * weight, const uint8_t * scale) { -+ constexpr int BLOCK_SIZE = BLOCK_SIZES[GGML_TYPE_FP8E5]; -+ simd ybytes = block_load(weight); + -+ simd yvs; -+ yvs.template bit_cast_view().template select(0) = 0x80; -+ yvs.template bit_cast_view().template select(1) = ybytes; -+ return yvs; -+} -diff --git a/csrc/xpu/cache_ops_xpu.cpp b/csrc/xpu/cache_ops_xpu.cpp -new file mode 100644 -index 000000000..a3451c0e7 ---- /dev/null -+++ b/csrc/xpu/cache_ops_xpu.cpp -@@ -0,0 +1,579 @@ -+// clang-format off -+#ifdef VLLM_DEV -+#undef __SYCL_DEVICE_ONLY__ -+#endif -+#include -+#include -+#include -+// clang-format on -+#include "xpu_types.h" -+ -+#include -+#include "utils.h" -+ -+using fp16 = sycl::half; -+using namespace sycl::ext::intel::esimd; -+ -+template -+void reshape_and_cache_kernel( -+ const scalar_t* __restrict__ key, // [num_tokens, num_heads, head_size] -+ const scalar_t* __restrict__ value, // [num_tokens, num_heads, head_size] -+ scalar_t* __restrict__ key_cache, // [num_blocks, num_heads, head_size/x, -+ // block_size, x] -+ scalar_t* __restrict__ value_cache, // [num_blocks, num_heads, head_size, -+ // block_size] -+ const int64_t* __restrict__ slot_mapping, // [num_tokens] -+ const int key_stride, -+ const int value_stride, -+ const int num_heads, -+ const int head_size, -+ const int block_size, -+ const int x, -+ const sycl::nd_item<3>& item_ct1) { -+ const int64_t token_idx = item_ct1.get_group(2); -+ const int64_t slot_idx = slot_mapping[token_idx]; -+ if (slot_idx < 0) { -+ // Padding token that should be ignored. -+ return; -+ } -+ -+ const int64_t block_idx = slot_idx / block_size; -+ const int64_t block_offset = slot_idx % block_size; -+ -+ const int n = num_heads * head_size; -+ for (int i = item_ct1.get_local_id(2); i < n; -+ i += item_ct1.get_local_range(2)) { -+ const int64_t src_key_idx = token_idx * key_stride + i; -+ const int64_t src_value_idx = token_idx * value_stride + i; -+ -+ const int head_idx = i / head_size; -+ const int head_offset = i % head_size; -+ const int x_idx = head_offset / x; -+ const int x_offset = head_offset % x; -+ -+ const int64_t tgt_key_idx = -+ block_idx * num_heads * (head_size / x) * block_size * x + -+ head_idx * (head_size / x) * block_size * x + x_idx * block_size * x + -+ block_offset * x + x_offset; -+ const int64_t tgt_value_idx = -+ block_idx * num_heads * head_size * block_size + -+ head_idx * head_size * block_size + head_offset * block_size + -+ block_offset; -+ key_cache[tgt_key_idx] = key[src_key_idx]; -+ value_cache[tgt_value_idx] = value[src_value_idx]; -+ } -+} ++def _try_get_processor_chat_template( ++ tokenizer: Union[PreTrainedTokenizer, PreTrainedTokenizerFast], ++ model_config: ModelConfig, ++) -> Optional[str]: ++ cache_key = (tokenizer.name_or_path, model_config.trust_remote_code) ++ if cache_key in _PROCESSOR_CHAT_TEMPLATES: ++ return _PROCESSOR_CHAT_TEMPLATES[cache_key] + -+template -+void call_reshape_and_cache_kernel( -+ const scalar_t* __restrict__ key, -+ const scalar_t* __restrict__ value, -+ scalar_t* __restrict__ key_cache, -+ scalar_t* __restrict__ value_cache, -+ const int64_t* __restrict__ slot_mapping, -+ const int num_tokens, -+ const int key_stride, -+ const int value_stride, -+ const int num_heads, -+ const int head_size, -+ const int block_size, -+ const int x) { -+ using sycl_t = vllm::xpu::SyclTypeTrait::Type; -+ sycl::range<3> grid(1, 1, num_tokens); -+ sycl::range<3> block(1, 1, std::min(num_heads * head_size, 512)); -+ auto& queue = 
vllm::xpu::vllmGetQueue(); -+ queue.submit([&](sycl::handler& cgh) { -+ cgh.parallel_for( -+ sycl::nd_range<3>(grid * block, block), [=](sycl::nd_item<3> item_ct1) { -+ reshape_and_cache_kernel( -+ (const sycl_t* __restrict__)key, -+ (const sycl_t* __restrict__)value, -+ (sycl_t* __restrict__)key_cache, -+ (sycl_t* __restrict__)value_cache, -+ slot_mapping, -+ key_stride, -+ value_stride, -+ num_heads, -+ head_size, -+ block_size, -+ x, -+ item_ct1); -+ }); -+ }); -+} ++ try: ++ processor = cached_get_processor( ++ tokenizer.name_or_path, ++ processor_cls=( ++ PreTrainedTokenizer, ++ PreTrainedTokenizerFast, ++ ProcessorMixin, ++ ), ++ trust_remote_code=model_config.trust_remote_code, ++ ) ++ if ( ++ isinstance(processor, ProcessorMixin) ++ and hasattr(processor, "chat_template") ++ and (chat_template := processor.chat_template) is not None ++ ): ++ _PROCESSOR_CHAT_TEMPLATES[cache_key] = chat_template ++ return chat_template ++ except Exception: ++ logger.debug( ++ "Failed to load AutoProcessor chat template for %s", ++ tokenizer.name_or_path, ++ exc_info=True, ++ ) + -+void reshape_and_cache( -+ torch::Tensor& key, -+ torch::Tensor& value, -+ torch::Tensor& key_cache, -+ torch::Tensor& value_cache, -+ torch::Tensor& slot_mapping, -+ const std::string& kv_cache_dtype, -+ const float kv_scale) { -+ int num_tokens = key.size(0); -+ int num_heads = key.size(1); -+ int head_size = key.size(2); -+ int block_size = key_cache.size(3); -+ int x = key_cache.size(4); -+ -+ int key_stride = key.stride(0); -+ int value_stride = value.stride(0); -+ -+ VLLM_XPU_DISPATCH_FLOATING_TYPES( -+ key.scalar_type(), "call_reshape_and_cache_kernel", [&] { -+ call_reshape_and_cache_kernel( -+ key.data_ptr(), -+ value.data_ptr(), -+ key_cache.data_ptr(), -+ value_cache.data_ptr(), -+ slot_mapping.data_ptr(), -+ num_tokens, -+ key_stride, -+ value_stride, -+ num_heads, -+ head_size, -+ block_size, -+ x); -+ }); -+} ++ _PROCESSOR_CHAT_TEMPLATES[cache_key] = None ++ return None ++ ++ + def resolve_hf_chat_template( + tokenizer: Union[PreTrainedTokenizer, PreTrainedTokenizerFast], + chat_template: Optional[str], +@@ -444,28 +489,10 @@ def resolve_hf_chat_template( + + # 2nd priority: AutoProcessor chat template, unless tool calling is enabled + if tools is None: +- try: +- processor = cached_get_processor( +- tokenizer.name_or_path, +- processor_cls=( +- PreTrainedTokenizer, +- PreTrainedTokenizerFast, +- ProcessorMixin, +- ), +- trust_remote_code=model_config.trust_remote_code, +- ) +- if ( +- isinstance(processor, ProcessorMixin) +- and hasattr(processor, "chat_template") +- and processor.chat_template is not None +- ): +- return processor.chat_template +- except Exception: +- logger.debug( +- "Failed to load AutoProcessor chat template for %s", +- tokenizer.name_or_path, +- exc_info=True, +- ) # noqa: E501 ++ chat_template = _try_get_processor_chat_template(tokenizer, ++ model_config) ++ if chat_template is not None: ++ return chat_template + + # 3rd priority: AutoTokenizer chat template + try: +diff --git a/vllm/envs.py b/vllm/envs.py +index ac770ac4c..487fdcbfa 100755 +--- a/vllm/envs.py ++++ b/vllm/envs.py +@@ -70,7 +70,6 @@ if TYPE_CHECKING: + VLLM_VIDEO_LOADER_BACKEND: str = "opencv" + VLLM_MM_INPUT_CACHE_GIB: int = 4 + VLLM_TARGET_DEVICE: str = "cuda" +- VLLM_MAIN_CUDA_VERSION: str = "12.8" + MAX_JOBS: Optional[str] = None + NVCC_THREADS: Optional[str] = None + VLLM_USE_PRECOMPILED: bool = False +@@ -176,6 +175,8 @@ if TYPE_CHECKING: + VLLM_GPT_OSS_HARMONY_SYSTEM_INSTRUCTIONS: bool = False + 
VLLM_CUSTOM_SCOPES_FOR_PROFILING: bool = False + VLLM_KV_EVENTS_USE_INT_BLOCK_HASHES: bool = True ++ VLLM_XPU_FP8_DTYPE: str = "e5m2" ++ VLLM_OFFLOAD_WEIGHTS_BEFORE_QUANT: bool = False + + + def get_default_cache_root(): +@@ -247,11 +248,6 @@ environment_variables: dict[str, Callable[[], Any]] = { + "VLLM_TARGET_DEVICE": + lambda: os.getenv("VLLM_TARGET_DEVICE", "cuda").lower(), + +- # Main CUDA version of vLLM, supporting [12.6, 12.8, 12.9], +- # 12.8 is the default. This follows PyTorch but can be overridden. +- "VLLM_MAIN_CUDA_VERSION": +- lambda: os.getenv("VLLM_MAIN_CUDA_VERSION", "").lower() or "12.8", +- + # Maximum number of compilation jobs to run in parallel. + # By default this is the number of CPUs + "MAX_JOBS": +@@ -1247,6 +1243,14 @@ environment_variables: dict[str, Callable[[], Any]] = { + # raw bytes. Defaults to True for backward compatibility. + "VLLM_KV_EVENTS_USE_INT_BLOCK_HASHES": + lambda: bool(int(os.getenv("VLLM_KV_EVENTS_USE_INT_BLOCK_HASHES", "1"))), + -+template -+void reshape_and_cache_ipexllm_kernel( -+ const scalar_t* __restrict__ key, // [num_tokens, num_heads, head_size] -+ const scalar_t* __restrict__ value, // [num_tokens, num_heads, head_size] -+ scalar_t* __restrict__ key_cache, // [num_blocks, num_kv_heads, block_size, head_size] -+ scalar_t* __restrict__ value_cache, // [num_blocks, num_kv_heads, block_size, head_size] -+ const int64_t* __restrict__ slot_mapping, // [num_tokens] -+ const int key_stride, -+ const int value_stride, -+ const int num_heads, -+ const int head_size, -+ const int block_size, -+ const int x, -+ const sycl::nd_item<3>& item_ct1) { -+ const int64_t token_idx = item_ct1.get_group(2); -+ const int64_t slot_idx = slot_mapping[token_idx]; -+ if (slot_idx < 0) { -+ // Padding token that should be ignored. 
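The two XPU knobs registered above are plain environment strings; a quick check of how the lambdas resolve them (the offload flag only turns on for the literal string "1", and the fp8 dtype defaults to "e5m2"):

import os

os.environ.pop("VLLM_XPU_FP8_DTYPE", None)
os.environ["VLLM_OFFLOAD_WEIGHTS_BEFORE_QUANT"] = "1"

fp8_dtype = os.environ.get("VLLM_XPU_FP8_DTYPE", "e5m2")                   # -> "e5m2"
offload = os.environ.get("VLLM_OFFLOAD_WEIGHTS_BEFORE_QUANT", "0") == "1"  # -> True
assert fp8_dtype == "e5m2" and offload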
-+ return; -+ } -+ -+ const int64_t block_idx = slot_idx / block_size; -+ const int64_t block_offset = slot_idx % block_size; -+ -+ const int n = num_heads * head_size; -+ for (int i = item_ct1.get_local_id(2); i < n; -+ i += item_ct1.get_local_range(2)) { -+ const int64_t src_key_idx = token_idx * key_stride + i; -+ const int64_t src_value_idx = token_idx * value_stride + i; -+ -+ const int head_idx = i / head_size; -+ const int head_offset = i % head_size; -+ -+ // const int64_t tgt_key_idx = -+ // block_idx * num_heads * (head_size / x) * block_size * x + -+ // head_idx * (head_size / x) * block_size * x + x_idx * block_size * x + -+ // block_offset * x + x_offset; -+ -+ // const int64_t tgt_value_idx = -+ // block_idx * num_heads * head_size * block_size + -+ // head_idx * head_size * block_size + head_offset * block_size + -+ // block_offset; -+ -+ const int64_t tgt_value_idx = -+ block_idx * num_heads * head_size * block_size + -+ head_idx * head_size * block_size + -+ block_offset * head_size + -+ head_offset; -+ const int64_t tgt_key_idx = tgt_value_idx; -+ key_cache[tgt_key_idx] = key[src_key_idx]; -+ value_cache[tgt_value_idx] = value[src_value_idx]; -+ } -+} ++ # fp8 dtype for XPU platform ++ "VLLM_XPU_FP8_DTYPE": ++ lambda: os.environ.get("VLLM_XPU_FP8_DTYPE", "e5m2"), + -+template -+void call_reshape_and_cache_ipexllm_kernel( -+ const scalar_t* __restrict__ key, -+ const scalar_t* __restrict__ value, -+ scalar_t* __restrict__ key_cache, -+ scalar_t* __restrict__ value_cache, -+ const int64_t* __restrict__ slot_mapping, -+ const int num_tokens, -+ const int key_stride, -+ const int value_stride, -+ const int num_heads, -+ const int head_size, -+ const int block_size, -+ const int x) { -+ using sycl_t = vllm::xpu::SyclTypeTrait::Type; -+ sycl::range<3> grid(1, 1, num_tokens); -+ sycl::range<3> block(1, 1, std::min(num_heads * head_size, 512)); -+ auto& queue = vllm::xpu::vllmGetQueue(); -+ queue.submit([&](sycl::handler& cgh) { -+ cgh.parallel_for( -+ sycl::nd_range<3>(grid * block, block), [=](sycl::nd_item<3> item_ct1) { -+ reshape_and_cache_ipexllm_kernel( -+ (const sycl_t* __restrict__)key, -+ (const sycl_t* __restrict__)value, -+ (sycl_t* __restrict__)key_cache, -+ (sycl_t* __restrict__)value_cache, -+ slot_mapping, -+ key_stride, -+ value_stride, -+ num_heads, -+ head_size, -+ block_size, -+ x, -+ item_ct1); -+ }); -+ }); -+} ++ # Offload model weights to cpu before online fp8 quantization ++ "VLLM_OFFLOAD_WEIGHTS_BEFORE_QUANT": ++ lambda: os.environ.get("VLLM_OFFLOAD_WEIGHTS_BEFORE_QUANT", "0") == "1", + } + + # --8<-- [end:env-vars-definition] +diff --git a/vllm/model_executor/layers/fused_moe/layer.py b/vllm/model_executor/layers/fused_moe/layer.py +index a90a71159..5638da392 100644 +--- a/vllm/model_executor/layers/fused_moe/layer.py ++++ b/vllm/model_executor/layers/fused_moe/layer.py +@@ -601,7 +601,6 @@ class UnquantizedFusedMoEMethod(FusedMoEMethodBase, CustomOp): + logical_replica_count is not None: + raise NotImplementedError("Expert load balancing is not supported " + "for XPU.") +- assert custom_routing_function is None + return layer.ipex_fusion( + x, + use_grouped_topk, +@@ -610,6 +609,7 @@ class UnquantizedFusedMoEMethod(FusedMoEMethodBase, CustomOp): + renormalize, + topk_group, + num_expert_group, ++ custom_routing_function=custom_routing_function + ) + + def forward_tpu( +diff --git a/vllm/model_executor/layers/quantization/fp8.py b/vllm/model_executor/layers/quantization/fp8.py +index 3d94626e5..72c77e15c 100644 +--- 
a/vllm/model_executor/layers/quantization/fp8.py ++++ b/vllm/model_executor/layers/quantization/fp8.py +@@ -309,10 +309,14 @@ class Fp8LinearMethod(LinearMethodBase): + if self.quant_config.is_checkpoint_fp8_serialized else + params_dtype) + ++ # Force offloading weights to cpu if VLLM_OFFLOAD_WEIGHTS_BEFORE_QUANT ++ # enabled, otherwise use original device config which can be gpu or cpu ++ # (may happen when cpu_offload_gb > 0) + weight = ModelWeightParameter(data=torch.empty( + output_size_per_partition, + input_size_per_partition, +- dtype=weight_dtype), ++ dtype=weight_dtype, ++ device="cpu" if envs.VLLM_OFFLOAD_WEIGHTS_BEFORE_QUANT else None), + input_dim=1, + output_dim=0, + weight_loader=weight_loader) +@@ -631,8 +635,10 @@ class Fp8MoEMethod(FusedMoEMethodBase): + num_experts, + 2 * intermediate_size_per_partition, + hidden_size, +- dtype=params_dtype), ++ dtype=params_dtype, ++ device="cpu" if envs.VLLM_OFFLOAD_WEIGHTS_BEFORE_QUANT else None), + requires_grad=False) + -+void reshape_and_cache_ipexllm( -+ torch::Tensor& key, -+ torch::Tensor& value, -+ torch::Tensor& key_cache, -+ torch::Tensor& value_cache, -+ torch::Tensor& slot_mapping, -+ const std::string& kv_cache_dtype, -+ const float kv_scale) { -+ int num_tokens = key.size(0); -+ int num_heads = key.size(1); -+ int head_size = key.size(2); -+ int block_size = key_cache.size(2); -+ // int x = key_cache.size(4); -+ int x = 1; -+ -+ int key_stride = key.stride(0); -+ int value_stride = value.stride(0); -+ -+ VLLM_XPU_DISPATCH_FLOATING_TYPES( -+ key.scalar_type(), "call_reshape_and_cache_ipexllm_kernel", [&] { -+ call_reshape_and_cache_ipexllm_kernel( -+ key.data_ptr(), -+ value.data_ptr(), -+ key_cache.data_ptr(), -+ value_cache.data_ptr(), -+ slot_mapping.data_ptr(), -+ num_tokens, -+ key_stride, -+ value_stride, -+ num_heads, -+ head_size, -+ block_size, -+ x); -+ }); -+} + layer.register_parameter("w13_weight", w13_weight) + set_weight_attrs(w13_weight, extra_weight_attrs) + +@@ -640,7 +646,8 @@ class Fp8MoEMethod(FusedMoEMethodBase): + num_experts, + hidden_size, + intermediate_size_per_partition, +- dtype=params_dtype), ++ dtype=params_dtype, ++ device="cpu" if envs.VLLM_OFFLOAD_WEIGHTS_BEFORE_QUANT else None), + requires_grad=False) + layer.register_parameter("w2_weight", w2_weight) + set_weight_attrs(w2_weight, extra_weight_attrs) +diff --git a/vllm/model_executor/layers/quantization/ipex_quant.py b/vllm/model_executor/layers/quantization/ipex_quant.py +index 5f9d48142..6364d5cf5 100644 +--- a/vllm/model_executor/layers/quantization/ipex_quant.py ++++ b/vllm/model_executor/layers/quantization/ipex_quant.py +@@ -9,6 +9,7 @@ from torch.nn import Module + from torch.nn.parameter import Parameter + + from vllm._ipex_ops import ipex_ops as ops ++import vllm.envs as envs + from vllm.model_executor.layers.fused_moe import (FusedMoEMethodBase, + FusedMoeWeightScaleSupported) + from vllm.model_executor.layers.linear import (LinearBase, LinearMethodBase, +@@ -45,6 +46,7 @@ class IPEXConfig(QuantizationConfig): + modules_to_not_convert: Optional[list[str]] = None, + desc_act: Optional[bool] = None, + lm_head_quantized: Optional[bool] = None, ++ is_qweight_sym: Optional[bool] = None, + ) -> None: + super().__init__() + self.method = method +@@ -62,6 +64,7 @@ class IPEXConfig(QuantizationConfig): + if self.method not in ["awq", "gptq"]: + raise ValueError(f"IPEX quantization supports [awq, gptq], " + f"but got {self.method}.") ++ self.is_qweight_sym = is_qweight_sym + + def __repr__(self) -> str: + return 
(f"IPEXConfig(method={self.method}," +@@ -96,16 +99,18 @@ class IPEXConfig(QuantizationConfig): + ["q_group_size", "group_size"]) + modules_to_not_convert = cls.get_from_keys_or( + config, ["modules_to_not_convert"], None) ++ is_qweight_sym = not cls.get_from_keys_or(config, ["zero_point"], default=False) + return cls(method, weight_bits, group_size, modules_to_not_convert, +- False, False) ++ False, False, is_qweight_sym) + # otherwise for gptq + weight_bits = cls.get_from_keys(config, ["bits"]) + group_size = cls.get_from_keys(config, ["group_size"]) + lm_head_quantized = cls.get_from_keys_or(config, ["lm_head"], + default=False) + desc_act = cls.get_from_keys_or(config, ["desc_act"], default=False) ++ is_qweight_sym = cls.get_from_keys_or(config, ["sym"], default=True) + return cls(method, weight_bits, group_size, [], desc_act, +- lm_head_quantized) ++ lm_head_quantized, is_qweight_sym) + + @classmethod + def override_quantization_method( +@@ -183,7 +188,8 @@ class IPEXGPTQLinearMethod(GPTQLinearMethod): + g_idx=g_idx, + bias=bias, + group_size=self.quant_config.group_size, +- quant_method=IPEXConfig.IPEX_QUANT_METHOD_MAP["gptq"] ++ quant_method=IPEXConfig.IPEX_QUANT_METHOD_MAP["gptq"], ++ weight_qscheme="sym" if self.quant_config.is_qweight_sym else "asym", + ) + + def apply(self, +@@ -249,7 +255,8 @@ class IPEXAWQLinearMethod(AWQLinearMethod): + qconfig=qconfig, + bias=bias, + group_size=self.quant_config.group_size, +- quant_method=IPEXConfig.IPEX_QUANT_METHOD_MAP["awq"] # type: ignore ++ quant_method=IPEXConfig.IPEX_QUANT_METHOD_MAP["awq"], ++ weight_qscheme="sym" if self.quant_config.is_qweight_sym else "asym", + ) + + def apply(self, +@@ -302,12 +309,12 @@ class XPUFp8MoEMethod(FusedMoEMethodBase): + layer.num_experts = num_experts + layer.orig_dtype = params_dtype + layer.weight_block_size = None +- # WEIGHTS + w13_weight = torch.nn.Parameter(torch.empty( + num_experts, + 2 * intermediate_size_per_partition, + hidden_size, +- dtype=params_dtype), ++ dtype=params_dtype, ++ device="cpu" if envs.VLLM_OFFLOAD_WEIGHTS_BEFORE_QUANT else None), + requires_grad=False) + layer.register_parameter("w13_weight", w13_weight) + set_weight_attrs(w13_weight, extra_weight_attrs) +@@ -316,7 +323,8 @@ class XPUFp8MoEMethod(FusedMoEMethodBase): + num_experts, + hidden_size, + intermediate_size_per_partition, +- dtype=params_dtype), ++ dtype=params_dtype, ++ device="cpu" if envs.VLLM_OFFLOAD_WEIGHTS_BEFORE_QUANT else None), + requires_grad=False) + layer.register_parameter("w2_weight", w2_weight) + set_weight_attrs(w2_weight, extra_weight_attrs) +diff --git a/vllm/model_executor/layers/quantization/mxfp4.py b/vllm/model_executor/layers/quantization/mxfp4.py +index f935bdd84..9a80b80e7 100644 +--- a/vllm/model_executor/layers/quantization/mxfp4.py ++++ b/vllm/model_executor/layers/quantization/mxfp4.py +@@ -95,6 +95,9 @@ def get_mxfp4_backend(): + else: + logger.info_once("Using Triton backend") + return Mxfp4Backend.TRITON ++ elif current_platform.is_xpu(): ++ logger.info_once("Using ipex marlin backend on XPU") ++ return Mxfp4Backend.MARLIN + elif current_platform.is_rocm() and has_triton_kernels(): + logger.info_once("Using Triton backend") + return Mxfp4Backend.TRITON +@@ -140,7 +143,10 @@ class Mxfp4Config(QuantizationConfig): + return UnquantizedLinearMethod() + raise NotImplementedError("Mxfp4 linear layer is not implemented") + elif isinstance(layer, FusedMoE): +- return Mxfp4MoEMethod(layer.moe_config) ++ if current_platform.is_xpu(): ++ return IpexFp4MoeMethod(layer.moe_config) ++ else: ++ 
return Mxfp4MoEMethod(layer.moe_config) + elif isinstance(layer, Attention): + raise NotImplementedError( + "Mxfp4 attention layer is not implemented") +@@ -165,6 +171,7 @@ class Mxfp4MoEMethod(FusedMoEMethodBase): + def create_weights(self, layer: torch.nn.Module, num_experts: int, + hidden_size: int, intermediate_size_per_partition: int, + params_dtype: torch.dtype, **extra_weight_attrs): ++ self.original_hidden_size = hidden_size + self.num_experts = num_experts + weight_dtype = torch.uint8 + scale_dtype = torch.uint8 +@@ -192,7 +199,10 @@ class Mxfp4MoEMethod(FusedMoEMethodBase): + # k = intermediate_size_per_partition_after_pad + intermediate_size_per_partition_after_pad = round_up( + intermediate_size_per_partition, 128) +- hidden_size = round_up(hidden_size, 256) ++ if current_platform.is_xpu(): ++ hidden_size = round_up(hidden_size, 128) ++ else: ++ hidden_size = round_up(hidden_size, 256) + + layer.params_dtype = params_dtype + layer.num_experts = num_experts +@@ -949,3 +959,63 @@ class Mxfp4MoEMethod(FusedMoEMethodBase): + ) + else: + raise ValueError(f"Unsupported backend: {self.mxfp4_backend}") + + -+template -+void copy_blocks_kernel( -+ int64_t* key_cache_ptrs, -+ int64_t* value_cache_ptrs, -+ const int64_t* __restrict__ block_mapping, -+ const int numel_per_block, -+ const sycl::nd_item<3>& item_ct1) { -+ const int layer_idx = item_ct1.get_group(2); -+ const int pair_idx = item_ct1.get_group(1); -+ -+ scalar_t* key_cache = reinterpret_cast(key_cache_ptrs[layer_idx]); -+ scalar_t* value_cache = -+ reinterpret_cast(value_cache_ptrs[layer_idx]); -+ int64_t src_block_number = block_mapping[2 * pair_idx]; -+ int64_t dst_block_number = block_mapping[2 * pair_idx + 1]; -+ -+ const int64_t src_block_offset = src_block_number * numel_per_block; -+ const int64_t dst_block_offset = dst_block_number * numel_per_block; -+ for (int i = item_ct1.get_local_id(2); i < numel_per_block; -+ i += item_ct1.get_local_range(2)) { -+ int64_t src_offset = src_block_offset + i; -+ int64_t dst_offset = dst_block_offset + i; -+ key_cache[dst_offset] = key_cache[src_offset]; -+ } -+ for (int i = item_ct1.get_local_id(2); i < numel_per_block; -+ i += item_ct1.get_local_range(2)) { -+ int64_t src_offset = src_block_offset + i; -+ int64_t dst_offset = dst_block_offset + i; -+ value_cache[dst_offset] = value_cache[src_offset]; -+ } -+} ++class IpexFp4MoeMethod(Mxfp4MoEMethod): + -+template -+void call_copy_blocks_kernel( -+ std::vector& key_caches, -+ std::vector& value_caches, -+ const std::map>& block_mapping) { -+ using sycl_t = vllm::xpu::SyclTypeTrait::Type; -+ int num_layers = key_caches.size(); -+ TORCH_CHECK(num_layers == value_caches.size()); -+ if (num_layers == 0) { -+ return; -+ } -+ torch::Device cache_device = key_caches[0].device(); -+ TORCH_CHECK(cache_device.is_xpu()); -+ // Create data structures for the kernel. -+ // Create an array of pointers to the key and value caches. -+ int64_t key_cache_ptrs[num_layers]; -+ int64_t value_cache_ptrs[num_layers]; -+ for (int layer_idx = 0; layer_idx < num_layers; ++layer_idx) { -+ key_cache_ptrs[layer_idx] = -+ reinterpret_cast(key_caches[layer_idx].data_ptr()); -+ value_cache_ptrs[layer_idx] = -+ reinterpret_cast(value_caches[layer_idx].data_ptr()); -+ } -+ // Create block mapping array. 
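On the XPU branch above the hidden size is rounded up to a multiple of 128 rather than 256, and the Ipex MoE path zero-pads activations to that width before the fused call and slices the result back. A toy version of the pad/slice round trip, with a hypothetical hidden size and a stand-in for the fused kernel:

import torch
import torch.nn.functional as F

def round_up(x: int, multiple: int) -> int:
    return (x + multiple - 1) // multiple * multiple

hidden_size = 2880                                   # not a multiple of 128
hidden_size_pad = round_up(hidden_size, 128)         # 2944

x = torch.randn(4, hidden_size)
x_pad = F.pad(x, (0, hidden_size_pad - x.size(-1)))  # zero-pad the last dim
y_pad = 2.0 * x_pad                                  # stand-in for the fused MoE kernel
y = y_pad[..., :hidden_size].contiguous()            # drop the padding again
assert y.shape == x.shape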
-+ std::vector block_mapping_vec; -+ for (const auto& pair : block_mapping) { -+ int64_t src_block_number = pair.first; -+ for (int64_t dst_block_number : pair.second) { -+ block_mapping_vec.push_back(src_block_number); -+ block_mapping_vec.push_back(dst_block_number); -+ } -+ } -+ int64_t* block_mapping_array = block_mapping_vec.data(); -+ int num_pairs = block_mapping_vec.size() / 2; -+ // Move the data structures to the GPU. -+ // NOTE: This synchronizes the CPU and GPU. -+ torch::Tensor key_cache_ptrs_tensor = -+ torch::from_blob(key_cache_ptrs, {num_layers}, torch::kInt64) -+ .to(cache_device); -+ torch::Tensor value_cache_ptrs_tensor = -+ torch::from_blob(value_cache_ptrs, {num_layers}, torch::kInt64) -+ .to(cache_device); -+ torch::Tensor block_mapping_tensor = -+ torch::from_blob(block_mapping_array, {2 * num_pairs}, torch::kInt64) -+ .to(cache_device); -+ auto k_ptr = key_cache_ptrs_tensor.data_ptr(); -+ auto v_ptr = value_cache_ptrs_tensor.data_ptr(); -+ auto b_ptr = block_mapping_tensor.data_ptr(); -+ // Launch the kernel. -+ const int numel_per_block = key_caches[0][0].numel(); -+ -+ sycl::range<3> grid(1, num_pairs, num_layers); -+ sycl::range<3> block(1, 1, std::min(1024, numel_per_block)); -+ auto& queue = vllm::xpu::vllmGetQueue(); -+ queue.submit([&](sycl::handler& cgh) { -+ cgh.parallel_for( -+ sycl::nd_range<3>(grid * block, block), [=](sycl::nd_item<3> item_ct1) { -+ copy_blocks_kernel( -+ k_ptr, v_ptr, b_ptr, numel_per_block, item_ct1); -+ }); -+ }); -+} ++ def __init__(self, moe_config: FusedMoEConfig): ++ super().__init__(moe_config) ++ self.moe_config = moe_config ++ self.alpha = 1.702 ++ self.limit = 7.0 + -+void copy_blocks( -+ std::vector& key_caches, -+ std::vector& value_caches, -+ const std::map>& block_mapping) { -+ VLLM_XPU_DISPATCH_FLOATING_TYPES( -+ key_caches[0].scalar_type(), "call_copy_blocks_kernel", [&] { -+ call_copy_blocks_kernel( -+ key_caches, value_caches, block_mapping); -+ }); -+} ++ def process_weights_after_loading(self, layer: torch.nn.Module) -> None: ++ import intel_extension_for_pytorch as ipex ++ layer.w13_weight.data = layer.w13_weight.data.view(torch.int32) ++ layer.w2_weight.data = layer.w2_weight.data.view(torch.int32) ++ layer.ipex_fusion = ipex.llm.modules.GatedMLPMOE( ++ layer.w13_weight, ++ layer.w2_weight, ++ w1_scale_inv=layer.w13_weight_scale, ++ w2_scale_inv=layer.w2_weight_scale, ++ w13_bias=layer.w13_bias, ++ w2_bias=layer.w2_bias, ++ is_mxfp4=True, ++ ) + -+void swap_blocks( -+ torch::Tensor& src, -+ torch::Tensor& dst, -+ const std::map& block_mapping) { -+ char* src_ptr = (char*)src.data_ptr(); -+ char* dst_ptr = (char*)dst.data_ptr(); -+ -+ const int64_t block_size_in_bytes = src.element_size() * src[0].numel(); -+ auto& queue = vllm::xpu::vllmGetQueue(); -+ -+ // NOTE(woosuk): This can be slow if the number of blocks is large. 
-+ for (const auto& pair : block_mapping) { -+ int64_t src_block_number = pair.first; -+ int64_t dst_block_number = pair.second; -+ int64_t src_offset = src_block_number * block_size_in_bytes; -+ int64_t dst_offset = dst_block_number * block_size_in_bytes; -+ queue.memcpy( -+ dst_ptr + dst_offset, src_ptr + src_offset, block_size_in_bytes); -+ } -+ queue.wait(); -+} ++ def apply( ++ self, ++ layer: torch.nn.Module, ++ x: torch.Tensor, ++ router_logits: torch.Tensor, ++ top_k: int, ++ renormalize: bool, ++ use_grouped_topk: bool = False, ++ topk_group: Optional[int] = None, ++ num_expert_group: Optional[int] = None, ++ global_num_experts: int = -1, ++ expert_map: Optional[torch.Tensor] = None, ++ custom_routing_function: Optional[Callable] = None, ++ scoring_func: str = "softmax", ++ routed_scaling_factor: float = 1.0, ++ e_score_correction_bias: Optional[torch.Tensor] = None, ++ apply_router_weight_on_input: bool = False, ++ activation: str = "silu", ++ enable_eplb: bool = False, ++ expert_load_view: Optional[torch.Tensor] = None, ++ logical_to_physical_map: Optional[torch.Tensor] = None, ++ logical_replica_count: Optional[torch.Tensor] = None, ++ ) -> torch.Tensor: ++ hidden_size_pad = round_up(self.original_hidden_size, 128) ++ x_pad = torch.nn.functional.pad( ++ x, (0, hidden_size_pad - x.size(-1))) ++ hidden_states = layer.ipex_fusion(x_pad, ++ use_grouped_topk, ++ top_k, ++ router_logits, ++ renormalize, ++ topk_group, ++ num_expert_group, ++ activation="swiglu_oai") ++ hidden_states = hidden_states[..., :self.original_hidden_size].contiguous() ++ return hidden_states +diff --git a/vllm/model_executor/layers/rotary_embedding/__init__.py b/vllm/model_executor/layers/rotary_embedding/__init__.py +index 564f9a5c0..c9653aa9e 100644 +--- a/vllm/model_executor/layers/rotary_embedding/__init__.py ++++ b/vllm/model_executor/layers/rotary_embedding/__init__.py +@@ -103,6 +103,8 @@ def get_rope( + is_neox_style, + dtype, + mrope_section=rope_scaling["mrope_section"], ++ mrope_interleaved=rope_scaling.get("mrope_interleaved", ++ False), + ) + else: + rotary_emb = RotaryEmbedding( +diff --git a/vllm/model_executor/layers/rotary_embedding/deepseek_scaling_rope.py b/vllm/model_executor/layers/rotary_embedding/deepseek_scaling_rope.py +index 7ac2e4bb6..450d0cee1 100644 +--- a/vllm/model_executor/layers/rotary_embedding/deepseek_scaling_rope.py ++++ b/vllm/model_executor/layers/rotary_embedding/deepseek_scaling_rope.py +@@ -138,3 +138,12 @@ class DeepseekScalingRotaryEmbedding(RotaryEmbedding): + offsets: Optional[torch.Tensor] = None, + ) -> tuple[torch.Tensor, Optional[torch.Tensor]]: + return self.forward_native(positions, query, key, offsets) + -+template -+void gather_cached_kv_kernel( -+ scalar_t* __restrict__ key, // [num_tokens, [stride], num_heads, head_size] -+ scalar_t* __restrict__ value, // [num_tokens, [stride], num_heads, -+ // head_size] -+ const scalar_t* __restrict__ key_cache, // [num_blocks, num_heads, -+ // head_size/x, block_size, x] -+ const scalar_t* __restrict__ value_cache, // [num_blocks, num_heads, -+ // head_size, block_size] -+ const int* __restrict__ slot_mapping, // [num_tokens] -+ const int key_stride, -+ const int value_stride, -+ const int num_heads, -+ const int head_size, -+ const int block_size, -+ const int x, -+ const sycl::nd_item<3>& item_ct1) { -+ const int token_idx = item_ct1.get_group(2); -+ const int slot_idx = slot_mapping[token_idx]; -+ const int block_idx = slot_idx / block_size; -+ const int block_offset = slot_idx % block_size; -+ -+ const int 
num_tokens = num_heads * head_size; -+ for (int i = item_ct1.get_local_id(2); i < num_tokens; -+ i += item_ct1.get_local_range(2)) { -+ const int tgt_key_idx = token_idx * key_stride + i; -+ const int tgt_value_idx = token_idx * value_stride + i; -+ -+ const int head_idx = i / head_size; -+ const int head_offset = i % head_size; -+ const int x_idx = -+ head_offset / x; // the offset of the [head_size/x] dimension -+ const int x_offset = head_offset % x; -+ -+ // const int src_key_idx = -+ // block_idx * num_heads * (head_size / x) * block_size * x + -+ // head_idx * (head_size / x) * block_size * x + x_idx * block_size * x + -+ // block_offset * x + x_offset; -+ // const int src_value_idx = block_idx * num_heads * head_size * block_size + -+ // head_idx * head_size * block_size + head_offset * block_size + -+ // block_offset; -+ -+ const int src_value_idx = -+ block_idx * num_heads * head_size * block_size + -+ head_idx * head_size * block_size + -+ block_offset * head_size + -+ head_offset; -+ const int src_key_idx = src_value_idx; -+ -+ key[tgt_key_idx] = VLLM_LDG(&key_cache[src_key_idx]); -+ value[tgt_value_idx] = VLLM_LDG(&value_cache[src_value_idx]); -+ } -+} ++ def forward_xpu( ++ self, ++ positions: torch.Tensor, ++ query: torch.Tensor, ++ key: Optional[torch.Tensor] = None, ++ offsets: Optional[torch.Tensor] = None, ++ ) -> tuple[torch.Tensor, Optional[torch.Tensor]]: ++ return self.forward_native(positions, query, key, offsets) +diff --git a/vllm/model_executor/layers/rotary_embedding/mrope.py b/vllm/model_executor/layers/rotary_embedding/mrope.py +index 0acb5ea74..c4b8c66eb 100644 +--- a/vllm/model_executor/layers/rotary_embedding/mrope.py ++++ b/vllm/model_executor/layers/rotary_embedding/mrope.py +@@ -177,6 +177,18 @@ def triton_mrope( + return q, k + + ++def apply_interleaved_rope(x: torch.Tensor, ++ mrope_section: list[int]) -> torch.Tensor: ++ """Apply interleaved MRoPE to 3D rotary embeddings. ++ Reorganizes frequency layout from chunked [TTT...HHH...WWW] to ++ interleaved [THTHWHTHW...TT], preserving frequency continuity. 
++ """ ++ x_t = x[0].clone() ++ x_t[..., 1:mrope_section[1] * 3:3] = x[1, ..., 1:mrope_section[1] * 3:3] ++ x_t[..., 2:mrope_section[2] * 3:3] = x[2, ..., 2:mrope_section[2] * 3:3] ++ return x_t + -+template -+void gather_cached_kv_kernel_optimized( -+ scalar_t* __restrict__ key, // [num_tokens, [stride], num_heads, head_size] -+ scalar_t* __restrict__ value, // [num_tokens, [stride], num_heads, -+ // head_size] -+ const scalar_t* __restrict__ key_cache, // [num_blocks, num_heads, -+ // head_size/x, block_size, x] -+ const scalar_t* __restrict__ value_cache, // [num_blocks, num_heads, -+ // head_size, block_size] -+ const int* __restrict__ slot_mapping, // [num_tokens] -+ const int key_stride, -+ const int value_stride, -+ const int num_heads, -+ const int head_size, -+ const int block_size, -+ const int x, -+ const sycl::nd_item<3>& item_ct1) { -+ const int token_idx = item_ct1.get_group(2); -+ const int slot_idx = slot_mapping[token_idx]; -+ const int block_idx = slot_idx / block_size; -+ const int block_offset = slot_idx % block_size; -+ -+ const int dim = num_heads * head_size; -+ assert(dim % 4 == 0); // this is true for known use cases -+ const int unroll_factor = 4; -+ const int unrolled_dim = dim / unroll_factor; -+ -+ for (int i = item_ct1.get_local_id(2); i < unrolled_dim; -+ i += item_ct1.get_local_range(2)) { -+ int tgt_key_indices[unroll_factor]; -+ int tgt_value_indices[unroll_factor]; -+ int src_key_indices[unroll_factor]; -+ int src_value_indices[unroll_factor]; -+ scalar_t keys_to_store[unroll_factor]; -+ scalar_t values_to_store[unroll_factor]; -+ -+#pragma unroll -+ for (int j = 0; j < unroll_factor; ++j) { -+ int index = i + j * unrolled_dim; -+ -+ const int tgt_key_idx = token_idx * key_stride + index; -+ const int tgt_value_idx = token_idx * value_stride + index; -+ -+ const int head_idx = index / head_size; -+ const int head_offset = index % head_size; -+ -+ const int src_value_idx = -+ block_idx * num_heads * head_size * block_size + -+ head_idx * head_size * block_size + -+ block_offset * head_size + -+ head_offset; -+ const int src_key_idx = src_value_idx; -+ -+ tgt_key_indices[j] = tgt_key_idx; -+ tgt_value_indices[j] = tgt_value_idx; -+ src_key_indices[j] = src_key_idx; -+ src_value_indices[j] = src_value_idx; -+ -+ keys_to_store[j] = VLLM_LDG(&key_cache[src_key_idx]); -+ values_to_store[j] = VLLM_LDG(&value_cache[src_value_idx]); -+ } + -+#pragma unroll -+ for (int j = 0; j < unroll_factor; ++j) { -+ key[tgt_key_indices[j]] = keys_to_store[j]; -+ value[tgt_value_indices[j]] = values_to_store[j]; -+ } -+ } -+} + class MRotaryEmbedding(RotaryEmbedding): + """Rotary Embedding with Multimodal Sections.""" + +@@ -189,6 +201,7 @@ class MRotaryEmbedding(RotaryEmbedding): + is_neox_style: bool, + dtype: torch.dtype, + mrope_section: Optional[list[int]] = None, ++ mrope_interleaved: Optional[bool] = False, + ) -> None: + # In Qwen2.5-VL, the maximum index value is related to the duration of + # the input video. 
We enlarge max_position_embeddings to 4 times to get +@@ -198,6 +211,7 @@ class MRotaryEmbedding(RotaryEmbedding): + base, is_neox_style, dtype) + + self.mrope_section = mrope_section ++ self.mrope_interleaved = mrope_interleaved + if self.mrope_section: + assert sum(self.mrope_section) == rotary_dim // 2 + +@@ -225,17 +239,20 @@ class MRotaryEmbedding(RotaryEmbedding): + cos, sin = cos_sin.chunk(2, dim=-1) + if positions.ndim == 2: + assert self.mrope_section +- +- cos = torch.cat([ +- m[i] +- for i, m in enumerate(cos.split(self.mrope_section, dim=-1)) +- ], +- dim=-1) +- sin = torch.cat([ +- m[i] +- for i, m in enumerate(sin.split(self.mrope_section, dim=-1)) +- ], +- dim=-1) ++ if self.mrope_interleaved: ++ cos = apply_interleaved_rope(cos, self.mrope_section) ++ sin = apply_interleaved_rope(sin, self.mrope_section) ++ else: ++ cos = torch.cat([ ++ m[i] for i, m in enumerate( ++ cos.split(self.mrope_section, dim=-1)) ++ ], ++ dim=-1) ++ sin = torch.cat([ ++ m[i] for i, m in enumerate( ++ sin.split(self.mrope_section, dim=-1)) ++ ], ++ dim=-1) + + query_shape = query.shape + query = query.view(num_tokens, -1, self.head_size) +@@ -265,6 +282,10 @@ class MRotaryEmbedding(RotaryEmbedding): + assert positions.ndim == 1 or positions.ndim == 2 + assert key is not None + ++ if self.mrope_interleaved: ++ # TODO: add triton implementation to support mrope-interleaved ++ return self.forward_native(positions, query, key) + -+template -+void call_gather_cached_kv_kernel_optimized( -+ torch::Tensor& key, -+ torch::Tensor& value, -+ torch::Tensor& key_cache, -+ torch::Tensor& value_cache, -+ torch::Tensor& slot_mapping) { -+ using sycl_t = vllm::xpu::SyclTypeTrait::Type; -+ int num_tokens = key.size(0); -+ int num_heads = key.size(1); -+ int head_size = key.size(2); -+ int block_size = key_cache.size(2); -+ // int x = key_cache.size(4); -+ int x = 1; -+ -+ int key_stride = key.stride(0); -+ int value_stride = value.stride(0); -+ auto key_ptr = key.data_ptr(); -+ auto value_ptr = value.data_ptr(); -+ auto key_cache_ptr = key_cache.data_ptr(); -+ auto value_cache_ptr = value_cache.data_ptr(); -+ auto slot_mapping_ptr = slot_mapping.data_ptr(); -+ sycl::range<3> grid(1, 1, num_tokens); -+ sycl::range<3> block(1, 1, std::min(num_heads * head_size, 512)); -+ auto& queue = vllm::xpu::vllmGetQueue(); -+ queue.submit([&](sycl::handler& cgh) { -+ cgh.parallel_for( -+ sycl::nd_range<3>(grid * block, block), [=](sycl::nd_item<3> item_ct1) { -+ gather_cached_kv_kernel_optimized( -+ (sycl_t* __restrict__)key_ptr, -+ (sycl_t* __restrict__)value_ptr, -+ (const sycl_t* __restrict__)key_cache_ptr, -+ (const sycl_t* __restrict__)value_cache_ptr, -+ slot_mapping_ptr, -+ key_stride, -+ value_stride, -+ num_heads, -+ head_size, -+ block_size, -+ x, -+ item_ct1); -+ }); -+ }); -+} + num_tokens = positions.shape[-1] + cos_sin = self.cos_sin_cache[positions] + cos, sin = cos_sin.chunk(2, dim=-1) +@@ -300,6 +321,15 @@ class MRotaryEmbedding(RotaryEmbedding): + key = torch.cat((key_rot, key_pass), dim=-1).reshape(key_shape) + return query, key + ++ def forward_xpu( ++ self, ++ positions: torch.Tensor, ++ query: torch.Tensor, ++ key: Optional[torch.Tensor] = None, ++ offsets: Optional[torch.Tensor] = None, ++ ) -> tuple[torch.Tensor, Optional[torch.Tensor]]: ++ return self.forward_native(positions, query, key, offsets) + -+void gather_cached_kv( -+ torch::Tensor& key, -+ torch::Tensor& value, -+ torch::Tensor& key_cache, -+ torch::Tensor& value_cache, -+ torch::Tensor& slot_mapping) { -+ VLLM_XPU_DISPATCH_FLOATING_TYPES( -+ 
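The apply_interleaved_rope helper in this hunk selects temporal, height and width frequencies with a stride-3 interleave instead of contiguous chunks. A tiny check of which stream lands in each frequency slot, assuming a hypothetical mrope_section of [4, 3, 3]:

import torch

mrope_section = [4, 3, 3]            # t/h/w split; sums to rotary_dim // 2
dim = sum(mrope_section)
# x[i] is filled with its stream index: 0 = temporal, 1 = height, 2 = width
x = torch.stack([torch.full((dim,), float(i)) for i in range(3)])

out = x[0].clone()
out[1:mrope_section[1] * 3:3] = x[1, 1:mrope_section[1] * 3:3]
out[2:mrope_section[2] * 3:3] = x[2, 2:mrope_section[2] * 3:3]
print(out.tolist())  # [0.0, 1.0, 2.0, 0.0, 1.0, 2.0, 0.0, 1.0, 2.0, 0.0] -> T H W T H W T H W T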
key_cache[0].scalar_type(), -+ "call_gather_cached_kv_kernel_optimized", -+ [&] { -+ call_gather_cached_kv_kernel_optimized( -+ key, value, key_cache, value_cache, slot_mapping); -+ }); -+} -diff --git a/csrc/xpu/cache_ops_xpu_fp8.cpp b/csrc/xpu/cache_ops_xpu_fp8.cpp -new file mode 100644 -index 000000000..e4a0001fe ---- /dev/null -+++ b/csrc/xpu/cache_ops_xpu_fp8.cpp -@@ -0,0 +1,170 @@ -+// clang-format off -+#ifdef VLLM_DEV -+#undef __SYCL_DEVICE_ONLY__ -+#endif -+#include -+#include -+#include -+// clang-format on -+#include "xpu_types.h" -+ -+#include -+#include "utils.h" -+#include "kv.h" -+ -+using fp16 = sycl::half; -+using namespace sycl::ext::intel::esimd; -+ -+// scalar_t is key.scalar_type() -> half -+template -+void reshape_and_cache_ipexllm_kernel_fp8( -+ const scalar_t* __restrict__ key, // [num_tokens, num_heads, head_size] -+ const scalar_t* __restrict__ value, // [num_tokens, num_heads, head_size] -+ uint8_t * __restrict__ key_cache, // [num_blocks, num_kv_heads, block_size, -+ // head_size] -+ uint8_t * __restrict__ value_cache, // [num_blocks, num_kv_heads, -+ // block_size, head_size] -+ const int64_t* __restrict__ slot_mapping, // [num_tokens] -+ const int key_stride, const int value_stride, -+ const int key_head_stride, const int value_head_stride, -+ const int num_heads, -+ const int head_size, const int block_size, const int x, -+ const sycl::nd_item<3>& item_ct1) { -+ -+ // New Implementation // -+ const size_t token_idx = item_ct1.get_global_id(0); -+ const size_t head_idx = item_ct1.get_global_id(1); -+ const int64_t slot_idx = slot_mapping[token_idx]; -+ if (slot_idx < 0) { -+ return; -+ } -+ const int64_t block_idx = slot_idx / block_size; -+ const int64_t block_offset = slot_idx % block_size; -+ // The thread is responsible for the HD elements within key/value -+ const scalar_t * key_head = key + token_idx * key_stride + head_idx * key_head_stride; -+ -+ const scalar_t * value_head = value + token_idx * value_stride + head_idx * value_head_stride; -+ -+ uint8_t * key_output_head = key_cache + block_idx * num_heads * head_size * block_size + -+ head_idx * head_size * block_size + block_offset * head_size; -+ uint8_t * value_output_head = value_cache + block_idx * num_heads * head_size * block_size + -+ head_idx * head_size * block_size + block_offset * head_size; -+ -+ simd key_row = block_load(key_head); -+ simd key_result = quantize_key_row(key_row); -+ block_store(key_output_head, key_result); -+ -+ simd value_row = block_load(value_head); -+ simd value_result = quantize_value_row(value_row); -+ block_store(value_output_head, value_result); -+} -+ -+ -+template -+void call_reshape_and_cache_ipexllm_kernel_fp8( -+ const scalar_t* __restrict__ key, const scalar_t* __restrict__ value, -+ uint8_t* __restrict__ key_cache, uint8_t* __restrict__ value_cache, -+ const int64_t* __restrict__ slot_mapping, const int num_tokens, -+ const int key_stride, const int value_stride, -+ const int key_head_stride, const int value_head_stride, -+ const int num_heads, -+ const int head_size, const int block_size, const int x) { -+ using sycl_t = vllm::xpu::SyclTypeTrait::Type; -+ sycl::range<3> grid(num_tokens, num_heads, 1); -+ sycl::range<3> block(1, 1, 1); -+ auto& queue = vllm::xpu::vllmGetQueue(); -+ queue.submit([&](sycl::handler& cgh) { -+ cgh.parallel_for( -+ sycl::nd_range<3>(grid * block, block), [=](sycl::nd_item<3> item_ct1) SYCL_ESIMD_KERNEL { -+ reshape_and_cache_ipexllm_kernel_fp8( -+ (const sycl_t* __restrict__)key, -+ (const sycl_t* __restrict__)value, -+ 
(uint8_t* __restrict__)key_cache, -+ (uint8_t* __restrict__)value_cache, slot_mapping, key_stride, -+ value_stride, key_head_stride, value_head_stride, -+ num_heads, head_size, block_size, x, item_ct1); -+ }); -+ }); -+} -+ -+void reshape_and_cache_ipexllm_fp8(torch::Tensor& key, torch::Tensor& value, -+ torch::Tensor& key_cache, -+ torch::Tensor& value_cache, -+ torch::Tensor& slot_mapping, -+ const std::string& kv_cache_dtype, -+ const float kv_scale) { -+ int num_tokens = key.size(0); -+ int num_heads = key.size(1); -+ int head_size = key.size(2); -+ int block_size = key_cache.size(2); -+ // int x = key_cache.size(4); -+ int x = 1; -+ -+ int key_stride = key.stride(0); -+ int value_stride = value.stride(0); -+ -+ int key_head_stride = key.stride(1); -+ int value_head_stride = value.stride(1); -+ -+ // This actually dispatches on scalar_type, we will then need to dispatch on Head Dim... -+switch (head_size) { -+ case 64: -+ VLLM_XPU_DISPATCH_FLOATING_TYPES( -+ key.scalar_type(), "call_reshape_and_cache_ipexllm_kernel_fp8", [&] { -+ call_reshape_and_cache_ipexllm_kernel_fp8( -+ key.data_ptr(), value.data_ptr(), -+ key_cache.data_ptr(), value_cache.data_ptr(), -+ slot_mapping.data_ptr(), num_tokens, key_stride, -+ value_stride, key_head_stride, value_head_stride, num_heads, -+ head_size, block_size, x); -+ }); -+ break; -+ case 128: -+ VLLM_XPU_DISPATCH_FLOATING_TYPES( -+ key.scalar_type(), "call_reshape_and_cache_ipexllm_kernel_fp8", [&] { -+ call_reshape_and_cache_ipexllm_kernel_fp8( -+ key.data_ptr(), value.data_ptr(), -+ key_cache.data_ptr(), value_cache.data_ptr(), -+ slot_mapping.data_ptr(), num_tokens, key_stride, -+ value_stride, key_head_stride, value_head_stride, num_heads, -+ head_size, block_size, x); -+ }); -+ break; -+ case 96: -+ VLLM_XPU_DISPATCH_FLOATING_TYPES( -+ key.scalar_type(), "call_reshape_and_cache_ipexllm_kernel_fp8", [&] { -+ call_reshape_and_cache_ipexllm_kernel_fp8( -+ key.data_ptr(), value.data_ptr(), -+ key_cache.data_ptr(), value_cache.data_ptr(), -+ slot_mapping.data_ptr(), num_tokens, key_stride, -+ value_stride, key_head_stride, value_head_stride, num_heads, -+ head_size, block_size, x); -+ }); -+ break; -+ case 80: -+ VLLM_XPU_DISPATCH_FLOATING_TYPES( -+ key.scalar_type(), "call_reshape_and_cache_ipexllm_kernel_fp8", [&] { -+ call_reshape_and_cache_ipexllm_kernel_fp8( -+ key.data_ptr(), value.data_ptr(), -+ key_cache.data_ptr(), value_cache.data_ptr(), -+ slot_mapping.data_ptr(), num_tokens, key_stride, -+ value_stride, key_head_stride, value_head_stride, num_heads, -+ head_size, block_size, x); -+ }); -+ break; -+ default: -+ TORCH_CHECK(false, "Unsupported head_dim: ", head_size); -+} -+ // VLLM_XPU_DISPATCH_FLOATING_TYPES( -+ // key.scalar_type(), "call_reshape_and_cache_ipexllm_kernel_fp8", [&] { -+ // call_reshape_and_cache_ipexllm_kernel_fp8( -+ // key.data_ptr(), value.data_ptr(), -+ // key_cache.data_ptr(), value_cache.data_ptr(), -+ // slot_mapping.data_ptr(), num_tokens, key_stride, -+ // value_stride, key_head_stride, value_head_stride, -+ // num_heads, head_size, block_size, x); -+ // }); -+} + @classmethod + def get_input_positions( + cls, +@@ -370,6 +400,15 @@ class MRotaryEmbedding(RotaryEmbedding): + context_len=context_len, + seq_len=seq_len, + ) ++ elif hf_config.model_type in ["qwen3_vl", "qwen3_vl_moe"]: ++ return cls._qwen3vl_get_input_positions_tensor( ++ input_tokens=input_tokens, ++ hf_config=hf_config, ++ image_grid_thw=image_grid_thw, ++ video_grid_thw=video_grid_thw, ++ context_len=context_len, ++ seq_len=seq_len, ++ ) + elif 
hf_config.model_type in ["ernie4_5_moe_vl", "ernie4_5_vl"]: + return cls._ernie_get_input_positions_tensor( + input_tokens=input_tokens, +@@ -508,6 +547,98 @@ class MRotaryEmbedding(RotaryEmbedding): + len(input_tokens)).item() + return llm_positions, mrope_position_delta + ++ @classmethod ++ def _qwen3vl_get_input_positions_tensor( ++ cls, ++ input_tokens: list[int], ++ hf_config: PretrainedConfig, ++ image_grid_thw: Union[list[list[int]], torch.Tensor], ++ video_grid_thw: Union[list[list[int]], torch.Tensor], ++ context_len: int = 0, ++ seq_len: Optional[int] = None, ++ ) -> tuple[torch.Tensor, int]: ++ """Get mrope input positions and delta value.""" ++ ++ video_grid_thw = [[1, h, w] for t, h, w in video_grid_thw ++ for _ in range(t)] ++ ++ image_token_id = hf_config.image_token_id ++ video_token_id = hf_config.video_token_id ++ vision_start_token_id = hf_config.vision_start_token_id ++ spatial_merge_size = hf_config.vision_config.spatial_merge_size ++ ++ input_tokens_tensor = torch.tensor(input_tokens) ++ vision_start_indices = torch.argwhere( ++ input_tokens_tensor == vision_start_token_id).squeeze(1) ++ vision_tokens = input_tokens_tensor[vision_start_indices + 1] ++ image_nums = (vision_tokens == image_token_id).sum() ++ video_nums = (vision_tokens == video_token_id).sum() ++ llm_pos_ids_list: list = [] ++ ++ st = 0 ++ remain_images, remain_videos = image_nums, video_nums ++ ++ image_index, video_index = 0, 0 ++ for _ in range(image_nums + video_nums): ++ if image_token_id in input_tokens and remain_images > 0: ++ ed_image = input_tokens.index(image_token_id, st) ++ else: ++ ed_image = len(input_tokens) + 1 ++ if video_token_id in input_tokens and remain_videos > 0: ++ ed_video = input_tokens.index(video_token_id, st) ++ else: ++ ed_video = len(input_tokens) + 1 ++ if ed_image < ed_video: ++ t, h, w = ( ++ image_grid_thw[image_index][0], ++ image_grid_thw[image_index][1], ++ image_grid_thw[image_index][2], ++ ) ++ image_index += 1 ++ remain_images -= 1 ++ ed = ed_image ++ else: ++ t, h, w = ( ++ video_grid_thw[video_index][0], ++ video_grid_thw[video_index][1], ++ video_grid_thw[video_index][2], ++ ) ++ video_index += 1 ++ remain_videos -= 1 ++ ed = ed_video ++ ++ llm_grid_t, llm_grid_h, llm_grid_w = \ ++ t, h // spatial_merge_size, w // spatial_merge_size ++ text_len = ed - st ++ ++ st_idx = llm_pos_ids_list[-1].max() + 1 if len( ++ llm_pos_ids_list) > 0 else 0 ++ llm_pos_ids_list.append( ++ torch.arange(text_len).view(1, -1).expand(3, -1) + st_idx) ++ ++ t_index = torch.arange(llm_grid_t).view(-1, 1).expand( ++ -1, llm_grid_h * llm_grid_w).flatten() ++ h_index = torch.arange(llm_grid_h).view(1, -1, 1).expand( ++ llm_grid_t, -1, llm_grid_w).flatten() ++ w_index = torch.arange(llm_grid_w).view(1, 1, -1).expand( ++ llm_grid_t, llm_grid_h, -1).flatten() ++ llm_pos_ids_list.append( ++ torch.stack([t_index, h_index, w_index]) + text_len + st_idx) ++ st = ed + llm_grid_t * llm_grid_h * llm_grid_w ++ ++ if st < len(input_tokens): ++ st_idx = llm_pos_ids_list[-1].max() + 1 if len( ++ llm_pos_ids_list) > 0 else 0 ++ text_len = len(input_tokens) - st ++ llm_pos_ids_list.append( ++ torch.arange(text_len).view(1, -1).expand(3, -1) + st_idx) ++ ++ llm_positions = torch.cat(llm_pos_ids_list, dim=1).reshape(3, -1) ++ mrope_position_delta = (llm_positions.max() + 1 - ++ len(input_tokens)).item() ++ llm_positions = llm_positions[:, context_len:seq_len] ++ return llm_positions, mrope_position_delta + + @classmethod + def _ernie_get_input_positions_tensor( + cls, +@@ -715,15 +846,23 @@ class 
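
The qwen3_vl hunk above builds a 3-row position tensor (temporal, height, width): text tokens advance all three rows together, while each vision segment enumerates its merged patch grid before the text index resumes. A minimal sketch of that indexing pattern with made-up sizes (2 text tokens followed by a single 1x4x4 image at spatial_merge_size=2, i.e. a 1x2x2 merged grid):

# Illustrative only; sizes below are invented, not taken from a real model.
import torch

text_len, st_idx = 2, 0
llm_grid_t, llm_grid_h, llm_grid_w = 1, 2, 2

pos = [torch.arange(text_len).view(1, -1).expand(3, -1) + st_idx]

t_index = torch.arange(llm_grid_t).view(-1, 1).expand(-1, llm_grid_h * llm_grid_w).flatten()
h_index = torch.arange(llm_grid_h).view(1, -1, 1).expand(llm_grid_t, -1, llm_grid_w).flatten()
w_index = torch.arange(llm_grid_w).view(1, 1, -1).expand(llm_grid_t, llm_grid_h, -1).flatten()
pos.append(torch.stack([t_index, h_index, w_index]) + text_len + st_idx)

print(torch.cat(pos, dim=1))
# tensor([[0, 1, 2, 2, 2, 2],
#         [0, 1, 2, 2, 3, 3],
#         [0, 1, 2, 3, 2, 3]])
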
MRotaryEmbedding(RotaryEmbedding): + st_idx = llm_pos_ids_list[-1].max() + 1 if len( + llm_pos_ids_list) > 0 else 0 + llm_pos_ids_list.append( +- torch.arange(text_len).view(1, -1).expand(3, -1) + st_idx) +- +- t_index = (torch.arange(llm_grid_t).view(-1, 1).expand( +- -1, llm_grid_h * llm_grid_w)).long().flatten() +- +- h_index = torch.arange(llm_grid_h).view(1, -1, 1).expand( +- llm_grid_t, -1, llm_grid_w).flatten() +- w_index = torch.arange(llm_grid_w).view(1, 1, -1).expand( +- llm_grid_t, llm_grid_h, -1).flatten() ++ torch.arange(text_len).view(1, -1).expand(3, -1) + st_idx ++ ) ++ t_index = ( ++ torch.arange(llm_grid_t).view(-1, 1).expand(-1, llm_grid_h * llm_grid_w) ++ ).flatten() ++ h_index = ( ++ torch.arange(llm_grid_h) ++ .view(1, -1, 1) ++ .expand(llm_grid_t, -1, llm_grid_w) ++ .flatten() ++ ) ++ w_index = ( ++ torch.arange(llm_grid_w) ++ .view(1, 1, -1) ++ .expand(llm_grid_t, llm_grid_h, -1) ++ .flatten() ++ ) + llm_pos_ids_list.append( + torch.stack([t_index, h_index, w_index]) + text_len + st_idx) + st = ed + llm_grid_t * llm_grid_h * llm_grid_w +@@ -772,7 +911,6 @@ class MRotaryEmbedding(RotaryEmbedding): + + st = 0 + remain_images, remain_videos = image_nums, video_nums +- + image_index, video_index = 0, 0 + for _ in range(image_nums + video_nums): + video_second_per_grid_t = 0.0 +@@ -819,16 +957,25 @@ class MRotaryEmbedding(RotaryEmbedding): + st_idx = llm_pos_ids_list[-1].max() + 1 if len( + llm_pos_ids_list) > 0 else 0 + llm_pos_ids_list.append( +- torch.arange(text_len).view(1, -1).expand(3, -1) + st_idx) +- +- t_index = (torch.arange(llm_grid_t).view(-1, 1).expand( +- -1, llm_grid_h * llm_grid_w) * video_second_per_grid_t * +- tokens_per_second).long().flatten() +- +- h_index = torch.arange(llm_grid_h).view(1, -1, 1).expand( +- llm_grid_t, -1, llm_grid_w).flatten() +- w_index = torch.arange(llm_grid_w).view(1, 1, -1).expand( +- llm_grid_t, llm_grid_h, -1).flatten() ++ torch.arange(text_len).view(1, -1).expand(3, -1) + st_idx ++ ) ++ t_index = ( ++ torch.arange(llm_grid_t).view(-1, 1).expand(-1, llm_grid_h * llm_grid_w) ++ * video_second_per_grid_t ++ * tokens_per_second ++ ).flatten() ++ h_index = ( ++ torch.arange(llm_grid_h) ++ .view(1, -1, 1) ++ .expand(llm_grid_t, -1, llm_grid_w) ++ .flatten() ++ ) ++ w_index = ( ++ torch.arange(llm_grid_w) ++ .view(1, 1, -1) ++ .expand(llm_grid_t, llm_grid_h, -1) ++ .flatten() ++ ) + llm_pos_ids_list.append( + torch.stack([t_index, h_index, w_index]) + text_len + st_idx) + st = ed + llm_grid_t * llm_grid_h * llm_grid_w +@@ -847,6 +994,339 @@ class MRotaryEmbedding(RotaryEmbedding): + + return llm_positions, mrope_position_delta + ++ @classmethod ++ def _omni3_get_input_positions_tensor( ++ cls, ++ config, ++ input_ids: torch.Tensor, ++ image_grid_thw: torch.Tensor, ++ video_grid_thw: torch.Tensor, ++ use_audio_in_video: bool = False, ++ audio_seqlens: Optional[torch.Tensor] = None, ++ second_per_grids: Optional[torch.Tensor] = None, ++ ) -> tuple[torch.Tensor, torch.Tensor]: ++ def _get_feat_extract_output_lengths(input_lengths: torch.LongTensor): ++ input_lengths_leave = input_lengths % 100 ++ feat_lengths = (input_lengths_leave - 1) // 2 + 1 ++ output_lengths = ( ++ ((feat_lengths - 1) // 2 + 1 - 1) // 2 + 1 + (input_lengths // 100) * 13 ++ ) ++ return output_lengths + ++ if input_ids is None or input_ids.ndim != 1: ++ raise ValueError("_omni3_get_input_positions_tensor expects 1D input_ids") + -diff --git a/csrc/xpu/common.h b/csrc/xpu/common.h -new file mode 100644 -index 000000000..17d6ef643 ---- /dev/null -+++ 
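
The reformatted hunks above keep the same temporal scaling for video grids but drop the early .long() cast, so the scaled index stays floating point until it is consumed. A rough illustration with made-up numbers (4 temporal grid steps, 0.5 s per step, 25 position ids per second):

# Values are invented for illustration.
import torch

grid_t, second_per_grid_t, tokens_per_second = 4, 0.5, 25
t_index = torch.arange(grid_t) * second_per_grid_t * tokens_per_second
print(t_index)  # tensor([ 0.0000, 12.5000, 25.0000, 37.5000])
# The result is now float (the .long() cast was removed in the hunk), so any
# rounding happens only where the positions are actually consumed.
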
b/csrc/xpu/common.h -@@ -0,0 +1,312 @@ -+#pragma once -+ -+#include -+#include -+ -+typedef union half_t { -+ uint16_t u; -+ sycl::half f; -+} __half_t; -+ -+typedef union ufloat32 { -+ unsigned u; -+ float f; -+} __float_t; -+ -+#define QK4_0 64 -+#define QR4_0 2 -+#define QK4_1 64 -+#define QR4_1 2 -+#define QK5_0 64 -+#define QR5_0 2 -+#define QK5_1 64 -+#define QR5_1 2 -+#define QK8_0 64 -+#define QR8_0 1 -+#define QK8_1 32 -+#define QR8_1 1 -+#define QI8_1 (QK8_1 / (4 * QR8_1)) // 8 -+#define QKFP8 64 -+#define QRFP8 1 -+#define QKFP6 64 -+// for iq2 quantization -+#define WARP_SIZE 32 -+#define QK_K 256 -+#define QK4_K 32 -+#define QR4_K 2 -+#define QK6_K 16 -+#define QKFP6_K 16 -+#define QR2_XXS 8 -+#define QI2_XXS (QK_K / (4*QR2_XXS)) // 8 -+#define QR2_XS 8 -+#define QI2_XS (QK_K / (4*QR2_XS)) // 8 -+#define QR2_K 4 -+#define QI2_K (QK_K / (4*QR2_K)) // 16 -+#define QR1_S 8 -+#define QI1_S (QK_K / (4*QR1_S)) // 8 -+ -+typedef struct { -+ sycl::half d; // delta -+ uint8_t qs[QK4_0 / 2]; // nibbles / quants -+} block_q4_0; -+ -+typedef struct { -+ uint8_t qs[QK4_0 / 2]; // nibbles / quants -+} block_q4_0_qs; -+ -+typedef struct { -+ uint8_t qs[QK4_1 / 2]; // nibbles / quants -+} block_q4_1_qs; -+ -+typedef struct { -+ sycl::half d; // delta -+ sycl::half m; // min -+ uint8_t qs[QK4_1 / 2]; // nibbles / quants -+} block_q4_1; -+ -+typedef struct { -+ sycl::half d; -+ uint8_t qh[8]; -+ uint8_t qs[QK5_0 / 2]; -+} block_q5_0; -+ -+typedef struct { -+ sycl::half d; // delta -+ sycl::half m; // min -+ uint8_t qh[8]; // 5-th bit of quants -+ uint8_t qs[QK5_1 / 2]; // nibbles / quants -+} block_q5_1; -+ -+typedef struct { -+ sycl::half d; // delta -+ uint8_t qh[8]; // 3-th bit of quants -+ uint8_t qs[QK4_0 / 4]; // nibbles / quants -+} block_nf3; -+ -+typedef struct { -+ uint8_t qh[8]; // 3-th bit of quants -+ uint8_t qs[QK4_0 / 4]; // nibbles / quants -+} block_nf3_qs; -+ -+typedef struct { -+ float d; // delta -+ int8_t qs[QK8_0]; // quants -+} block_q8_0; -+ -+typedef struct { -+ int8_t qs[QK8_0]; // quants -+} block_q8_0_qs; -+ -+typedef struct { -+ sycl::half d; -+ sycl::half sum; -+ int8_t qs[QK8_1]; // quants -+} block_q8_1; -+ -+typedef struct { -+ uint8_t qs[QKFP8]; -+} block_fp8_qs; -+ -+typedef struct { -+ float d; -+ uint8_t qs[QKFP8]; -+} block_fp8; -+ -+typedef struct { -+ sycl::half d; -+ uint16_t qs[QK_K/8]; // 32 -+} block_iq2_xxs; -+ -+typedef struct { -+ sycl::half d; -+ uint16_t qs[QK_K/8]; // 32 -+ uint8_t scales[QK_K/32]; // 8 -+} block_iq2_xs; -+ -+typedef struct { -+ uint8_t scales[QK_K/16]; // scales and mins, quantized with 4 bits -+ uint8_t qs[QK_K/4]; // quants -+ sycl::half d; // super-block scale for quantized scales -+ sycl::half min; // super-block min for quantized mins -+} block_q2_K; -+ -+typedef struct { -+ sycl::half d; // super-block scale for quantized scales -+ sycl::half dmin; // super-block scale for quantized mins -+ uint8_t scales[16]; // scales and mins, quantized with 8 bits -+ uint8_t qs[QK_K/2]; // 4--bit quants -+} block_q4_K; -+ -+typedef struct { -+ uint8_t qs[QK_K/2]; // 4-bit quants -+} block_q4_K_qs; -+ -+typedef struct { -+ uint8_t qs[QK4_K/2]; // 4-bit quants -+} block_q4_K_qs_block; -+ -+typedef struct { -+ uint8_t scales[16]; // scales and mins, quantized with 8 bits -+} block_q4_K_scales; -+ -+typedef struct { -+ sycl::half d; // super-block scale for quantized scales -+ sycl::half dmin; // super-block scale for quantized mins -+ uint8_t scales[12]; // scales and mins, quantized with 6 bits -+ uint8_t qh[QK_K/8]; // quants, high bit 
-+ uint8_t qs[QK_K/2]; // quants, low 4 bits -+} block_q5_K; -+ -+typedef struct { -+ uint8_t ql[QK_K/2]; // quants, lower 4 bits -+ uint8_t qh[QK_K/4]; // quants, upper 2 bits -+ int8_t scales[QK_K/16]; // scales -+ sycl::half d; // delta -+} block_q6_K; -+ -+typedef struct { -+ uint32_t qh[QK_K/16]; // quants, upper 2 bits -+} block_q6_K_qh; -+ -+typedef struct { -+ uint32_t ql[QK_K/8]; // quants, lower 4 bits -+} block_q6_K_ql; -+ -+typedef struct { -+ int8_t scales[QK_K/16]; // scales, quantized with 8 bits -+} block_q6_K_scales; -+ -+typedef struct { -+ uint8_t ql[QK_K/2]; // quants, lower 4 bits -+ uint8_t qh[QK_K/4]; // quants, upper 2 bits -+ int8_t scales[QK_K/16]; // scales, quantized with 8 bits -+ sycl::half d; // super-block scale -+} block_fp6_K; -+static_assert(sizeof(block_fp6_K) == sizeof(sycl::half) + QK_K / 16 + 3*QK_K/4, "wrong fp6_K block size/padding"); -+ -+typedef struct { -+ uint32_t ql[QK_K/8]; // quants, lower 4 bits -+} block_fp6_k_ql; -+ -+typedef struct { -+ uint32_t qh[QK_K/16]; // quants, upper 2 bits -+} block_fp6_k_qh; -+ -+typedef struct { -+ int8_t scales[QK_K/16]; // scales, quantized with 8 bits, 16 -+} block_fp6_k_scales; -+ -+typedef struct { -+ uint32_t ql[QKFP6_K/8]; // upper 2 bits, 2 -+} block_base_fp6_k_ql; -+ -+typedef struct { -+ uint32_t qh[QKFP6_K/16]; // upper 2 bits, 1 -+} block_base_fp6_k_qh; -+ -+#define NGRID_IQ1S 2048 -+#define IQ1S_DELTA 0.125f -+#define IQ1M_DELTA 0.125f -+ -+typedef struct { -+ sycl::half d; -+ uint8_t qs[QK_K/8]; -+ uint16_t qh[QK_K/32]; -+} block_iq1_s; -+ -+// 1.8125 bpw -+typedef struct { -+ uint8_t qs[QK_K/8]; // grid index, low 8 bits -+ uint8_t qh[QK_K/16]; // grid index, high 3 bits + grid shift bit (for two groups of 8) -+ uint8_t scales[QK_K/32]; // 4-bit block scales -+} block_iq1_m; -+ -+typedef struct { -+ uint8_t ql[QKFP6/2]; // lower 4 bits, 32 -+ uint8_t qh[QKFP6/4]; // upper 2 bits, 16 -+ sycl::half d; // delta -+} block_fp6; -+ -+typedef struct { -+ uint32_t qh[QKFP6/16]; // upper 2 bits, 4 -+} block_fp6_32_qh; -+ -+typedef struct { -+ uint32_t ql[QKFP6/8]; // lower 4 bits, 8 -+} block_fp6_32_ql; -+ -+enum ggml_type { -+ GGML_TYPE_Q4_0 = 2, -+ GGML_TYPE_Q4_1 = 3, -+ GGML_TYPE_Q5_0 = 6, -+ GGML_TYPE_Q5_1 = 7, -+ GGML_TYPE_Q8_0 = 8, -+ GGML_TYPE_Q8_1 = 9, -+ GGML_TYPE_NF4 = 10, -+ GGML_TYPE_NF3 = 11, -+ GGML_TYPE_FP8E4 = 15, -+ GGML_TYPE_FP4 = 16, -+ GGML_TYPE_FP8E5 = 19, -+ GGML_TYPE_IQ2_XXS = 21, -+ GGML_TYPE_IQ2_XS = 22, -+ GGML_TYPE_Q2_K = 23, -+ GGML_TYPE_IQ1_S = 24, -+ GGML_TYPE_IQ1_M = 25, -+ GGML_TYPE_Q6_K = 26, -+ GGML_TYPE_Q4_K = 27, -+ GGML_TYPE_Q5_K = 28, -+ GGML_TYPE_FP6 = 29, -+ GGML_TYPE_FP6_K = 30, -+ GGML_TYPE_Q4_0_WOQ = 34, -+ GGML_TYPE_COUNT -+}; -+ -+static const int GGML_BLCK_SIZE[GGML_TYPE_COUNT] = { -+ [GGML_TYPE_Q4_0] = QK4_0, -+ [GGML_TYPE_Q4_1] = QK4_1, -+ [GGML_TYPE_Q5_0] = QK5_0, -+ [GGML_TYPE_Q5_1] = QK5_1, -+ [GGML_TYPE_NF4] = QK4_0, -+ [GGML_TYPE_NF3] = QK4_0, -+ [GGML_TYPE_Q8_0] = QK8_0, -+ [GGML_TYPE_Q8_1] = QK8_1, -+ [GGML_TYPE_FP8E4] = QKFP8, -+ [GGML_TYPE_FP4] = QK4_0, -+ [GGML_TYPE_FP6] = QKFP6, -+ [GGML_TYPE_FP8E5] = QKFP8, -+ [GGML_TYPE_IQ2_XXS] = QK_K, -+ [GGML_TYPE_IQ2_XS] = QK_K, -+ [GGML_TYPE_Q2_K] = QK_K, -+ [GGML_TYPE_IQ1_S] = QK_K, -+ [GGML_TYPE_IQ1_M] = QK_K, -+ [GGML_TYPE_Q6_K] = QK_K, -+ [GGML_TYPE_Q4_K] = QK_K, -+ [GGML_TYPE_Q5_K] = QK_K, -+ [GGML_TYPE_FP6_K] = QK_K, -+ [GGML_TYPE_Q4_0_WOQ] = QK4_0, -+}; -+ -+static const size_t GGML_TYPE_SIZE[GGML_TYPE_COUNT] = { -+ [GGML_TYPE_Q4_0] = sizeof(block_q4_0), -+ [GGML_TYPE_Q4_1] = sizeof(block_q4_1), -+ 
[GGML_TYPE_Q5_0] = sizeof(block_q5_1), -+ [GGML_TYPE_Q5_1] = sizeof(block_q5_1), -+ [GGML_TYPE_NF4] = sizeof(block_q4_0), -+ [GGML_TYPE_NF3] = sizeof(block_nf3), -+ [GGML_TYPE_Q8_0] = sizeof(block_q8_0), -+ [GGML_TYPE_Q8_1] = sizeof(block_q8_1), -+ [GGML_TYPE_FP8E4]= sizeof(block_fp8), -+ [GGML_TYPE_FP4] = sizeof(block_q4_0), -+ [GGML_TYPE_FP6] = sizeof(block_fp6), -+ [GGML_TYPE_FP8E5] = sizeof(block_fp8), -+ [GGML_TYPE_IQ2_XXS] = sizeof(block_iq2_xxs), -+ [GGML_TYPE_IQ2_XS] = sizeof(block_iq2_xs), -+ [GGML_TYPE_Q2_K] = sizeof(block_q2_K), -+ [GGML_TYPE_IQ1_S] = sizeof(block_iq1_s), -+ [GGML_TYPE_IQ1_M] = sizeof(block_iq1_m), -+ [GGML_TYPE_Q6_K] = sizeof(block_q6_K), -+ [GGML_TYPE_Q4_K] = sizeof(block_q4_K), -+ [GGML_TYPE_Q5_K] = sizeof(block_q5_K), -+ [GGML_TYPE_FP6_K] = sizeof(block_fp6_K), -+ [GGML_TYPE_Q4_0_WOQ] = sizeof(block_q4_0), -+}; -diff --git a/csrc/xpu/dequantize.h b/csrc/xpu/dequantize.h -new file mode 100644 -index 000000000..9a967312e ---- /dev/null -+++ b/csrc/xpu/dequantize.h -@@ -0,0 +1,74 @@ -+#include -+#include -+#include "utils.h" -+/* -+Adapted from https://github.com/mit-han-lab/llm-awq -+Modified from NVIDIA FasterTransformer: -+https://github.com/NVIDIA/FasterTransformer/blob/main/src/fastertransformer/cutlass_extensions/include/cutlass_extensions/interleaved_numeric_conversion.h -+@article{lin2023awq, -+ title={AWQ: Activation-aware Weight Quantization for LLM Compression and -+Acceleration}, author={Lin, Ji and Tang, Jiaming and Tang, Haotian and Yang, -+Shang and Dang, Xingyu and Han, Song}, journal={arXiv}, year={2023} -+} -+*/ -+ -+#pragma once -+ -+namespace vllm { -+namespace awq { -+ -+sycl::uint4 dequantize_s4_to_fp16x2(uint32_t const& source) { -+ sycl::uint4 result; -+ -+ uint32_t* h = reinterpret_cast(&result); -+ uint32_t const i4s = reinterpret_cast(source); -+ -+ // First, we extract the i4s and construct an intermediate fp16 number. -+ static constexpr uint32_t immLut = (0xf0 & 0xcc) | 0xaa; -+ static constexpr uint32_t BOTTOM_MASK = 0x000f000f; -+ static constexpr uint32_t TOP_MASK = 0x00f000f0; -+ static constexpr uint32_t I4s_TO_F16s_MAGIC_NUM = 0x64006400; -+ -+ // Note that the entire sequence only requires 1 shift instruction. This is -+ // thanks to the register packing format and the fact that we force our -+ // integers to be unsigned, and account for this in the fp16 subtractions. In -+ // addition, I exploit the fact that sub and fma have the same throughput in -+ // order to convert elt_23 and elt_67 to fp16 without having to shift them to -+ // the bottom bits before hand. -+ -+ // Shift right by 8 to now consider elt_45 and elt_67. Issue first to hide RAW -+ // dependency if we issue immediately before required. -+ const uint32_t top_i4s = i4s >> 8; -+ h[0] = (i4s & BOTTOM_MASK) | I4s_TO_F16s_MAGIC_NUM; -+ h[1] = (i4s & TOP_MASK) | I4s_TO_F16s_MAGIC_NUM; -+ h[2] = (top_i4s & BOTTOM_MASK) | I4s_TO_F16s_MAGIC_NUM; -+ h[3] = (top_i4s & TOP_MASK) | I4s_TO_F16s_MAGIC_NUM; -+ -+ // This is the half2 {1032, 1032} represented as an integer. -+ // static constexpr uint32_t FP16_TOP_MAGIC_NUM = 0x64086408; -+ // Haotian: subtract {1024, 1024} instead, we do not need to map to [-8, 7] -+ static constexpr uint32_t FP16_TOP_MAGIC_NUM = 0x64006400; -+ // This is the half2 {1 / 16, 1 / 16} represented as an integer. -+ static constexpr uint32_t ONE_SIXTEENTH = 0x2c002c00; -+ // This is the half2 {-72, -72} represented as an integer. -+ // static constexpr uint32_t NEG_72 = 0xd480d480; -+ // Haotian: Let's use {-64, -64}. 
-+ static constexpr uint32_t NEG_64 = 0xd400d400; -+ *(sycl::half2*)(&h[0]) = sycl_half_sub2( -+ *(sycl::half2*)(&h[0]), *(sycl::half2*)(&FP16_TOP_MAGIC_NUM)); -+ *(sycl::half2*)(&h[1]) = sycl_half_fma2( -+ *(sycl::half2*)(&h[1]), -+ *(sycl::half2*)(&ONE_SIXTEENTH), -+ *(sycl::half2*)(&NEG_64)); -+ *(sycl::half2*)(&h[2]) = sycl_half_sub2( -+ *(sycl::half2*)(&h[2]), *(sycl::half2*)(&FP16_TOP_MAGIC_NUM)); -+ *(sycl::half2*)(&h[3]) = sycl_half_fma2( -+ *(sycl::half2*)(&h[3]), -+ *(sycl::half2*)(&ONE_SIXTEENTH), -+ *(sycl::half2*)(&NEG_64)); -+ -+ return result; -+} ++ seq_len = input_ids.shape[0] ++ device = input_ids.device ++ dtype = input_ids.dtype + -+} // namespace awq -+} // namespace vllm -\ No newline at end of file -diff --git a/csrc/xpu/dtype_float16.h b/csrc/xpu/dtype_float16.h -new file mode 100644 -index 000000000..1b9c1f248 ---- /dev/null -+++ b/csrc/xpu/dtype_float16.h -@@ -0,0 +1,458 @@ -+/* -+ * Adapted from -+ * https://github.com/NVIDIA/FasterTransformer/blob/release/v5.3_tag/src/fastertransformer/kernels/decoder_masked_multihead_attention/decoder_masked_multihead_attention_template.hpp -+ * and -+ * https://github.com/NVIDIA/FasterTransformer/blob/release/v5.3_tag/src/fastertransformer/kernels/decoder_masked_multihead_attention_utils.h -+ * Copyright (c) 2023, The vLLM team. -+ * Copyright (c) 2020-2023, NVIDIA CORPORATION. All rights reserved. -+ * -+ * Licensed under the Apache License, Version 2.0 (the "License"); -+ * you may not use this file except in compliance with the License. -+ * You may obtain a copy of the License at -+ * -+ * http://www.apache.org/licenses/LICENSE-2.0 -+ * -+ * Unless required by applicable law or agreed to in writing, software -+ * distributed under the License is distributed on an "AS IS" BASIS, -+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -+ * See the License for the specific language governing permissions and -+ * limitations under the License. -+ */ -+#pragma once -+ -+#include -+#include -+#include "attention_generic.h" -+#include "dtype_float32.h" -+#include "utils.h" -+ -+#include -+ -+namespace vllm { -+ -+// FP16 vector types for Q, K, V. -+template <> -+struct Vec { -+ using Type = sycl::half; -+}; -+template <> -+struct Vec { -+ using Type = sycl::half2; -+}; -+template <> -+struct Vec { -+ using Type = sycl::half4; -+}; -+template <> -+struct Vec { -+ using Type = sycl::half8; -+}; -+ -+template <> -+struct FloatVec { -+ using Type = float; -+}; -+template <> -+struct FloatVec { -+ using Type = sycl::float2; -+}; -+ -+template <> -+struct FloatVec { -+ using Type = Float4_; -+}; -+template <> -+struct FloatVec { -+ using Type = Float8_; -+}; -+ -+// Utility functions for type conversions. 
-+inline sycl::half2 h0_h0(sycl::half a) { -+ return sycl::half2{a, a}; -+} ++ if image_grid_thw is not None: ++ image_grid_thw = image_grid_thw.to(device=device, dtype=torch.long) ++ if video_grid_thw is not None: ++ video_grid_thw = video_grid_thw.to(device=device, dtype=torch.long) + -+inline float half_to_float(sycl::half h) { -+ return float(h); -+} ++ if second_per_grids is None: ++ if video_grid_thw is not None and video_grid_thw.numel() > 0: ++ second_per_grids = torch.ones( ++ video_grid_thw.shape[0], dtype=torch.float32, device=device ++ ) ++ else: ++ second_per_grids = torch.tensor([], dtype=torch.float32, device=device) ++ else: ++ second_per_grids = second_per_grids.to(device=device, dtype=torch.float32) ++ ++ if audio_seqlens is not None: ++ audio_seqlens = audio_seqlens.to(device=device, dtype=torch.long) ++ ++ spatial_merge_size = config.vision_config.spatial_merge_size ++ image_token_id = config.image_token_id ++ video_token_id = config.video_token_id ++ audio_token_id = config.audio_token_id ++ vision_start_token_id = config.vision_start_token_id ++ audio_start_token_id = config.audio_start_token_id ++ position_id_per_seconds = config.position_id_per_seconds ++ ++ vision_start_indices = torch.argwhere( ++ input_ids == vision_start_token_id ++ ).squeeze(1) ++ if vision_start_indices.numel() > 0: ++ vision_tokens = input_ids[vision_start_indices + 1] ++ else: ++ vision_tokens = input_ids.new_empty((0,), dtype=input_ids.dtype) ++ audio_nums = torch.sum(input_ids == audio_start_token_id) ++ image_nums = (vision_tokens == image_token_id).sum() ++ video_nums = ( ++ (vision_tokens == audio_start_token_id).sum() ++ if use_audio_in_video ++ else (vision_tokens == video_token_id).sum() ++ ) + -+inline sycl::float2 half2_to_float2(sycl::half2 v) { ++ input_tokens = input_ids.tolist() ++ llm_pos_ids_list: list[torch.Tensor] = [] ++ st = 0 ++ image_idx = 0 ++ video_idx = 0 ++ audio_idx = 0 ++ remain_images, remain_videos, remain_audios = image_nums, video_nums, audio_nums # noqa: E501 ++ multimodal_nums = ( ++ image_nums + audio_nums ++ if use_audio_in_video ++ else image_nums + video_nums + audio_nums ++ ) # noqa: E501 ++ ++ for _ in range(multimodal_nums): ++ st_idx = llm_pos_ids_list[-1].max() + 1 if llm_pos_ids_list else 0 ++ if (image_token_id in input_tokens or video_token_id in input_tokens) and ( ++ remain_videos > 0 or remain_images > 0 ++ ): ++ ed_vision_start = input_tokens.index(vision_start_token_id, st) ++ else: ++ ed_vision_start = len(input_tokens) + 1 ++ if audio_token_id in input_tokens and remain_audios > 0: ++ ed_audio_start = input_tokens.index(audio_start_token_id, st) ++ else: ++ ed_audio_start = len(input_tokens) + 1 ++ min_ed = min(ed_vision_start, ed_audio_start) ++ ++ if min_ed == ed_audio_start: ++ text_len = min_ed - st ++ if text_len != 0: ++ st_idx = llm_pos_ids_list[-1].max() + 1 if llm_pos_ids_list else 0 ++ llm_pos_ids_list.append( ++ torch.arange(text_len, device=device, dtype=torch.long) ++ .view(1, -1) ++ .expand(3, -1) ++ + st_idx ++ ) ++ st_idx = llm_pos_ids_list[-1].max() + 1 if llm_pos_ids_list else 0 ++ bos_len = 1 ++ llm_pos_ids_list.append( ++ torch.arange(bos_len, device=device, dtype=torch.long) ++ .view(1, -1) ++ .expand(3, -1) ++ + st_idx ++ ) ++ st_idx = llm_pos_ids_list[-1].max() + 1 if llm_pos_ids_list else 0 ++ audio_len = _get_feat_extract_output_lengths(audio_seqlens[audio_idx]) ++ llm_pos_ids = ( ++ torch.arange(audio_len, device=device, dtype=torch.long) ++ .view(1, -1) ++ .expand(3, -1) ++ + st_idx ++ ) ++ 
llm_pos_ids_list.append(llm_pos_ids) ++ st_idx = llm_pos_ids_list[-1].max() + 1 if llm_pos_ids_list else 0 ++ eos_len = 1 ++ llm_pos_ids_list.append( ++ torch.arange(eos_len, device=device, dtype=torch.long) ++ .view(1, -1) ++ .expand(3, -1) ++ + st_idx ++ ) ++ st += text_len + bos_len + audio_len + eos_len ++ audio_idx += 1 ++ remain_audios -= 1 ++ elif ( ++ min_ed == ed_vision_start ++ and input_ids[ed_vision_start + 1] == image_token_id ++ ): ++ text_len = min_ed - st ++ if text_len != 0: ++ st_idx = llm_pos_ids_list[-1].max() + 1 if llm_pos_ids_list else 0 ++ llm_pos_ids_list.append( ++ torch.arange(text_len, device=device, dtype=torch.long) ++ .view(1, -1) ++ .expand(3, -1) ++ + st_idx ++ ) ++ st_idx = llm_pos_ids_list[-1].max() + 1 if llm_pos_ids_list else 0 ++ bos_len = 1 ++ llm_pos_ids_list.append( ++ torch.arange(bos_len, device=device, dtype=torch.long) ++ .view(1, -1) ++ .expand(3, -1) ++ + st_idx ++ ) ++ st_idx = llm_pos_ids_list[-1].max() + 1 if llm_pos_ids_list else 0 ++ grid_t = image_grid_thw[image_idx][0] ++ grid_hs = image_grid_thw[:, 1] ++ grid_ws = image_grid_thw[:, 2] ++ t_index = torch.arange(grid_t, device=device) * position_id_per_seconds ++ llm_pos_ids = cls._get_llm_pos_ids_for_vision( ++ st_idx, image_idx, spatial_merge_size, t_index, grid_hs, grid_ws ++ ) ++ image_len = image_grid_thw[image_idx].prod() // (spatial_merge_size**2) ++ llm_pos_ids_list.append(llm_pos_ids) ++ st_idx = llm_pos_ids_list[-1].max() + 1 if llm_pos_ids_list else 0 ++ eos_len = 1 ++ llm_pos_ids_list.append( ++ torch.arange(eos_len, device=device, dtype=torch.long) ++ .view(1, -1) ++ .expand(3, -1) ++ + st_idx ++ ) ++ st += text_len + bos_len + image_len + eos_len ++ image_idx += 1 ++ remain_images -= 1 ++ elif ( ++ min_ed == ed_vision_start ++ and input_ids[ed_vision_start + 1] == video_token_id ++ and not use_audio_in_video ++ ): ++ text_len = min_ed - st ++ if text_len != 0: ++ st_idx = llm_pos_ids_list[-1].max() + 1 if llm_pos_ids_list else 0 ++ llm_pos_ids_list.append( ++ torch.arange(text_len, device=device, dtype=torch.long) ++ .view(1, -1) ++ .expand(3, -1) ++ + st_idx ++ ) ++ st_idx = llm_pos_ids_list[-1].max() + 1 if llm_pos_ids_list else 0 ++ bos_len = 1 ++ llm_pos_ids_list.append( ++ torch.arange(bos_len, device=device, dtype=torch.long) ++ .view(1, -1) ++ .expand(3, -1) ++ + st_idx ++ ) ++ st_idx = llm_pos_ids_list[-1].max() + 1 if llm_pos_ids_list else 0 ++ grid_t = video_grid_thw[video_idx][0] ++ grid_hs = video_grid_thw[:, 1] ++ grid_ws = video_grid_thw[:, 2] ++ t_index = ( ++ torch.arange(grid_t, device=device) ++ * float(second_per_grids[video_idx].item()) ++ * position_id_per_seconds ++ ) ++ llm_pos_ids = cls._get_llm_pos_ids_for_vision( ++ st_idx, video_idx, spatial_merge_size, t_index, grid_hs, grid_ws ++ ) ++ video_len = video_grid_thw[video_idx].prod() // (spatial_merge_size**2) ++ llm_pos_ids_list.append(llm_pos_ids) ++ st_idx = llm_pos_ids_list[-1].max() + 1 if llm_pos_ids_list else 0 ++ eos_len = 1 ++ llm_pos_ids_list.append( ++ torch.arange(eos_len, device=device, dtype=torch.long) ++ .view(1, -1) ++ .expand(3, -1) ++ + st_idx ++ ) ++ st += text_len + bos_len + video_len + eos_len ++ video_idx += 1 ++ remain_videos -= 1 ++ elif ( ++ min_ed == ed_vision_start ++ and ed_vision_start + 1 == ed_audio_start ++ and use_audio_in_video ++ ): ++ text_len = min_ed - st ++ if text_len != 0: ++ st_idx = llm_pos_ids_list[-1].max() + 1 if llm_pos_ids_list else 0 ++ llm_pos_ids_list.append( ++ torch.arange(text_len, device=device, dtype=torch.long) ++ .view(1, -1) ++ .expand(3, 
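
For the audio branch above, the number of reserved position slots comes from the nested _get_feat_extract_output_lengths helper. A worked example of that formula with arbitrary feature lengths (80 and 250):

# Re-statement of the nested helper for a quick numeric check; inputs are arbitrary.
import torch

def get_feat_extract_output_lengths(input_lengths: torch.Tensor) -> torch.Tensor:
    input_lengths_leave = input_lengths % 100
    feat_lengths = (input_lengths_leave - 1) // 2 + 1
    return ((feat_lengths - 1) // 2 + 1 - 1) // 2 + 1 + (input_lengths // 100) * 13

print(get_feat_extract_output_lengths(torch.tensor([80, 250])))
# tensor([10, 33]) -> audio position slots reserved per clip
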
-1) ++ + st_idx ++ ) ++ st_idx = llm_pos_ids_list[-1].max() + 1 if llm_pos_ids_list else 0 ++ bos_len = 1 ++ bos_block = ( ++ torch.arange(bos_len, device=device, dtype=torch.long) ++ .view(1, -1) ++ .expand(3, -1) ++ + st_idx ++ ) ++ llm_pos_ids_list.append(bos_block) ++ llm_pos_ids_list.append(bos_block) ++ st_idx = llm_pos_ids_list[-1].max() + 1 if llm_pos_ids_list else 0 ++ audio_len = _get_feat_extract_output_lengths(audio_seqlens[audio_idx]) ++ audio_llm_pos_ids = ( ++ torch.arange(audio_len, device=device, dtype=torch.long) ++ .view(1, -1) ++ .expand(3, -1) ++ + st_idx ++ ) ++ grid_t = video_grid_thw[video_idx][0] ++ grid_hs = video_grid_thw[:, 1] ++ grid_ws = video_grid_thw[:, 2] ++ t_index = ( ++ torch.arange(grid_t, device=device) ++ * float(second_per_grids[video_idx].item()) ++ * position_id_per_seconds ++ ) ++ video_llm_pos_ids = cls._get_llm_pos_ids_for_vision( ++ st_idx, video_idx, spatial_merge_size, t_index, grid_hs, grid_ws ++ ) ++ video_data_index, audio_data_index = 0, 0 ++ while ( ++ video_data_index < video_llm_pos_ids.shape[-1] ++ and audio_data_index < audio_llm_pos_ids.shape[-1] ++ ): ++ if ( ++ video_llm_pos_ids[0][video_data_index] ++ <= audio_llm_pos_ids[0][audio_data_index] ++ ): ++ llm_pos_ids_list.append( ++ video_llm_pos_ids[ ++ :, video_data_index : video_data_index + 1 ++ ] ++ ) ++ video_data_index += 1 ++ else: ++ llm_pos_ids_list.append( ++ audio_llm_pos_ids[ ++ :, audio_data_index : audio_data_index + 1 ++ ] ++ ) ++ audio_data_index += 1 ++ if video_data_index < video_llm_pos_ids.shape[-1]: ++ llm_pos_ids_list.append( ++ video_llm_pos_ids[ ++ :, video_data_index : video_llm_pos_ids.shape[-1] ++ ] ++ ) ++ if audio_data_index < audio_llm_pos_ids.shape[-1]: ++ llm_pos_ids_list.append( ++ audio_llm_pos_ids[ ++ :, audio_data_index : audio_llm_pos_ids.shape[-1] ++ ] ++ ) ++ video_len = video_grid_thw[video_idx].prod() // (spatial_merge_size**2) ++ st_idx = llm_pos_ids_list[-1].max() + 1 if llm_pos_ids_list else 0 ++ eos_len = 1 ++ eos_block = ( ++ torch.arange(eos_len, device=device, dtype=torch.long) ++ .view(1, -1) ++ .expand(3, -1) ++ + st_idx ++ ) ++ llm_pos_ids_list.append(eos_block) ++ llm_pos_ids_list.append(eos_block) ++ st += text_len + bos_len * 2 + audio_len + video_len + eos_len * 2 # noqa: E501 ++ audio_idx += 1 ++ video_idx += 1 ++ remain_videos -= 1 ++ remain_audios -= 1 ++ ++ if st < len(input_tokens): ++ st_idx = llm_pos_ids_list[-1].max() + 1 if llm_pos_ids_list else 0 ++ text_len = len(input_tokens) - st ++ llm_pos_ids_list.append( ++ torch.arange(text_len, device=device, dtype=torch.long) ++ .view(1, -1) ++ .expand(3, -1) ++ + st_idx ++ ) + -+ return sycl::float2(half_to_float(v.x()), half_to_float(v.y())); -+} ++ llm_positions = torch.cat(llm_pos_ids_list, dim=1).reshape(3, -1) ++ if llm_positions.shape[1] != seq_len: ++ raise RuntimeError("Position ids length mismatch with input ids length") + -+inline sycl::half float_to_half(float f) { -+ return sycl::half(f); -+} ++ position_ids = llm_positions.to(device=device, dtype=dtype) ++ mrope_position_delta = llm_positions.max() + 1 - seq_len ++ return position_ids, mrope_position_delta + -+inline sycl::half2 float2_to_half2(sycl::float2 f) { -+ return sycl::half2{float_to_half(f.x()), float_to_half(f.y())}; -+} + @classmethod + def _omni_get_input_positions_tensor( + cls, +@@ -879,7 +1359,38 @@ class MRotaryEmbedding(RotaryEmbedding): + # TODO(fyabc): refactor and share more code with + # _vl_get_input_positions_tensor. 
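
When audio is embedded in a video, the branch above merges the two position streams by their temporal row, smaller timestamp first, much like one merge step of a merge sort. A toy version with hand-written 3xN position columns (the values are illustrative, not real model output):

# Toy merge of two position streams by their temporal (first) row.
import torch

video = torch.tensor([[0, 25, 50], [0, 0, 0], [0, 1, 2]])
audio = torch.tensor([[0, 13, 38], [0, 13, 38], [0, 13, 38]])

cols, vi, ai = [], 0, 0
while vi < video.shape[-1] and ai < audio.shape[-1]:
    if video[0, vi] <= audio[0, ai]:
        cols.append(video[:, vi:vi + 1]); vi += 1
    else:
        cols.append(audio[:, ai:ai + 1]); ai += 1
cols.append(video[:, vi:])
cols.append(audio[:, ai:])
print(torch.cat(cols, dim=1)[0])  # merged temporal row: 0, 0, 13, 25, 38, 50
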
+ ++ model_type = hf_config.model_type + thinker_config = hf_config.thinker_config ++ ++ if isinstance(image_grid_thw, list): ++ image_grid_thw = torch.tensor(image_grid_thw) ++ if isinstance(video_grid_thw, list): ++ video_grid_thw = torch.tensor(video_grid_thw) ++ ++ if "qwen3_omni" in model_type: ++ input_tensor = torch.tensor(input_tokens) ++ audio_lengths_tensor = audio_feature_lengths ++ if audio_lengths_tensor is not None and not isinstance( ++ audio_lengths_tensor, torch.Tensor ++ ): ++ audio_lengths_tensor = torch.as_tensor( ++ audio_lengths_tensor, dtype=torch.long ++ ) ++ second_per_grids_tensor = ( ++ torch.tensor(second_per_grid_ts) if second_per_grid_ts else None ++ ) + -+// Vector addition. -+inline sycl::half add(sycl::half a, sycl::half b) { -+ return sycl_half_add(a,b); -+} ++ llm_positions, mrope_position_delta = cls._omni3_get_input_positions_tensor( # noqa: E501 ++ thinker_config, ++ input_tensor, ++ image_grid_thw, ++ video_grid_thw, ++ use_audio_in_video, ++ audio_lengths_tensor, ++ second_per_grids_tensor, ++ ) ++ return llm_positions, mrope_position_delta ++ + audio_token_id = thinker_config.audio_token_index + image_token_id = thinker_config.image_token_index + video_token_id = thinker_config.video_token_index +@@ -892,11 +1403,6 @@ class MRotaryEmbedding(RotaryEmbedding): + tokens_per_second = getattr(thinker_config.vision_config, + "tokens_per_second", 25) + +- if isinstance(image_grid_thw, list): +- image_grid_thw = torch.tensor(image_grid_thw) +- if isinstance(video_grid_thw, list): +- video_grid_thw = torch.tensor(video_grid_thw) +- + src_item = input_tokens + audio_seqlens = audio_feature_lengths + if not second_per_grid_ts: +@@ -940,7 +1446,7 @@ class MRotaryEmbedding(RotaryEmbedding): + grid_t = image_grid_thw[image_idx][0] + grid_hs = image_grid_thw[:, 1] + grid_ws = image_grid_thw[:, 2] +- t_index = (torch.arange(grid_t) * 1 * tokens_per_second).long() ++ t_index = torch.arange(grid_t) * 1 * tokens_per_second + llm_pos_ids = cls._get_llm_pos_ids_for_vision( + start_idx, image_idx, spatial_merge_size, t_index, grid_hs, + grid_ws) +@@ -953,9 +1459,11 @@ class MRotaryEmbedding(RotaryEmbedding): + grid_t = video_grid_thw[video_idx][0] + grid_hs = video_grid_thw[:, 1] + grid_ws = video_grid_thw[:, 2] +- t_index = (torch.arange(grid_t) * +- second_per_grid_ts[video_idx] * +- tokens_per_second).long() ++ t_index = ( ++ torch.arange(grid_t) ++ * second_per_grid_ts[video_idx] ++ * tokens_per_second ++ ) + llm_pos_ids = cls._get_llm_pos_ids_for_vision( + start_idx, video_idx, spatial_merge_size, t_index, grid_hs, + grid_ws) +@@ -976,9 +1484,11 @@ class MRotaryEmbedding(RotaryEmbedding): + grid_hs = video_grid_thw[:, 1] + grid_ws = video_grid_thw[:, 2] + t_ntoken_per_chunk = int(tokens_per_second * seconds_per_chunk) +- t_index = (torch.arange(grid_t) * +- second_per_grid_ts[video_idx] * +- tokens_per_second).long() ++ t_index = ( ++ torch.arange(grid_t) ++ * second_per_grid_ts[video_idx] ++ * tokens_per_second ++ ) + t_index_split_chunk = cls._split_list_into_ranges( + t_index, t_ntoken_per_chunk) + place_num = (((audio_seqlen - 1) // 2 + 1 - 2) // 2 + 1) + 2 +@@ -1117,10 +1627,8 @@ class MRotaryEmbedding(RotaryEmbedding): + grid_h = video_grid_thw[1] + grid_w = video_grid_thw[2] + t_ntoken_per_chunk = int(tokens_per_second * seconds_per_chunk) +- t_index = (torch.arange(grid_t) * video_second_per_grid_t * +- tokens_per_second).long() +- t_index_split_chunk = cls._split_list_into_ranges( +- t_index, t_ntoken_per_chunk) ++ t_index = torch.arange(grid_t) * 
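
The value returned next to the positions, mrope_position_delta, is what lets decoding continue the sequence: the next token's position on all three rows is the prompt length plus the delta, i.e. one past the largest prompt position. A tiny numeric check with a hand-written 3x4 position tensor:

# Hand-written positions, only to show how the delta is used.
import torch

llm_positions = torch.tensor([[0, 1, 2, 2], [0, 1, 2, 3], [0, 1, 3, 2]])
seq_len = llm_positions.shape[1]                        # 4 prompt tokens
delta = int(llm_positions.max().item()) + 1 - seq_len   # 3 + 1 - 4 = 0
next_pos = seq_len + delta                              # position 4 on all rows
print(delta, next_pos)                                  # 0 4
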
video_second_per_grid_t * tokens_per_second ++ t_index_split_chunk = cls._split_list_into_ranges(t_index, t_ntoken_per_chunk) + + updates = [audio_start_token_id] + added_audio_len = 0 +diff --git a/vllm/model_executor/model_loader/utils.py b/vllm/model_executor/model_loader/utils.py +index 0c2441a6d..d1747f2d3 100644 +--- a/vllm/model_executor/model_loader/utils.py ++++ b/vllm/model_executor/model_loader/utils.py +@@ -15,6 +15,7 @@ from typing_extensions import assert_never + from vllm.attention import Attention + from vllm.config import (ModelConfig, ModelImpl, VllmConfig, + set_current_vllm_config) ++from vllm.envs import VLLM_OFFLOAD_WEIGHTS_BEFORE_QUANT + from vllm.logger import init_logger + from vllm.model_executor.layers.linear import QKVCrossParallelLinear + from vllm.model_executor.layers.quantization.base_config import ( +@@ -144,26 +145,30 @@ def device_loading_context(module: torch.nn.Module, + yield module + + finally: +- # Restore parameters to their original devices, ignoring new parameters +- pin_memory = is_pin_memory_available() +- for name, p in module.named_parameters(): +- if name in original_device_states: +- original_device: torch.device = original_device_states[name] +- if original_device.type == "cpu": +- # `torch.empty_like` does not support `pin_memory` argument +- cpu_data = torch.empty_strided( +- size=p.data.size(), +- stride=p.data.stride(), +- dtype=p.data.dtype, +- layout=p.data.layout, +- device="cpu", +- pin_memory=pin_memory, +- ) +- cpu_data.copy_(p.data) +- p.data = cpu_data +- else: +- p.data = p.data.to(original_device) +- # New parameters or parameters already on target device are untouched ++ # If weights were loaded onto the CPU for FP8 online quantization, there ++ # is no need to move them back to the original device. 
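
The audio-in-video hunk above buckets the scaled time index into windows of t_ntoken_per_chunk before interleaving audio placeholders. cls._split_list_into_ranges itself is not shown in this diff, so the snippet below is only a plausible stand-in that groups values by which window they fall into (chunk size and sampling values are made up):

# Assumed behaviour of the range splitter, for illustration only.
import torch

t_ntoken_per_chunk = 50                       # e.g. 25 tokens/s * 2 s per chunk
t_index = torch.arange(6) * 0.5 * 25          # [0, 12.5, 25, 37.5, 50, 62.5]

buckets: dict[int, list[float]] = {}
for v in t_index.tolist():
    buckets.setdefault(int(v // t_ntoken_per_chunk), []).append(v)
print(buckets)  # {0: [0.0, 12.5, 25.0, 37.5], 1: [50.0, 62.5]}
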
++ if not VLLM_OFFLOAD_WEIGHTS_BEFORE_QUANT: ++ # Restore parameters to their original devices, ignoring new parameters # noqa: E501 ++ pin_memory = is_pin_memory_available() ++ for name, p in module.named_parameters(): ++ if name in original_device_states: ++ original_device: torch.device = original_device_states[ ++ name] ++ if original_device.type == "cpu": ++ # `torch.empty_like` does not support `pin_memory` argument # noqa: E501 ++ cpu_data = torch.empty_strided( ++ size=p.data.size(), ++ stride=p.data.stride(), ++ dtype=p.data.dtype, ++ layout=p.data.layout, ++ device="cpu", ++ pin_memory=pin_memory, ++ ) ++ cpu_data.copy_(p.data) ++ p.data = cpu_data ++ else: ++ p.data = p.data.to(original_device) ++ # New parameters or parameters already on target device are untouched # noqa: E501 + + + def get_model_architecture( +diff --git a/vllm/model_executor/models/dots_ocr.py b/vllm/model_executor/models/dots_ocr.py +new file mode 100644 +index 000000000..f24cb6d52 +--- /dev/null ++++ b/vllm/model_executor/models/dots_ocr.py +@@ -0,0 +1,861 @@ ++# SPDX-License-Identifier: Apache-2.0 ++# SPDX-FileCopyrightText: Copyright contributors to the vLLM project ++from collections.abc import Iterable, Mapping ++from typing import Literal, Optional, TypedDict, Union + -+inline sycl::half2 add(sycl::half2 a, sycl::half2 b) { -+ auto val = sycl_half_add2(a, b); -+ return (val); -+} ++import torch ++import torch.nn as nn ++import torch.nn.functional as F ++from torch.nn import LayerNorm ++from transformers.modeling_utils import PreTrainedModel ++from transformers.models.qwen2_vl import Qwen2VLProcessor + -+inline sycl::half4 add(sycl::half4 a, sycl::half4 b) { -+ sycl::half4 c; -+ c.x() = add(a.x(), b.x()); -+ c.y() = add(a.y(), b.y()); -+ c.z() = add(a.z(), b.z()); -+ c.w() = add(a.w(), b.w()); -+ return c; -+} ++from vllm.attention.layer import check_upstream_fa_availability ++from vllm.config import VllmConfig ++from vllm.model_executor.layers.activation import SiluAndMul ++from vllm.model_executor.layers.layernorm import RMSNorm ++from vllm.model_executor.layers.linear import (ColumnParallelLinear, ++ MergedColumnParallelLinear, ++ QKVParallelLinear, ++ RowParallelLinear) ++from vllm.model_executor.layers.quantization import QuantizationConfig ++from vllm.model_executor.models.interfaces import (MultiModalEmbeddings, ++ SupportsMultiModal, ++ SupportsPP) ++from vllm.model_executor.models.qwen2 import Qwen2ForCausalLM ++from vllm.model_executor.models.qwen2_vl import (Qwen2VLDummyInputsBuilder, ++ Qwen2VLMultiModalProcessor, ++ Qwen2VLProcessingInfo) ++from vllm.model_executor.models.utils import (AutoWeightsLoader, WeightsMapper, ++ init_vllm_registered_model, ++ maybe_prefix, ++ merge_multimodal_embeddings) ++from vllm.model_executor.models.vision import get_vit_attn_backend ++from vllm.multimodal import MULTIMODAL_REGISTRY ++from vllm.multimodal.inputs import MultiModalDataDict ++from vllm.platforms import _Backend ++from vllm.sequence import IntermediateTensors ++from vllm.transformers_utils.configs.dotsocr import (DotsOCRConfig, ++ DotsVisionConfig) + -+inline sycl::half8 add(sycl::half8 a, sycl::half8 b) { -+ sycl::half8 c; -+ c.s0() = add(a.s0(), b.s0()); -+ c.s1() = add(a.s1(), b.s1()); -+ c.s2() = add(a.s2(), b.s2()); -+ c.s3() = add(a.s3(), b.s3()); -+ c.s4() = add(a.s4(), b.s4()); -+ c.s5() = add(a.s5(), b.s5()); -+ c.s6() = add(a.s6(), b.s6()); -+ c.s7() = add(a.s7(), b.s7()); -+ return c; -+} ++IMAGE_TOKEN = "<|imgpad|>" + -+inline sycl::float2 add(sycl::half2 a, sycl::float2 fb) { -+ 
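
The model-loader hunk above gates the restore step on VLLM_OFFLOAD_WEIGHTS_BEFORE_QUANT: when weights were deliberately staged on the CPU for FP8 online quantization, moving them back after loading would only waste time and memory. A minimal, self-contained sketch of that pattern (loading_context and skip_restore are illustrative names, not the real vLLM helpers):

# Illustrative context manager; not the vLLM implementation.
import contextlib
import torch

@contextlib.contextmanager
def loading_context(module: torch.nn.Module, target: torch.device,
                    skip_restore: bool):
    original = {n: p.device for n, p in module.named_parameters()}
    module.to(target)
    try:
        yield module
    finally:
        if not skip_restore:
            # Only move parameters back when offloading was not intentional.
            for n, p in module.named_parameters():
                p.data = p.data.to(original[n])

lin = torch.nn.Linear(4, 4)
with loading_context(lin, torch.device("cpu"), skip_restore=True):
    pass  # weights stay wherever the loader put them afterwards
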
sycl::float2 fa = half2_to_float2(a); -+ return add(fa, fb); -+} + -+inline Float4_ add(sycl::half4 a, Float4_ fb) { -+ Float4_ fc; -+ fc.x = add(sycl::half2{a.x(), a.y()}, fb.x); -+ fc.y = add(sycl::half2{a.z(), a.w()}, fb.y); -+ return fc; -+} ++class DotsOCRImagePixelInputs(TypedDict): ++ type: Literal["pixel_values", "image_grid_thw"] + -+inline Float8_ add(sycl::half8 a, Float8_ fb) { -+ Float8_ fc; -+ fc.x = add(sycl::half2{a.s0(), a.s1()}, fb.x); -+ fc.y = add(sycl::half2{a.s2(), a.s3()}, fb.y); -+ fc.z = add(sycl::half2{a.s4(), a.s5()}, fb.z); -+ fc.w = add(sycl::half2{a.s6(), a.s7()}, fb.w); -+ return fc; -+} ++ pixel_values: torch.Tensor ++ image_grid_thw: torch.Tensor ++ ++ ++class DotsOCRImageEmbeddingInputs(TypedDict): ++ type: Literal["image_embeds", "image_grid_thw"] ++ image_embeds: torch.Tensor ++ """Supported types: ++ - List[`torch.Tensor`]: A list of tensors holding all images' features. ++ Each tensor holds an image's features. ++ - `torch.Tensor`: A tensor holding all images' features ++ (concatenation of all images' feature tensors). ++ Tensor shape: `(num_image_features, hidden_size)` ++ - `num_image_features` varies based on ++ the number and resolution of the images. ++ - `hidden_size` must match the hidden size of language model backbone. ++ """ + -+// Vector multiplication. -+template <> -+inline sycl::half mul(sycl::half a, sycl::half b) { -+ auto val = sycl_half_mul((a), (b)); -+ return (val); -+} ++ image_grid_thw: torch.Tensor + -+template <> -+inline sycl::half2 mul(sycl::half2 a, sycl::half2 b) { -+ auto val = sycl_half_mul2((a), (b)); -+ return (val); -+} + -+template <> -+inline sycl::half2 mul(sycl::half a, sycl::half2 b) { -+ return mul(h0_h0(a), b); -+} ++DotsOCRImageInputs = Union[DotsOCRImagePixelInputs, ++ DotsOCRImageEmbeddingInputs] + + -+template <> -+inline sycl::half4 mul(sycl::half4 a, sycl::half4 b) { -+ sycl::half4 c; -+ c.x() = mul(a.x(), b.x()); -+ c.y() = mul(a.y(), b.y()); -+ c.z() = mul(a.z(), b.z()); -+ c.w() = mul(a.w(), b.w()); -+ return c; -+} ++class DotsOCRDummyInputsBuilder(Qwen2VLDummyInputsBuilder): + -+template <> -+inline sycl::half4 mul(sycl::half a, sycl::half4 b) { -+ sycl::half4 c; -+ c.x() = mul(a, b.x()); -+ c.y() = mul(a, b.y()); -+ c.z() = mul(a, b.z()); -+ c.w() = mul(a, b.w()); -+ return c; -+} ++ def get_dummy_text(self, mm_counts: Mapping[str, int]) -> str: ++ num_images = mm_counts.get("image", 0) ++ return IMAGE_TOKEN * num_images + -+template <> -+inline sycl::half8 mul(sycl::half8 a, sycl::half8 b) { -+ sycl::half8 c; -+ c.s0() = mul(a.s0(), b.s0()); -+ c.s1() = mul(a.s1(), b.s1()); -+ c.s2() = mul(a.s2(), b.s2()); -+ c.s3() = mul(a.s3(), b.s3()); -+ c.s4() = mul(a.s4(), b.s4()); -+ c.s5() = mul(a.s5(), b.s5()); -+ c.s6() = mul(a.s6(), b.s6()); -+ c.s7() = mul(a.s7(), b.s7()); -+ return c; -+} ++ def get_dummy_mm_data( ++ self, ++ seq_len: int, ++ mm_counts: Mapping[str, int], ++ ) -> MultiModalDataDict: ++ num_images = mm_counts.get("image", 0) + -+template <> -+inline sycl::half8 mul(sycl::half a, sycl::half8 b) { -+ sycl::half8 c; -+ c.s0() = mul(a, b.s0()); -+ c.s1() = mul(a, b.s1()); -+ c.s2() = mul(a, b.s2()); -+ c.s3() = mul(a, b.s3()); -+ c.s4() = mul(a, b.s4()); -+ c.s5() = mul(a, b.s5()); -+ c.s6() = mul(a, b.s6()); -+ c.s7() = mul(a, b.s7()); -+ return c; -+} ++ target_width, target_height = self.info.get_image_size_with_most_features( # noqa: E501 ++ ) + -+template <> -+inline float mul(sycl::half a, sycl::half b) { -+ float fa = half_to_float(a); -+ float fb = half_to_float(b); -+ return fa * fb; 
-+} ++ return { ++ "image": ++ self._get_dummy_images(width=target_width, ++ height=target_height, ++ num_images=num_images), ++ } + -+template <> -+inline sycl::float2 mul(sycl::half2 a, sycl::half2 b) { -+ sycl::float2 fa = half2_to_float2(a); -+ sycl::float2 fb = half2_to_float2(b); -+ return mul(fa, fb); -+} + -+template <> -+inline sycl::float2 mul(sycl::half a, sycl::half2 b) { -+ return mul(h0_h0(a), b); -+} ++class DotsOCRProcessingInfo(Qwen2VLProcessingInfo): + -+template <> -+inline Float4_ mul(sycl::half4 a, sycl::half4 b) { -+ Float4_ fc; -+ fc.x = mul( -+ sycl::half2{a.x(), a.y()}, sycl::half2{b.x(), b.y()}); -+ fc.y = mul( -+ sycl::half2{a.z(), a.w()}, sycl::half2{b.z(), b.w()}); -+ return fc; -+} ++ def get_hf_config(self) -> DotsOCRConfig: ++ config = self.ctx.get_hf_config() ++ if not config.__class__.__name__ == 'DotsOCRConfig': ++ raise TypeError(f"Expected DotsOCRConfig, got {type(config)}") + -+template <> -+inline Float4_ mul(sycl::half a, sycl::half4 b) { -+ sycl::half2 s = h0_h0(a); -+ Float4_ fc; ++ if hasattr(config, "vision_config") and isinstance( ++ config.vision_config, dict): ++ config.vision_config = DotsVisionConfig(**config.vision_config) + -+ fc.x = -+ mul(s, sycl::half2{b.x(), b.y()}); -+ fc.y = -+ mul(s, sycl::half2{b.z(), b.w()}); -+ return fc; -+} ++ return config + -+template <> -+inline Float8_ mul(sycl::half8 a, sycl::half8 b) { -+ Float8_ fc; -+ fc.x = mul( -+ sycl::half2{a.s0(), a.s1()}, sycl::half2{b.s0(), b.s1()}); -+ fc.y = mul( -+ sycl::half2{a.s2(), a.s3()}, sycl::half2{b.s2(), b.s3()}); -+ fc.z = mul( -+ sycl::half2{a.s4(), a.s5()}, sycl::half2{b.s4(), b.s5()}); -+ fc.w = mul( -+ sycl::half2{a.s6(), a.s7()}, sycl::half2{b.s6(), b.s7()}); -+ return fc; -+} ++ def get_supported_mm_limits(self) -> Mapping[str, Optional[int]]: ++ return {"image": None} + -+template <> -+inline Float8_ mul(sycl::half a, sycl::half8 b) { -+ sycl::half2 s = h0_h0(a); -+ Float8_ fc; -+ fc.x = mul( -+ s, sycl::half2{b.s0(), b.s1()}); -+ fc.y = mul( -+ s, sycl::half2{b.s2(), b.s3()}); -+ fc.z = mul( -+ s, sycl::half2{b.s4(), b.s5()}); -+ fc.w = mul( -+ s, sycl::half2{b.s6(), b.s7()}); -+ return fc; -+} ++ def get_mm_max_tokens_per_item( ++ self, ++ seq_len: int, ++ mm_counts: Mapping[str, int], ++ ) -> Mapping[str, int]: ++ max_image_tokens = self.get_max_image_tokens() ++ return {"image": max_image_tokens} + -+// Vector fused multiply-add. 
-+inline sycl::half2 fma(sycl::half2 a, sycl::half2 b, sycl::half2 c) { -+ auto val = sycl_half_fma2((a), (b), (c)); -+ return (val); -+} ++ def get_hf_processor( ++ self, ++ **kwargs: object, ++ ) -> Qwen2VLProcessor: ++ self.get_tokenizer( ++ ).image_token = IMAGE_TOKEN # Ensure image token is set ++ processor = self.ctx.get_hf_processor( ++ Qwen2VLProcessor, ++ **kwargs, ++ ) ++ processor.image_token = IMAGE_TOKEN ++ processor.video_token = "<|video_pad|>" ++ return processor + -+inline sycl::half2 fma(sycl::half a, sycl::half2 b, sycl::half2 c) { -+ return fma(h0_h0(a), b, c); -+} + -+inline sycl::half4 fma(sycl::half4 a, sycl::half4 b, sycl::half4 c) { -+ sycl::half4 d; -+ d.x() = fma(a.x(), b.x(), c.x()); -+ d.y() = fma(a.y(), b.y(), c.y()); -+ d.z() = fma(a.z(), b.z(), c.z()); -+ d.w() = fma(a.w(), b.w(), c.w()); -+ return d; -+} ++def rotate_half(x): ++ """Rotates half the hidden dims of the input.""" ++ x1 = x[..., :x.shape[-1] // 2] ++ x2 = x[..., x.shape[-1] // 2:] ++ return torch.cat((-x2, x1), dim=-1) + -+inline sycl::half4 fma(sycl::half a, sycl::half4 b, sycl::half4 c) { -+ sycl::half4 s = sycl::half4{a, a, a, a}; -+ return fma(s, b, c); -+} + -+inline sycl::half8 fma(sycl::half8 a, sycl::half8 b, sycl::half8 c) { -+ sycl::half8 d; -+ d.s0() = fma(a.s0(), b.s0(), c.s0()); -+ d.s1() = fma(a.s1(), b.s1(), c.s1()); -+ d.s2() = fma(a.s2(), b.s2(), c.s2()); -+ d.s3() = fma(a.s3(), b.s3(), c.s3()); -+ d.s4() = fma(a.s4(), b.s4(), c.s4()); -+ d.s5() = fma(a.s5(), b.s5(), c.s5()); -+ d.s6() = fma(a.s6(), b.s6(), c.s6()); -+ d.s7() = fma(a.s7(), b.s7(), c.s7()); -+ return d; -+} ++def apply_rotary_pos_emb_vision(tensor: torch.Tensor, ++ freqs: torch.Tensor) -> torch.Tensor: ++ orig_dtype = tensor.dtype ++ tensor = tensor.float() + -+inline sycl::half8 fma(sycl::half a, sycl::half8 b, sycl::half8 c) { -+ sycl::half8 d; -+ d.s0() = fma(a, b.s0(), c.s0()); -+ d.s1() = fma(a, b.s1(), c.s1()); -+ d.s2() = fma(a, b.s2(), c.s2()); -+ d.s3() = fma(a, b.s3(), c.s3()); -+ d.s4() = fma(a, b.s4(), c.s4()); -+ d.s5() = fma(a, b.s5(), c.s5()); -+ d.s6() = fma(a, b.s6(), c.s6()); -+ d.s7() = fma(a, b.s7(), c.s7()); -+ return d; -+} ++ cos = freqs.cos() ++ sin = freqs.sin() + -+inline float fma(sycl::half a, sycl::half b, float fc) { -+ float fa = half_to_float(a); -+ float fb = half_to_float(b); -+ return sycl::fma(fa, fb, fc); -+} ++ cos = cos.unsqueeze(1).repeat(1, 1, 2).unsqueeze(0).float() ++ sin = sin.unsqueeze(1).repeat(1, 1, 2).unsqueeze(0).float() + -+inline sycl::float2 fma(sycl::half2 a, sycl::half2 b, sycl::float2 fc) { -+ sycl::float2 fa = half2_to_float2(a); -+ sycl::float2 fb = half2_to_float2(b); -+ return fma(fa, fb, fc); -+} ++ output = (tensor * cos) + (rotate_half(tensor) * sin) + -+inline sycl::float2 fma(sycl::half a, sycl::half2 b, sycl::float2 fc) { -+ return fma(h0_h0(a), b, fc); -+} ++ output = output.to(orig_dtype) + -+inline Float4_ fma(sycl::half4 a, sycl::half4 b, Float4_ fc) { -+ Float4_ fd; -+ fd.x = fma(sycl::half2{a.x(), a.y()}, sycl::half2{b.x(), b.y()}, fc.x); -+ fd.y = fma(sycl::half2{a.z(), a.w()}, sycl::half2{b.z(), b.w()}, fc.y); -+ return fd; -+} ++ return output + -+inline Float4_ fma(sycl::half a, sycl::half4 b, Float4_ fc) { -+ sycl::half4 s = sycl::half4{a, a, a, a}; + -+ return fma(s, b, fc); -+} ++class VisionRotaryEmbedding(nn.Module): + -+inline Float8_ fma(sycl::half8 a, sycl::half8 b, Float8_ fc) { -+ Float8_ fd; -+ fd.x = fma(sycl::half2{a.s0(), a.s1()}, sycl::half2{b.s0(), b.s1()}, fc.x); -+ fd.y = fma(sycl::half2{a.s2(), a.s3()}, 
sycl::half2{b.s2(), b.s3()}, fc.y); -+ fd.z = fma(sycl::half2{a.s4(), a.s5()}, sycl::half2{b.s4(), b.s5()}, fc.z); -+ fd.w = fma(sycl::half2{a.s6(), a.s7()}, sycl::half2{b.s6(), b.s7()}, fc.w); -+ return fd; -+} ++ def __init__(self, dim: int, theta: float = 10000.0) -> None: ++ super().__init__() ++ inv_freq = 1.0 / (theta ++ **(torch.arange(0, dim, 2, dtype=torch.float) / dim)) ++ self.register_buffer("inv_freq", inv_freq, persistent=False) + -+inline Float8_ fma(sycl::half a, sycl::half8 b, Float8_ fc) { -+ sycl::half8 s = sycl::half8{a, a, a, a, a, a, a, a}; ++ def forward(self, seqlen: int) -> torch.Tensor: ++ seq = torch.arange(seqlen, ++ device=self.inv_freq.device, ++ dtype=self.inv_freq.dtype) ++ freqs = torch.outer(seq, self.inv_freq) ++ return freqs + -+ return fma(s, b, fc); -+} + -+// Vector sum. -+template <> -+inline float sum(sycl::half v) { -+ return half_to_float(v); -+} ++class PatchMerger(nn.Module): + -+template <> -+inline float sum(sycl::half2 v) { -+ sycl::float2 tmp = half2_to_float2(v); -+ return tmp.x() + tmp.y(); -+} ++ def __init__( ++ self, ++ dim: int, ++ context_dim: int, ++ spatial_merge_size: int = 2, ++ pre_norm="layernorm", ++ ) -> None: ++ super().__init__() ++ self.hidden_size = context_dim * (spatial_merge_size**2) ++ self.pre_norm = pre_norm ++ if self.pre_norm == "layernorm": ++ self.ln_q = LayerNorm(context_dim, eps=1e-6) ++ elif self.pre_norm == "rmsnorm": ++ self.ln_q = RMSNorm(context_dim, eps=1e-6) ++ else: ++ print("no norm in patch merger") ++ ++ self.mlp = nn.Sequential( ++ ColumnParallelLinear(self.hidden_size, ++ self.hidden_size, ++ bias=True, ++ return_bias=False, ++ disable_tp=True), ++ nn.GELU(), ++ RowParallelLinear(self.hidden_size, ++ dim, ++ bias=True, ++ return_bias=False, ++ disable_tp=True), ++ ) + -+template <> -+inline float sum(sycl::half4 v) { -+ sycl::half2 c = add(sycl::half2{v.x(), v.y()}, sycl::half2{v.z(), v.w()}); -+ return sum(c); -+} ++ def forward(self, x: torch.Tensor) -> torch.Tensor: ++ if self.pre_norm: ++ x = self.mlp(self.ln_q(x).view(-1, self.hidden_size)) ++ else: ++ x = self.mlp(x.view(-1, self.hidden_size)) ++ return x + -+template <> -+inline float sum(sycl::half8 v) { -+ return add( -+ sum(sycl::half4{v.s0(), v.s1(), v.s2(), v.s3()}), -+ sum(sycl::half4{v.s4(), v.s5(), v.s6(), v.s7()})); -+} + -+inline void from_float(sycl::half& dst, float src) { -+ dst = sycl::half(src); -+} ++class DotsVisionAttention(nn.Module): + -+inline void from_float(sycl::half2& dst, sycl::float2 src) { -+ dst = float2_to_half2(src); -+} ++ def __init__(self, ++ config, ++ dim: int, ++ num_heads: int = 16, ++ bias: bool = True, ++ *, ++ quant_config: Optional[QuantizationConfig] = None, ++ prefix: str = "") -> None: ++ super().__init__() ++ from vllm.distributed import (parallel_state, ++ tensor_model_parallel_all_gather) ++ from vllm.distributed import utils as dist_utils + -+inline void from_float(sycl::half4& dst, Float4_ src) { -+ sycl::half2 h0 = float2_to_half2(src.x); -+ sycl::half2 h1 = float2_to_half2(src.y); -+ dst.x() = h0.x(); -+ dst.y() = h0.y(); -+ dst.z() = h1.x(); -+ dst.w() = h1.y(); -+} ++ self.embed_dim = dim ++ self.num_heads = num_heads ++ self.head_dim = dim // num_heads ++ self.tp_size = parallel_state.get_tensor_model_parallel_world_size() ++ self.tp_rank = parallel_state.get_tensor_model_parallel_rank() ++ self.num_heads_per_partition = dist_utils.divide( ++ num_heads, self.tp_size) ++ ++ # qkv/proj follow Qwen2-VL style; bias controlled by arg ++ self.qkv = QKVParallelLinear(hidden_size=dim, ++ 
head_size=dim // num_heads, ++ total_num_heads=num_heads, ++ bias=bias, ++ quant_config=quant_config, ++ prefix=f"{prefix}.qkv") ++ self.proj = RowParallelLinear(input_size=dim, ++ output_size=dim, ++ bias=bias, ++ quant_config=quant_config, ++ prefix=f"{prefix}.proj") ++ self._all_gather = tensor_model_parallel_all_gather ++ self._split_last = dist_utils.split_tensor_along_last_dim ++ ++ # Select attention backend ++ self.attn_backend = get_vit_attn_backend(self.head_dim, ++ torch.get_default_dtype()) ++ self.use_upstream_fa = False ++ if self.attn_backend != _Backend.FLASH_ATTN and \ ++ check_upstream_fa_availability(torch.get_default_dtype()): ++ self.attn_backend = _Backend.FLASH_ATTN ++ self.use_upstream_fa = True ++ if self.attn_backend not in { ++ _Backend.FLASH_ATTN, _Backend.TORCH_SDPA, _Backend.XFORMERS, ++ _Backend.ROCM_AITER_FA, _Backend.IPEX ++ }: ++ raise RuntimeError( ++ f"Unsupported vision attention backend: {self.attn_backend}") ++ self.is_flash_attn_backend = self.attn_backend in { ++ _Backend.FLASH_ATTN, _Backend.ROCM_AITER_FA ++ } + -+inline void from_float(sycl::half8& dst, Float8_ src) { -+ dst.s0() = float2_to_half2(src.x).x(); -+ dst.s1() = float2_to_half2(src.x).y(); -+ dst.s2() = float2_to_half2(src.y).x(); -+ dst.s3() = float2_to_half2(src.y).y(); -+ dst.s4() = float2_to_half2(src.z).x(); -+ dst.s5() = float2_to_half2(src.z).y(); -+ dst.s6() = float2_to_half2(src.w).x(); -+ dst.s7() = float2_to_half2(src.w).y(); -+} ++ def _split_qkv(self, qkv: torch.Tensor) -> tuple[torch.Tensor, ...]: ++ # qkv: [S, B, 3*dim] ++ seq_len, bs, _ = qkv.shape ++ if self.tp_size > 1: ++ qkv = self._all_gather(qkv) ++ q, k, v = qkv.chunk(3, dim=2) ++ if self.tp_size > 1: ++ q = self._split_last(q, num_partitions=self.tp_size)[self.tp_rank] ++ k = self._split_last(k, num_partitions=self.tp_size)[self.tp_rank] ++ v = self._split_last(v, num_partitions=self.tp_size)[self.tp_rank] ++ new_shape = (seq_len, bs, self.num_heads_per_partition, self.head_dim) ++ return (q.view(*new_shape), k.view(*new_shape), v.view(*new_shape)) + -+// From float16 to float32. -+inline float to_float(sycl::half u) { -+ return half_to_float(u); -+} -+ -+inline sycl::float2 to_float(sycl::half2 u) { -+ return half2_to_float2(u); -+} -+ -+inline Float4_ to_float(sycl::half4 u) { -+ Float4_ tmp; -+ tmp.x = half2_to_float2(sycl::half2{u.x(), u.y()}); -+ tmp.y = half2_to_float2(sycl::half2{u.z(), u.w()}); -+ return tmp; -+} -+ -+inline Float8_ to_float(sycl::half8 u) { -+ Float8_ tmp; -+ tmp.x = half2_to_float2(sycl::half2{u.s0(), u.s1()}); -+ tmp.y = half2_to_float2(sycl::half2{u.s2(), u.s3()}); -+ tmp.z = half2_to_float2(sycl::half2{u.s4(), u.s5()}); -+ tmp.w = half2_to_float2(sycl::half2{u.s6(), u.s7()}); -+ return tmp; -+} -+ -+// Zero-out a variable. -+inline void zero(sycl::half& dst) { -+ dst = sycl::half(0); -+} -+ -+} // namespace vllm -\ No newline at end of file -diff --git a/csrc/xpu/dtype_float32.h b/csrc/xpu/dtype_float32.h -new file mode 100644 -index 000000000..7b70e4efc ---- /dev/null -+++ b/csrc/xpu/dtype_float32.h -@@ -0,0 +1,268 @@ -+/* -+ * Adapted from https://github.com/NVIDIA/FasterTransformer/blob/release/v5.3_tag/src/fastertransformer/kernels/decoder_masked_multihead_attention/decoder_masked_multihead_attention_template.hpp -+ * and https://github.com/NVIDIA/FasterTransformer/blob/release/v5.3_tag/src/fastertransformer/kernels/decoder_masked_multihead_attention_utils.h -+ * Copyright (c) 2023, The vLLM team. -+ * Copyright (c) 2020-2023, NVIDIA CORPORATION. All rights reserved. 
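
This vision attention module runs over a packed sequence of patches from several images, with cu_seqlens marking the boundaries; when no variable-length flash kernel is available it falls back to per-image SDPA. A toy version of that fallback with made-up shapes (two images of 5 and 3 patch tokens, 2 heads, head_dim 8, no tensor parallelism or rotary embedding):

# Shapes and data are invented; this only mirrors the cu_seqlens slicing idea.
import torch
import torch.nn.functional as F

heads, head_dim = 2, 8
q = k = v = torch.randn(1, 8, heads, head_dim)   # [B, S_total, H, D]
cu_seqlens = torch.tensor([0, 5, 8])

outs = []
for i in range(1, len(cu_seqlens)):
    s, e = int(cu_seqlens[i - 1]), int(cu_seqlens[i])
    q_i = q[:, s:e].permute(0, 2, 1, 3)          # [B, H, S_i, D]
    o_i = F.scaled_dot_product_attention(q_i,
                                         k[:, s:e].permute(0, 2, 1, 3),
                                         v[:, s:e].permute(0, 2, 1, 3))
    outs.append(o_i.permute(0, 2, 1, 3))         # back to [B, S_i, H, D]
out = torch.cat(outs, dim=1)
print(out.shape)                                 # torch.Size([1, 8, 2, 8])
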
-+ * -+ * Licensed under the Apache License, Version 2.0 (the "License"); -+ * you may not use this file except in compliance with the License. -+ * You may obtain a copy of the License at -+ * -+ * http://www.apache.org/licenses/LICENSE-2.0 -+ * -+ * Unless required by applicable law or agreed to in writing, software -+ * distributed under the License is distributed on an "AS IS" BASIS, -+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -+ * See the License for the specific language governing permissions and -+ * limitations under the License. -+ */ -+#pragma once -+ -+#include -+#include -+#include "attention_generic.h" -+ -+#include -+ -+namespace vllm { -+ -+// Define custom FP32 vector data types. -+struct Float4_ { -+ sycl::float2 x; -+ sycl::float2 y; -+}; -+ -+struct Float8_ { -+ sycl::float2 x; -+ sycl::float2 y; -+ sycl::float2 z; -+ sycl::float2 w; -+}; -+ -+// FP32 vector types for Q, K, V. -+template<> -+struct Vec { -+ using Type = float; -+}; -+template<> -+struct Vec { -+ using Type = sycl::float2; -+}; -+template<> -+struct Vec { -+ using Type = sycl::float4; -+}; -+ -+// FP32 accumulator vector types corresponding to Vec. -+template<> -+struct FloatVec { -+ using Type = float; -+}; -+template <> struct FloatVec { -+ using Type = sycl::float2; -+}; -+template <> struct FloatVec { -+ using Type = sycl::float4; -+}; -+ -+// Vector addition. -+inline float add(float a, float b) { -+ return a + b; -+} -+ -+inline sycl::float2 add(sycl::float2 a, sycl::float2 b) { -+ sycl::float2 c; -+ c.x() = add(a.x(), b.x()); -+ c.y() = add(a.y(), b.y()); -+ return c; -+} -+ -+inline sycl::float4 add(sycl::float4 a, sycl::float4 b) { -+ sycl::float4 c; -+ c.x() = add(a.x(), b.x()); -+ c.y() = add(a.y(), b.y()); -+ c.z() = add(a.z(), b.z()); -+ c.w() = add(a.w(), b.w()); -+ return c; -+} -+ -+// Vector multiplication. 
-+template<> -+inline float mul(float a, float b) { -+ return a * b; -+} -+ -+template <> inline sycl::float2 mul(sycl::float2 a, sycl::float2 b) { -+ sycl::float2 c; -+ c.x() = a.x() * b.x(); -+ c.y() = a.y() * b.y(); -+ return c; -+} -+ -+template <> inline sycl::float2 mul(float a, sycl::float2 b) { -+ sycl::float2 c; -+ c.x() = a * b.x(); -+ c.y() = a * b.y(); -+ return c; -+} -+ -+template <> inline sycl::float4 mul(sycl::float4 a, sycl::float4 b) { -+ sycl::float4 c; -+ c.x() = a.x() * b.x(); -+ c.y() = a.y() * b.y(); -+ c.z() = a.z() * b.z(); -+ c.w() = a.w() * b.w(); -+ return c; -+} -+ -+template <> inline sycl::float4 mul(float a, sycl::float4 b) { -+ sycl::float4 c; -+ c.x() = a * b.x(); -+ c.y() = a * b.y(); -+ c.z() = a * b.z(); -+ c.w() = a * b.w(); -+ return c; -+} ++ def forward( ++ self, ++ hidden_states: torch.Tensor, ++ cu_seqlens: torch.Tensor, ++ rotary_pos_emb: Optional[torch.Tensor] = None, ++ *, ++ max_seqlen: Optional[int] = None, ++ seqlens: Optional[list[int]] = None, ++ ) -> torch.Tensor: ++ # [S, C] -> [S, B=1, C] ++ x = hidden_states.unsqueeze(1) ++ x, _ = self.qkv(x) ++ q, k, v = self._split_qkv(x) ++ bs = q.shape[1] ++ # [S,B,H,D] -> [B,S,H,D] ++ q = q.permute(1, 0, 2, 3).contiguous() ++ k = k.permute(1, 0, 2, 3).contiguous() ++ v = v.permute(1, 0, 2, 3).contiguous() ++ ++ if rotary_pos_emb is not None: ++ qk_concat = torch.cat([q, k], dim=0) ++ qk_rotated = apply_rotary_pos_emb_vision(qk_concat, rotary_pos_emb) ++ q, k = torch.chunk(qk_rotated, 2, dim=0) ++ ++ if self.is_flash_attn_backend: ++ if self.attn_backend == _Backend.ROCM_AITER_FA: ++ from aiter import flash_attn_varlen_func ++ else: ++ if self.use_upstream_fa: ++ from flash_attn import flash_attn_varlen_func ++ else: ++ from vllm.vllm_flash_attn import flash_attn_varlen_func ++ q_ = q.reshape(bs * q.shape[1], q.shape[2], q.shape[3]) ++ k_ = k.reshape(bs * k.shape[1], k.shape[2], k.shape[3]) ++ v_ = v.reshape(bs * v.shape[1], v.shape[2], v.shape[3]) ++ output = flash_attn_varlen_func(q_, ++ k_, ++ v_, ++ cu_seqlens_q=cu_seqlens, ++ cu_seqlens_k=cu_seqlens, ++ max_seqlen_q=max_seqlen, ++ max_seqlen_k=max_seqlen, ++ dropout_p=0.0, ++ causal=False) ++ context_layer = output.view(bs, -1, self.num_heads_per_partition, ++ self.head_dim) ++ elif self.attn_backend == _Backend.TORCH_SDPA: ++ outputs = [] ++ for i in range(1, len(cu_seqlens)): ++ s = int(cu_seqlens[i - 1]) ++ e = int(cu_seqlens[i]) ++ q_i = q[:, s:e].permute(0, 2, 1, 3) ++ k_i = k[:, s:e].permute(0, 2, 1, 3) ++ v_i = v[:, s:e].permute(0, 2, 1, 3) ++ out_i = F.scaled_dot_product_attention(q_i, ++ k_i, ++ v_i, ++ dropout_p=0.0) ++ out_i = out_i.permute(0, 2, 1, 3) ++ outputs.append(out_i) ++ context_layer = torch.cat(outputs, dim=1) if outputs else q[:, :0] ++ elif self.attn_backend == _Backend.XFORMERS: ++ from xformers import ops as xops ++ from xformers.ops.fmha.attn_bias import BlockDiagonalMask ++ attn_bias = BlockDiagonalMask.from_seqlens(q_seqlen=seqlens, ++ kv_seqlen=None, ++ device=q.device) ++ context_layer = xops.memory_efficient_attention_forward( ++ q, k, v, attn_bias=attn_bias, p=0, scale=None) ++ elif self.attn_backend == _Backend.IPEX: ++ q_ = q.reshape(bs * q.shape[1], q.shape[2], q.shape[3]) ++ k_ = k.reshape(bs * k.shape[1], k.shape[2], k.shape[3]) ++ v_ = v.reshape(bs * v.shape[1], v.shape[2], v.shape[3]) ++ output = torch.empty_like(q_) + -+// Vector fused multiply-add. 
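The scalar and packed-vector overloads above (and the fused multiply-add, sum, and dot helpers that follow) are the building blocks the removed attention kernels use to accumulate query/key dot products. Their semantics, restated as a plain-Python sketch for reference (list-based, no SYCL vector types):

    from typing import Sequence

    def fma(a, b, c):
        # d = a * b + c, applied elementwise; a scalar `a` broadcasts over b and c.
        if isinstance(a, (int, float)):
            a = [a] * len(b)
        return [ai * bi + ci for ai, bi, ci in zip(a, b, c)]

    def dot(a: Sequence[float], b: Sequence[float]) -> float:
        # Multiply elementwise, then accumulate: mul() followed by sum().
        return sum(fma(a, b, [0.0] * len(a)))

    assert dot([1.0, 2.0, 3.0, 4.0], [4.0, 3.0, 2.0, 1.0]) == 20.0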
-+inline float fma(float a, float b, float c) { -+ return a * b + c; -+} ++ from vllm._ipex_ops import ipex_ops ++ ipex_ops.varlen_attention( ++ q_.contiguous(), # query ++ k_.contiguous(), # key ++ v_.contiguous(), # value ++ output, # out ++ cu_seqlens.int(), # seqlen_q ++ cu_seqlens.int(), # seqlen_k ++ None, # alibi_slopes ++ max_seqlen, # max_seqlen_q ++ max_seqlen, # max_seqlen_k ++ 0.0, # pdropout ++ 1.0 / (q.shape[-1] ** 0.5), # softmax_scale ++ False, # zero_tensors ++ False, # is_causal ++ False, # return_softmax ++ None, # gen_ ++ -1, # window_size_left ++ -1, # window_size_right ++ -1, # logits_soft_cap ++ ) ++ context_layer = output.view(bs, -1, self.num_heads_per_partition, ++ self.head_dim) ++ else: ++ raise RuntimeError("Unsupported attention backend") + -+inline sycl::float2 fma(sycl::float2 a, sycl::float2 b, sycl::float2 c) { -+ sycl::float2 d; -+ d.x() = fma(a.x(), b.x(), c.x()); -+ d.y() = fma(a.y(), b.y(), c.y()); -+ return d; -+} ++ # [B,S,H,D] -> [S,B,H*D] -> [S, C] ++ context_layer = context_layer.permute(1, 0, 2, 3).contiguous() ++ context_layer = context_layer.view(context_layer.shape[0], bs, -1) ++ out, _ = self.proj(context_layer) ++ return out.squeeze(1) + -+inline sycl::float2 fma(float a, sycl::float2 b, sycl::float2 c) { -+ sycl::float2 d; -+ d.x() = fma(a, b.x(), c.x()); -+ d.y() = fma(a, b.y(), c.y()); -+ return d; -+} + -+inline sycl::float4 fma(sycl::float4 a, sycl::float4 b, sycl::float4 c) { -+ sycl::float4 d; -+ d.x() = fma(a.x(), b.x(), c.x()); -+ d.y() = fma(a.y(), b.y(), c.y()); -+ d.z() = fma(a.z(), b.z(), c.z()); -+ d.w() = fma(a.w(), b.w(), c.w()); -+ return d; -+} ++class DotsSwiGLUFFN(nn.Module): + -+inline sycl::float4 fma(float a, sycl::float4 b, sycl::float4 c) { -+ sycl::float4 d; -+ d.x() = fma(a, b.x(), c.x()); -+ d.y() = fma(a, b.y(), c.y()); -+ d.z() = fma(a, b.z(), c.z()); -+ d.w() = fma(a, b.w(), c.w()); -+ return d; -+} ++ def __init__(self, ++ config, ++ *, ++ quant_config: Optional[QuantizationConfig] = None, ++ prefix: str = ""): ++ super().__init__() ++ hidden_features = config.intermediate_size ++ in_features = config.embed_dim ++ bias = config.use_bias ++ ++ # Referenced aimv2.py AIMv2SwiGLUFFN ++ self.fc13 = MergedColumnParallelLinear(in_features, ++ [hidden_features] * 2, ++ bias=bias, ++ quant_config=quant_config, ++ prefix=f"{prefix}.fc13", ++ disable_tp=True) ++ self.fc2 = RowParallelLinear(hidden_features, ++ in_features, ++ bias=bias, ++ quant_config=quant_config, ++ prefix=f"{prefix}.fc2", ++ disable_tp=True) ++ self.act_fn = SiluAndMul() + -+inline Float4_ fma(float a, Float4_ b, Float4_ c) { -+ Float4_ d; -+ d.x = fma(a, b.x, c.x); -+ d.y = fma(a, b.y, c.y); -+ return d; -+} ++ def forward(self, x: torch.Tensor) -> torch.Tensor: ++ x, _ = self.fc13(x) ++ x = self.act_fn(x) ++ x, _ = self.fc2(x) ++ return x + -+inline Float8_ fma(float a, Float8_ b, Float8_ c) { -+ Float8_ d; -+ d.x = fma(a, b.x, c.x); -+ d.y = fma(a, b.y, c.y); -+ d.z = fma(a, b.z, c.z); -+ d.w = fma(a, b.w, c.w); -+ return d; -+} ++ def load_weights(self, weights: Iterable[tuple[str, ++ torch.Tensor]]) -> set[str]: ++ params = dict(self.named_parameters()) ++ loaded: set[str] = set() ++ for name, w in weights: ++ # Map fc1 -> fc13 (shard 0) ++ if name.startswith("fc1."): ++ tgt = name.replace("fc1.", "fc13.") ++ if tgt in params: ++ params[tgt].weight_loader(params[tgt], w, 0) ++ loaded.add(tgt) ++ continue ++ # Map fc3 -> fc13 (shard 1) ++ if name.startswith("fc3."): ++ tgt = name.replace("fc3.", "fc13.") ++ if tgt in params: ++ 
params[tgt].weight_loader(params[tgt], w, 1) ++ loaded.add(tgt) ++ continue ++ # Pass-through for fc2 and others ++ if name in params: ++ params[name].weight_loader(params[name], w) ++ loaded.add(name) ++ return loaded + -+// Vector sum. -+template<> -+inline float sum(float v) { -+ return v; -+} + -+template <> inline float sum(sycl::float2 v) { -+ return v.x() + v.y(); -+} ++class DotsPatchEmbed(nn.Module): + -+template <> inline float sum(sycl::float4 v) { -+ return v.x() + v.y() + v.z() + v.w(); -+} ++ def __init__(self, config): ++ super().__init__() ++ self.num_channels = config.num_channels ++ self.patch_size = config.patch_size ++ self.temporal_patch_size = config.temporal_patch_size ++ self.embed_dim = config.embed_dim ++ self.config = config ++ self.proj = nn.Conv2d( ++ config.num_channels, ++ config.embed_dim, ++ kernel_size=(config.patch_size, config.patch_size), ++ stride=(config.patch_size, config.patch_size), ++ ) ++ self.norm = RMSNorm(config.embed_dim, eps=config.rms_norm_eps) + -+template<> -+inline float sum(Float4_ v) { -+ return v.x.x() + v.x.y() + v.y.x() + v.y.y(); -+} ++ def forward(self, x: torch.Tensor, grid_thw=None) -> torch.Tensor: ++ x = x.view(-1, self.num_channels, self.temporal_patch_size, ++ self.patch_size, self.patch_size)[:, :, 0] ++ x = self.proj(x).view(-1, self.embed_dim) ++ x = self.norm(x) ++ return x + -+template<> -+inline float sum(Float8_ v) { -+ return v.x.x() + v.x.y() + v.y.x() + v.y.y() + v.z.x() + v.z.y() + v.w.x() + -+ v.w.y(); -+} + -+// Vector dot product. -+inline float dot(float a, float b) { -+ return a * b; -+} ++class DotsViTPreprocessor(nn.Module): + -+inline float dot(sycl::float2 a, sycl::float2 b) { -+ sycl::float2 c = mul(a, b); -+ return c.x() + c.y(); -+} ++ def __init__(self, config): ++ super().__init__() ++ self.patch_h = config.patch_size ++ self.patch_w = config.patch_size ++ self.embed_dim = config.embed_dim ++ self.config = config ++ self.patchifier = DotsPatchEmbed(config) + -+inline float dot(Float4_ a, Float4_ b) { -+ sycl::float2 acc = mul(a.x, b.x); -+ acc = fma(a.y, b.y, acc); -+ return acc.x() + acc.y(); -+} ++ def forward(self, x: torch.Tensor, grid_thw=None) -> torch.Tensor: ++ tokens = self.patchifier(x, grid_thw) ++ return tokens + -+inline float dot(Float8_ a, Float8_ b) { -+ sycl::float2 acc = mul(a.x, b.x); -+ acc = fma(a.y, b.y, acc); -+ acc = fma(a.z, b.z, acc); -+ acc = fma(a.w, b.w, acc); -+ return acc.x() + acc.y(); -+} + -+// From float to float. 
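For reference on the FFN above: the checkpoint ships separate fc1 (gate) and fc3 (up) projections, which load_weights maps onto one merged fc13 matrix as shard 0 and shard 1, and the forward pass applies silu(gate) * up before fc2. A minimal torch sketch of that layout, assuming plain weight matrices (no MergedColumnParallelLinear, no quantization):

    import torch
    import torch.nn.functional as F

    def merge_fc1_fc3(fc1_w: torch.Tensor, fc3_w: torch.Tensor) -> torch.Tensor:
        # Rows 0..H hold the gate (shard 0), rows H..2H the up projection (shard 1).
        return torch.cat([fc1_w, fc3_w], dim=0)

    def swiglu_ffn(x, fc13_w, fc2_w):
        gate_up = x @ fc13_w.t()             # [*, 2H], one fused GEMM
        gate, up = gate_up.chunk(2, dim=-1)  # what SiluAndMul splits internally
        return (F.silu(gate) * up) @ fc2_w.t()

    x = torch.randn(2, 16)
    fc1, fc3, fc2 = torch.randn(32, 16), torch.randn(32, 16), torch.randn(16, 32)
    assert swiglu_ffn(x, merge_fc1_fc3(fc1, fc3), fc2).shape == (2, 16)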
-+inline void from_float(float& dst, float src) { -+ dst = src; -+} ++class DotsVisionBlock(nn.Module): + -+inline void from_float(sycl::float2 &dst, sycl::float2 src) { -+ dst = src; -+} ++ def __init__(self, ++ config, ++ *, ++ quant_config: Optional[QuantizationConfig] = None, ++ prefix: str = ""): ++ super().__init__() + -+inline void from_float(sycl::float4 &dst, sycl::float4 src) { -+ dst = src; -+} ++ self.attn = DotsVisionAttention( ++ config, ++ config.embed_dim, ++ num_heads=config.num_attention_heads, ++ bias=config.use_bias, ++ quant_config=quant_config, ++ prefix=f"{prefix}.attn", ++ ) ++ self.norm1 = RMSNorm(config.embed_dim, eps=config.rms_norm_eps) ++ self.mlp = DotsSwiGLUFFN(config, ++ quant_config=quant_config, ++ prefix=f"{prefix}.mlp") ++ self.norm2 = RMSNorm(config.embed_dim, eps=config.rms_norm_eps) ++ ++ def forward(self, ++ hidden_states: torch.Tensor, ++ *, ++ cu_seqlens: torch.Tensor, ++ rotary_pos_emb: torch.Tensor, ++ max_seqlen: Optional[int] = None, ++ seqlens: Optional[list[int]] = None) -> torch.Tensor: ++ hidden_states = hidden_states + self.attn( ++ self.norm1(hidden_states), ++ cu_seqlens=cu_seqlens, ++ rotary_pos_emb=rotary_pos_emb, ++ max_seqlen=max_seqlen, ++ seqlens=seqlens, ++ ) ++ hidden_states = hidden_states + self.mlp(self.norm2(hidden_states)) ++ return hidden_states + -+// From float to float. -+inline float to_float(float u) { -+ return u; -+} + -+inline sycl::float2 to_float(sycl::float2 u) { -+ return u; -+} ++class DotsVisionTransformer(PreTrainedModel): + -+inline sycl::float4 to_float(sycl::float4 u) { -+ return u; -+} ++ def __init__( ++ self, ++ config: DotsVisionConfig, ++ quant_config: Optional[QuantizationConfig] = None, ++ *, ++ num_hidden_layers_override: Optional[int] = None, ++ require_post_norm: Optional[bool] = None, ++ prefix: str = "", ++ ) -> None: ++ super().__init__(config) ++ self.config = config ++ self.spatial_merge_size = config.spatial_merge_size ++ ++ self.patch_embed = DotsViTPreprocessor(config) ++ ++ head_dim = config.embed_dim // config.num_attention_heads ++ self.rotary_pos_emb = VisionRotaryEmbedding(head_dim // 2) ++ self.attn_backend = get_vit_attn_backend( ++ head_size=head_dim, dtype=torch.get_default_dtype()) ++ if self.attn_backend != _Backend.FLASH_ATTN and \ ++ check_upstream_fa_availability(torch.get_default_dtype()): ++ self.attn_backend = _Backend.FLASH_ATTN ++ ++ # Keep blocks for compatibility with other vision towers ++ num_layers = (config.num_hidden_layers if num_hidden_layers_override ++ is None else num_hidden_layers_override) ++ self.blocks = nn.ModuleList([ ++ DotsVisionBlock(config, ++ quant_config=quant_config, ++ prefix=f"{prefix}.blocks.{i}") ++ for i in range(num_layers) ++ ]) ++ if require_post_norm is None: ++ require_post_norm = (len(self.blocks) == config.num_hidden_layers) ++ if require_post_norm and self.config.post_norm: ++ self.post_trunk_norm = RMSNorm(config.embed_dim, ++ eps=config.rms_norm_eps) ++ else: ++ self.post_trunk_norm = None + -+inline Float4_ to_float(Float4_ u) { -+ return u; -+} ++ self.merger = PatchMerger( ++ dim=config.hidden_size, ++ context_dim=config.embed_dim, ++ spatial_merge_size=config.spatial_merge_size, ++ ) + -+inline Float8_ to_float(Float8_ u) { -+ return u; -+} ++ @property ++ def dtype(self) -> torch.dtype: ++ return self.patch_embed.patchifier.proj.weight.dtype ++ ++ @property ++ def device(self) -> torch.device: ++ return self.patch_embed.patchifier.proj.weight.device ++ ++ def get_pos_ids_by_grid(self, grid_thw): ++ pos_ids = [] ++ for t, h, w 
in grid_thw: ++ hpos_ids = torch.arange(h).unsqueeze(1).expand(-1, w) ++ hpos_ids = hpos_ids.reshape( ++ h // self.spatial_merge_size, ++ self.spatial_merge_size, ++ w // self.spatial_merge_size, ++ self.spatial_merge_size, ++ ) ++ hpos_ids = hpos_ids.permute(0, 2, 1, 3) ++ hpos_ids = hpos_ids.flatten() ++ ++ wpos_ids = torch.arange(w).unsqueeze(0).expand(h, -1) ++ wpos_ids = wpos_ids.reshape( ++ h // self.spatial_merge_size, ++ self.spatial_merge_size, ++ w // self.spatial_merge_size, ++ self.spatial_merge_size, ++ ) ++ wpos_ids = wpos_ids.permute(0, 2, 1, 3) ++ wpos_ids = wpos_ids.flatten() ++ pos_ids.append( ++ torch.stack([hpos_ids, wpos_ids], dim=-1).repeat(t, 1)) ++ ++ return pos_ids ++ ++ def rot_pos_emb(self, grid_thw): ++ pos_ids = self.get_pos_ids_by_grid(grid_thw) ++ pos_ids = torch.cat(pos_ids, dim=0) ++ max_grid_size = grid_thw[:, 1:].max() ++ rotary_pos_emb_full = self.rotary_pos_emb(max_grid_size) ++ rotary_pos_emb = rotary_pos_emb_full[pos_ids].flatten(1) ++ return rotary_pos_emb ++ ++ def compute_attn_mask_seqlen( ++ self, cu_seqlens: torch.Tensor ++ ) -> tuple[Optional[int], Optional[list[int]]]: ++ max_seqlen, seqlens = None, None ++ if self.attn_backend == _Backend.FLASH_ATTN: ++ max_seqlen = (cu_seqlens[1:] - cu_seqlens[:-1]).max().item() ++ elif self.attn_backend == _Backend.XFORMERS: ++ seqlens = (cu_seqlens[1:] - cu_seqlens[:-1]).tolist() ++ return max_seqlen, seqlens ++ ++ def forward(self, hidden_states: torch.Tensor, ++ grid_thw: torch.Tensor) -> torch.Tensor: ++ hidden_states = hidden_states.to(self.dtype) ++ hidden_states = self.patch_embed(hidden_states, grid_thw) ++ ++ rotary_pos_emb = self.rot_pos_emb(grid_thw) ++ ++ cu_seqlens = torch.repeat_interleave( ++ grid_thw[:, 1] * grid_thw[:, 2], grid_thw[:, 0]).cumsum( ++ dim=0, ++ dtype=grid_thw.dtype ++ if torch.jit.is_tracing() else torch.int32, ++ ) ++ cu_seqlens = F.pad(cu_seqlens, (1, 0), value=0) + -+// Zero-out a variable. 
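The vision-transformer forward above flattens all image patches into a single varlen batch: each (t, h, w) grid contributes t sequences of h*w patches, and their cumulative lengths become cu_seqlens for the varlen attention backends (max_seqlen for the flash-attention path, per-sequence lengths for xFormers). A small reproduction of that bookkeeping with a made-up grid:

    import torch
    import torch.nn.functional as F

    def build_cu_seqlens(grid_thw: torch.Tensor) -> torch.Tensor:
        # grid_thw: [num_images, 3] holding (t, h, w) per image.
        seq_lens = torch.repeat_interleave(grid_thw[:, 1] * grid_thw[:, 2],
                                           grid_thw[:, 0])
        cu_seqlens = seq_lens.cumsum(dim=0, dtype=torch.int32)
        return F.pad(cu_seqlens, (1, 0), value=0)   # [0, l1, l1+l2, ...]

    grid = torch.tensor([[1, 4, 6], [2, 2, 2]])     # a 4x6 image and a 2-frame 2x2 clip
    cu = build_cu_seqlens(grid)
    print(cu.tolist())                              # [0, 24, 28, 32]
    max_seqlen = (cu[1:] - cu[:-1]).max().item()    # flash-attention path
    seqlens = (cu[1:] - cu[:-1]).tolist()           # xFormers path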
-+inline void zero(float& dst) { -+ dst = 0.f; -+} ++ max_seqlen, seqlens = self.compute_attn_mask_seqlen(cu_seqlens) ++ for blk in self.blocks: ++ hidden_states = blk(hidden_states, ++ cu_seqlens=cu_seqlens, ++ rotary_pos_emb=rotary_pos_emb, ++ max_seqlen=max_seqlen, ++ seqlens=seqlens) + -+} // namespace vllm -\ No newline at end of file -diff --git a/csrc/xpu/fused_moe.cpp b/csrc/xpu/fused_moe.cpp -new file mode 100644 -index 000000000..3a39d0e13 ---- /dev/null -+++ b/csrc/xpu/fused_moe.cpp -@@ -0,0 +1,269 @@ -+#include "utils.h" -+#include "base.hpp" ++ if self.post_trunk_norm is not None: ++ hidden_states = self.post_trunk_norm(hidden_states) + -+using ST = at::ScalarType; ++ hidden_states = self.merger(hidden_states) ++ return hidden_states + -+#include -+#include "xpu_types.h" -+#include + -+template -+__inline__ T silu_xpu(const T& x) { -+ // x * sigmoid(x) -+ return (T)(((float)x) / (1.0f + sycl::exp((float)-x))); -+} ++@MULTIMODAL_REGISTRY.register_processor( ++ Qwen2VLMultiModalProcessor, ++ info=DotsOCRProcessingInfo, ++ dummy_inputs=DotsOCRDummyInputsBuilder, ++) ++class DotsOCRForCausalLM(nn.Module, SupportsMultiModal, SupportsPP): ++ hf_to_vllm_mapper = WeightsMapper( ++ orig_to_new_substr={ ++ ".attn.qkv_proj.": ".attn.qkv.", ++ ".attn.out_proj.": ".attn.proj.", ++ }, ++ orig_to_new_prefix={ ++ "lm_head.": "language_model.lm_head.", ++ "model.": "language_model.model.", ++ }, ++ ) + -+template -+void silu_and_mul_kernel( -+ scalar_t* __restrict__ out, // [..., d] -+ const scalar_t* __restrict__ input, // [..., 2, d] -+ const int d, -+ const sycl::nd_item<3>& item_ct1) { -+ const int64_t token_idx = item_ct1.get_group(2); -+ for (int64_t idx = item_ct1.get_local_id(2); idx < d; -+ idx += item_ct1.get_local_range(2)) { -+ const scalar_t x = input[token_idx * 2 * d + idx]; -+ const scalar_t y = input[token_idx * 2 * d + d + idx]; -+ out[token_idx * d + idx] = silu_xpu(x) * y; -+ } -+} ++ @classmethod ++ def get_placeholder_str(cls, modality: str, i: int) -> Optional[str]: ++ if modality.startswith("image"): ++ return "<|img|><|imgpad|><|endofimg|>" + -+template -+void call_silu_and_mul_kernel( -+ int num_tokens, -+ int d, -+ const scalar_t* __restrict__ input, -+ scalar_t* __restrict__ output) { -+ using sycl_t = vllm::xpu::SyclTypeTrait::Type; -+ sycl::range<3> grid(1, 1, num_tokens); -+ sycl::range<3> block(1, 1, std::min(d, 1024)); -+ auto& queue = vllm::xpu::vllmGetQueue(); -+ queue.submit([&](sycl::handler& cgh) { -+ cgh.parallel_for( -+ sycl::nd_range<3>(grid * block, block), [=](sycl::nd_item<3> item_ct1) { -+ silu_and_mul_kernel( -+ (sycl_t*)output, (const sycl_t*)input, d, item_ct1); -+ }); -+ }); -+} ++ def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): ++ super().__init__() + -+void _silu_and_mul(torch::Tensor& out, torch::Tensor& input) { -+ int num_tokens = input.numel() / input.size(-1); -+ int d = input.size(-1) / 2; -+ -+ VLLM_XPU_DISPATCH_FLOATING_TYPES( -+ input.scalar_type(), "call_silu_and_mul_kernel", [&] { -+ call_silu_and_mul_kernel( -+ num_tokens, -+ d, -+ input.data_ptr(), -+ out.data_ptr()); -+ }); -+} ++ self.config: DotsOCRConfig = vllm_config.model_config.hf_config ++ self.quant_config = vllm_config.quant_config ++ self.multimodal_config = vllm_config.model_config.multimodal_config + -+template -+static void moe_forward_kernel( -+ const void* input_ptr, -+ const int64_t* indexs, -+ const uint64_t* qweights, -+ void * output_ptr, -+ const int num_tokens, -+ const int state_size, -+ const int output_size, -+ at::Device device -+) { -+ 
static_assert(ES == 8 || ES == 16 || ES == 32); -+ assert(output_size % VS == 0); -+ -+ const int nb = state_size / QK; -+ const int nsb = nb / SBS; -+ -+ constexpr int BLOCK_SIZE = BLOCK_SIZES[QTYPE]; -+ constexpr int SCALE_SIZE = SCALE_SIZES[QTYPE]; -+ -+ sycl::range<2> global_size(num_tokens, output_size / VS * GS); -+ sycl::range<2> local_size(1, GS); -+ -+ auto cgf = [&](sycl::handler& handle) { -+ handle.parallel_for( -+ sycl::nd_range<2>(global_size, local_size), -+ [=](sycl::nd_item<2> item) SYCL_ESIMD_KERNEL { -+ slm_init(); -+ -+ const int eid = item.get_global_id(0); -+ const int tid = item.get_local_id(1); -+ const int vid = item.get_group(1) * VS; -+ -+ if (indexs[eid] >= 0) { -+ const uint8_t* weight = (const uint8_t *)(qweights[indexs[eid]]); -+ const uint8_t* scales = weight + (int64_t)output_size * nb * BLOCK_SIZE; -+ const IT* input = static_cast(input_ptr) + eid * state_size; -+ IT* output = static_cast(output_ptr) + eid * output_size; -+ -+ const uint8_t * weight_base = weight + nb * BLOCK_SIZE * vid; -+ const uint8_t * scale_base = scales + nb * SCALE_SIZE * vid; -+ -+ simd accvs{}; -+ -+ for (int s = tid; s < nsb; s += GS) { -+ simd xvs = block_load(input + s * SBS * QK); -+ -+ #pragma unroll -+ for (int v = 0; v < VS; ++v) { -+ simd yvs = load_qblocks( -+ weight_base + v * nb * BLOCK_SIZE + s * SBS * BLOCK_SIZE, -+ scale_base + v * nb * SCALE_SIZE + s * SBS * SCALE_SIZE -+ ); -+ -+ #pragma unroll -+ for (int i = 0; i < SBS * QK; i += ES) { -+ accvs.template select(v * ES) += -+ xvs.template select(i) * -+ yvs.template select(i); -+ } -+ } -+ } -+ -+ for (int b = nsb * SBS + tid; b < nb; b += GS) { -+ simd xv = block_load(input + b * QK); -+ -+ #pragma unroll -+ for (int v = 0; v < VS; ++v) { -+ simd yv = load_qblock( -+ weight_base + v * nb * BLOCK_SIZE + b * BLOCK_SIZE, -+ scale_base + v * nb * SCALE_SIZE + b * SCALE_SIZE -+ ); -+ -+ #pragma unroll -+ for (int i = 0; i < QK; i += ES) { -+ accvs.template select(v * ES) += -+ xv.template select(i) * -+ yv.template select(i); -+ } -+ } -+ } -+ -+ simd accs; -+ #pragma unroll -+ for(int v = 0; v < VS; ++v) { -+ accs[v] = sycl::ext::intel::esimd::detail::sum( -+ accvs.template select(v * ES) -+ ); -+ } -+ -+ slm_block_store(tid * VS * sizeof(float), accs); -+ -+ barrier(); -+ -+ if (tid == 0) { -+ #pragma unroll -+ for (int i = 1; i < GS; ++i) { -+ accs += slm_block_load(i * VS * sizeof(float)); -+ } -+ -+ block_store(output + vid, accs); -+ } -+ } -+ -+ -+ } -+ ); -+ }; ++ if isinstance(self.config.vision_config, dict): ++ vision_config = DotsVisionConfig(**self.config.vision_config) ++ self.config.vision_config = vision_config ++ else: ++ vision_config = self.config.vision_config + -+ utils::submit_kernel(cgf, device, "moe forward down kernel"); -+} ++ self.vision_tower = DotsVisionTransformer( ++ vision_config, ++ quant_config=self.quant_config, ++ prefix=maybe_prefix(prefix, "vision_tower"), ++ ) ++ self.language_model: Qwen2ForCausalLM = init_vllm_registered_model( ++ vllm_config=vllm_config, ++ hf_config=self.config, ++ prefix=maybe_prefix(prefix, "language_model"), ++ architectures=["Qwen2ForCausalLM"], ++ ) + ++ def _validate_and_reshape_mm_tensor(self, mm_input: object, ++ name: str) -> torch.Tensor: ++ if not isinstance(mm_input, (torch.Tensor, list)): ++ raise ValueError(f"Incorrect type of {name}. " ++ f"Got type: {type(mm_input)}") ++ if isinstance(mm_input, torch.Tensor): ++ if mm_input.ndim == 2: ++ return mm_input ++ if mm_input.ndim != 3: ++ raise ValueError(f"{name} should be 2D or batched 3D tensor. 
" ++ f"Got ndim: {mm_input.ndim} " ++ f"(shape={mm_input.shape})") ++ return torch.concat(list(mm_input)) ++ else: ++ return torch.concat(mm_input) + -+template -+static auto dispatch_moe_forward(ST scalar_t) { -+ switch (scalar_t) { -+ case ST::Float: return std::make_tuple(moe_forward_kernel); -+ case ST::Half: return std::make_tuple(moe_forward_kernel); -+ default: throw std::runtime_error("unsupported dtype, only fp32 and fp16 are supported"); -+ } -+} ++ def _parse_and_validate_image_input( ++ self, **kwargs: object) -> Optional[DotsOCRImageInputs]: ++ pixel_values = kwargs.pop("pixel_values", None) ++ image_embeds = kwargs.pop("image_embeds", None) ++ image_grid_thw = kwargs.pop("image_grid_thw", None) + ++ if pixel_values is None and image_embeds is None: ++ return None + -+torch::Tensor moe_forward( -+ torch::Tensor input, -+ torch::Tensor indexs, -+ torch::Tensor qweights_attr, -+ int64_t state_size, -+ int64_t output_size, -+ int64_t qtype -+) { -+ auto [func] = [&] () { -+ switch (qtype) { -+ case GGML_TYPE_Q4_0: -+ return dispatch_moe_forward(input.scalar_type()); -+ case GGML_TYPE_Q4_0_WOQ: -+ return dispatch_moe_forward(input.scalar_type()); -+ case GGML_TYPE_FP8E5: -+ return dispatch_moe_forward(input.scalar_type()); -+ default: throw std::runtime_error("unsupported qtype: " + std::to_string(qtype)); -+ } -+ } (); ++ if pixel_values is not None: ++ pixel_values = self._validate_and_reshape_mm_tensor( ++ pixel_values, "image pixel values") ++ image_grid_thw = self._validate_and_reshape_mm_tensor( ++ image_grid_thw, "image grid_thw") + -+ int64_t num_tokens = indexs.numel(); ++ if not isinstance(pixel_values, (torch.Tensor, list)): ++ raise ValueError("Incorrect type of image pixel values. " ++ f"Got type: {type(pixel_values)}") + -+ torch::Tensor output = torch::zeros({num_tokens, output_size}, -+ torch::device(input.device()).dtype(input.dtype())); ++ return DotsOCRImagePixelInputs(type="pixel_values", ++ pixel_values=pixel_values, ++ image_grid_thw=image_grid_thw) + -+ func( -+ input.data_ptr(), indexs.data_ptr(), -+ qweights_attr.data_ptr(), output.data_ptr(), -+ num_tokens, state_size, output_size, input.device() -+ ); ++ if image_embeds is not None: ++ image_embeds = self._validate_and_reshape_mm_tensor( ++ image_embeds, "image embeds") ++ image_grid_thw = self._validate_and_reshape_mm_tensor( ++ image_grid_thw, "image grid_thw") + -+ return output; -+} ++ if not isinstance(image_embeds, torch.Tensor): ++ raise ValueError("Incorrect type of image embeddings. 
" ++ f"Got type: {type(image_embeds)}") ++ return DotsOCRImageEmbeddingInputs(type="image_embeds", ++ image_embeds=image_embeds, ++ image_grid_thw=image_grid_thw) + ++ def _process_image_input( ++ self, image_input: DotsOCRImageInputs) -> tuple[torch.Tensor, ...]: ++ grid_thw = image_input["image_grid_thw"] ++ assert grid_thw.ndim == 2 ++ grid_thw_list = grid_thw.tolist() + -+torch::Tensor fused_moe_forward( -+ torch::Tensor input, -+ torch::Tensor indexs, -+ torch::Tensor qweights1_attr, -+ torch::Tensor qweights2_attr, -+ int64_t hidden_size, -+ int64_t intermediate_size, -+ int64_t qtype -+) { -+ auto [gmm_func] = [&] () { -+ switch (qtype) { -+ case GGML_TYPE_Q4_0: -+ return dispatch_moe_forward(input.scalar_type()); -+ case GGML_TYPE_Q4_0_WOQ: -+ return dispatch_moe_forward(input.scalar_type()); -+ case GGML_TYPE_FP8E5: -+ return dispatch_moe_forward(input.scalar_type()); -+ default: throw std::runtime_error("unsupported qtype: " + std::to_string(qtype)); -+ } -+ } (); ++ if image_input["type"] == "image_embeds": ++ image_embeds = image_input["image_embeds"].type( ++ self.vision_tower.dtype) ++ else: ++ pixel_values = image_input["pixel_values"].type( ++ self.vision_tower.dtype) ++ image_embeds = self.vision_tower( ++ pixel_values, grid_thw)[:, :self.config.hidden_size] + -+ int64_t num_tokens = indexs.numel(); ++ # Split concatenated embeddings for each image item. ++ merge_size = self.vision_tower.spatial_merge_size ++ sizes = (torch.tensor(grid_thw_list, dtype=torch.long).prod(-1) // ++ (merge_size * merge_size)).tolist() + -+ torch::Tensor w1_output = torch::zeros({num_tokens, intermediate_size * 2}, -+ torch::device(input.device()).dtype(input.dtype())); -+ -+ torch::Tensor tmp = torch::zeros({num_tokens, intermediate_size}, -+ torch::device(input.device()).dtype(input.dtype())); -+ -+ torch::Tensor w2_output = torch::zeros({num_tokens, hidden_size}, -+ torch::device(input.device()).dtype(input.dtype())); ++ return image_embeds.split(sizes) + -+ gmm_func( -+ input.data_ptr(), indexs.data_ptr(), -+ qweights1_attr.data_ptr(), w1_output.data_ptr(), -+ num_tokens, hidden_size, intermediate_size * 2, input.device() -+ ); ++ def get_language_model(self) -> torch.nn.Module: ++ return self.language_model + -+ _silu_and_mul(tmp, w1_output); ++ def get_multimodal_embeddings( ++ self, **kwargs: object) -> Optional[MultiModalEmbeddings]: ++ image_input = self._parse_and_validate_image_input(**kwargs) ++ if image_input is None: ++ return [] ++ vision_embeddings = self._process_image_input(image_input) ++ return vision_embeddings + -+ gmm_func( -+ tmp.data_ptr(), indexs.data_ptr(), -+ qweights2_attr.data_ptr(), w2_output.data_ptr(), -+ num_tokens, intermediate_size, hidden_size, input.device() -+ ); ++ def get_input_embeddings( ++ self, ++ input_ids: torch.Tensor, ++ multimodal_embeddings: Optional[MultiModalEmbeddings] = None, ++ ) -> torch.Tensor: ++ inputs_embeds = self.language_model.get_input_embeddings(input_ids) ++ if multimodal_embeddings is not None: ++ inputs_embeds = merge_multimodal_embeddings( ++ input_ids, ++ inputs_embeds, ++ multimodal_embeddings, ++ self.config.image_token_id, ++ ) + -+ return w2_output; -+} -diff --git a/csrc/xpu/gemm_kernels_xpu.cpp b/csrc/xpu/gemm_kernels_xpu.cpp -new file mode 100644 -index 000000000..d96aa5880 ---- /dev/null -+++ b/csrc/xpu/gemm_kernels_xpu.cpp -@@ -0,0 +1,125 @@ -+/* -+Adapted from https://github.com/mit-han-lab/llm-awq -+@article{lin2023awq, -+ title={AWQ: Activation-aware Weight Quantization for LLM Compression and -+Acceleration}, 
author={Lin, Ji and Tang, Jiaming and Tang, Haotian and Yang, -+Shang and Dang, Xingyu and Han, Song}, journal={arXiv}, year={2023} -+} -+ */ -+ -+#include -+#include -+#include -+//#include -+#include "dequantize.h" -+#include "utils.h" -+#include "xpu_types.h" -+ -+void awq_dequantize_impl( -+ int* __restrict__ input, -+ sycl::half* __restrict__ scaling_factors, -+ int* __restrict__ zeros, -+ sycl::half* __restrict__ output, -+ int G, -+ sycl::nd_item<3> item_ct1) { -+ int j_factors1 = 4; -+ int row_stride2 = 4; -+ int split_k_iters = 1; -+ sycl::half2 ZERO_HALF2{0, 0}; -+ sycl::half input_shared[8]; -+ -+ int N = item_ct1.get_local_range(2) * item_ct1.get_group_range(2); -+ int col = item_ct1.get_group(2) * item_ct1.get_local_range(2) + -+ item_ct1.get_local_id(2); -+ int row = item_ct1.get_group(1) * item_ct1.get_local_range(1) + -+ item_ct1.get_local_id(1); -+ int index1 = 8 * col + 8 * row * N; -+ sycl::half* output_ptr2 = output + index1; -+ -+ int index2 = col + row * N; -+ int* input_ptr2 = input + index2; -+ -+ int index3 = col + (int)(row / G) * N; -+ int* zeros_ptr2 = zeros + index3; -+ int index4 = 8 * col + (int)(row / G) * N * 8; -+ sycl::half* scale_loaded = scaling_factors + index4; -+ -+ uint32_t zeros_loaded = *(uint32_t*)(zeros_ptr2); -+ sycl::uint4 zero_loaded_u4 = vllm::awq::dequantize_s4_to_fp16x2(zeros_loaded); -+ // sycl::uint4 scale_loaded_u4 = *(sycl::uint4*)(scaling_factors_ptr2); -+ // int j = 0; -+ -+ uint32_t input_loaded = *(uint32_t*)(input_ptr2); -+ sycl::uint4 input_loaded_fp16 = -+ vllm::awq::dequantize_s4_to_fp16x2(input_loaded); -+ -+ sycl::half2* input_loaded_h2 = (sycl::half2*)(&input_loaded_fp16); -+ sycl::half2* zero_loaded_h2 = (sycl::half2*)(&zero_loaded_u4); -+ sycl::half2* scale_loaded_h2 = (sycl::half2*)scale_loaded; -+ for (int i = 0; i < 4; i++) { -+ input_loaded_h2[i] = sycl_half_sub2(input_loaded_h2[i], zero_loaded_h2[i]); -+ input_loaded_h2[i] = -+ sycl_half_fma2(input_loaded_h2[i], scale_loaded_h2[i], ZERO_HALF2); -+ } -+ *(sycl::uint4*)(input_shared) = input_loaded_fp16; -+ -+ for (int i = 0; i < 8; ++i) { -+ *(output_ptr2 + i) = input_shared[i]; -+ } -+} ++ return inputs_embeds + -+torch::Tensor awq_dequantize( -+ torch::Tensor _kernel, -+ torch::Tensor _scaling_factors, -+ torch::Tensor _zeros, -+ int split_k_iters, -+ int thx, -+ int thy) { -+ int in_c = _kernel.size(0); -+ int qout_c = _kernel.size(1); -+ int out_c = qout_c * 8; -+ int G = in_c / _scaling_factors.size(0); -+ -+ int x_thread = thx; -+ int y_thread = thy; -+ -+ int x_blocks = 1; -+ int y_blocks = 1; -+ if (thx == 0) { -+ x_thread = qout_c; -+ } -+ if (thy == 0) { -+ y_thread = in_c; -+ } -+ if (thx == 0 && thy == 0) { -+ x_thread = 8; -+ y_thread = 8; -+ x_blocks = (int)(qout_c / 8); -+ y_blocks = (int)(in_c / 8); -+ } -+ -+ auto options = torch::TensorOptions() -+ .dtype(_scaling_factors.dtype()) -+ .device(_scaling_factors.device()); -+ at::Tensor _de_kernel = torch::empty({in_c, out_c}, options); -+ auto kernel = reinterpret_cast(_kernel.data_ptr()); -+ auto de_kernel = -+ reinterpret_cast(_de_kernel.data_ptr()); -+ auto scaling_factors = -+ reinterpret_cast(_scaling_factors.data_ptr()); -+ auto zeros = reinterpret_cast(_zeros.data_ptr()); -+ -+ sycl::range<3> num_blocks(1, y_blocks, x_blocks); -+ sycl::range<3> threads_per_block(1, y_thread, x_thread); -+ auto& queue = vllm::xpu::vllmGetQueue(); -+ -+ queue.submit([&](sycl::handler& cgh) { -+ cgh.parallel_for( -+ sycl::nd_range<3>(num_blocks * threads_per_block, threads_per_block), -+ [=](sycl::nd_item<3> 
item_ct1) { -+ awq_dequantize_impl( -+ kernel, scaling_factors, zeros, de_kernel, G, item_ct1); -+ }); -+ }); -+ return _de_kernel; -+} -\ No newline at end of file -diff --git a/csrc/xpu/kv.h b/csrc/xpu/kv.h -new file mode 100644 -index 000000000..9616ad7ef ---- /dev/null -+++ b/csrc/xpu/kv.h -@@ -0,0 +1,76 @@ -+#pragma once -+ -+#include -+#include -+ -+using fp16 = sycl::half; -+ -+constexpr uint8_t FP16_EXP_OFFSET = 15; -+constexpr uint8_t K_EXP_OFFSET = 9; -+constexpr uint8_t V_EXP_OFFSET = 12; -+constexpr uint8_t K_OFFSET = (FP16_EXP_OFFSET - K_EXP_OFFSET) << 3; -+constexpr uint8_t V_OFFSET = (FP16_EXP_OFFSET - V_EXP_OFFSET) << 3; -+constexpr uint16_t K_MAX = -+ (uint16_t)0x3FC0 + ((uint16_t)(FP16_EXP_OFFSET - K_EXP_OFFSET) << 10); -+constexpr uint16_t K_MIN = -+ (uint16_t)0x0040 + ((uint16_t)(FP16_EXP_OFFSET - K_EXP_OFFSET) << 10); -+constexpr uint16_t V_MAX = -+ (uint16_t)0x3FC0 + ((uint16_t)(FP16_EXP_OFFSET - V_EXP_OFFSET) << 10); -+constexpr uint16_t V_MIN = -+ (uint16_t)0x0040 + ((uint16_t)(FP16_EXP_OFFSET - V_EXP_OFFSET) << 10); -+ -+template -+ESIMD_INLINE __ESIMD_NS::simd quantize_key_row( -+ __ESIMD_NS::simd key_row) { -+ const __ESIMD_NS::simd kmax = sycl::bit_cast(K_MAX); -+ const __ESIMD_NS::simd kmin = sycl::bit_cast(K_MIN); -+ __ESIMD_NS::simd key = -+ __ESIMD_NS::max(__ESIMD_NS::min(__ESIMD_NS::abs(key_row), kmax), kmin); -+ key.template bit_cast_view() <<= 1; -+ __ESIMD_NS::simd sign = -+ key_row.template bit_cast_view().template select(1) & -+ (uint8_t)0x80; -+ return (key.template bit_cast_view().template select(1) - -+ K_OFFSET) | -+ sign; -+} ++ def forward( ++ self, ++ input_ids: Optional[torch.Tensor], ++ positions: torch.Tensor, ++ intermediate_tensors: Optional[IntermediateTensors] = None, ++ inputs_embeds: Optional[torch.Tensor] = None, ++ **kwargs, ++ ) -> Union[torch.Tensor, IntermediateTensors]: ++ if intermediate_tensors is not None: ++ inputs_embeds = None ++ elif inputs_embeds is None and kwargs.get("pixel_values") is not None: ++ image_input = self._parse_and_validate_image_input(**kwargs) ++ if image_input is None: ++ inputs_embeds = None ++ else: ++ assert input_ids is not None ++ inputs_embeds = self.get_multimodal_embeddings( ++ input_ids, ++ image_input=image_input, ++ ) ++ input_ids = None + -+template -+ESIMD_INLINE __ESIMD_NS::simd quantize_value_row( -+ __ESIMD_NS::simd value_row) { -+ const __ESIMD_NS::simd vmax = sycl::bit_cast(V_MAX); -+ const __ESIMD_NS::simd vmin = sycl::bit_cast(V_MIN); -+ __ESIMD_NS::simd value = -+ __ESIMD_NS::max(__ESIMD_NS::min(__ESIMD_NS::abs(value_row), vmax), vmin); -+ value.template bit_cast_view() <<= 1; -+ __ESIMD_NS::simd sign = -+ value_row.template bit_cast_view().template select(1) & -+ (uint8_t)0x80; -+ return (value.template bit_cast_view().template select(1) - -+ V_OFFSET) | -+ sign; -+} ++ hidden_states = self.language_model( ++ input_ids=input_ids, ++ positions=positions, ++ intermediate_tensors=intermediate_tensors, ++ inputs_embeds=inputs_embeds, ++ ) + -+template -+ESIMD_INLINE __ESIMD_NS::simd dequantize_key_row( -+ const __ESIMD_NS::simd& key_row) { -+ __ESIMD_NS::simd result = 0x80; -+ result.template bit_cast_view().template select(1) = -+ (key_row & (uint8_t)0x7F) + K_OFFSET; -+ result >>= 1; -+ __ESIMD_NS::simd sign = key_row & (uint8_t)0x80; -+ result.template bit_cast_view().template select(1) |= sign; -+ return result.template bit_cast_view(); -+} ++ return hidden_states + -+template -+ESIMD_INLINE __ESIMD_NS::simd dequantize_value_row( -+ const __ESIMD_NS::simd& value_row) { -+ 
__ESIMD_NS::simd result = 0x80; -+ result.template bit_cast_view().template select(1) = -+ (value_row & (uint8_t)0x7F) + V_OFFSET; -+ result >>= 1; -+ __ESIMD_NS::simd sign = value_row & (uint8_t)0x80; -+ result.template bit_cast_view().template select(1) |= sign; -+ return result.template bit_cast_view(); -+} -\ No newline at end of file -diff --git a/csrc/xpu/layernorm_xpu.cpp b/csrc/xpu/layernorm_xpu.cpp -new file mode 100644 -index 000000000..9a6a2af0a ---- /dev/null -+++ b/csrc/xpu/layernorm_xpu.cpp -@@ -0,0 +1,188 @@ -+// clang-format off -+#ifdef VLLM_DEV -+#undef __SYCL_DEVICE_ONLY__ -+#endif -+#include -+#include -+ -+#include -+#include -+#include "utils.h" -+#include "xpu_types.h" -+#include "reduction_utils.h" -+ -+namespace vllm { -+ -+template -+void rms_norm_kernel( -+ scalar_t* __restrict__ out, // [..., hidden_size] -+ const scalar_t* __restrict__ input, // [..., hidden_size] -+ const scalar_t* __restrict__ weight, // [hidden_size] -+ const float epsilon, -+ const int num_tokens, -+ const int hidden_size, -+ const sycl::nd_item<3>& item_ct1, -+ float* s_variance, -+ float* shared_vals) { -+ float variance = 0.0f; -+ -+ for (int idx = item_ct1.get_local_id(2); idx < hidden_size; -+ idx += item_ct1.get_local_range(2)) { -+ const float x = (float)input[item_ct1.get_group(2) * hidden_size + idx]; -+ variance += x * x; -+ } -+ -+ variance = blockReduceSum(variance, item_ct1, shared_vals); -+ if (item_ct1.get_local_id(2) == 0) { -+ *s_variance = sycl::rsqrt(variance / hidden_size + epsilon); -+ } -+ -+ // item_ct1.barrier(); -+ item_ct1.barrier(sycl::access::fence_space::local_space); -+ -+ for (int idx = item_ct1.get_local_id(2); idx < hidden_size; -+ idx += item_ct1.get_local_range(2)) { -+ float x = (float)input[item_ct1.get_group(2) * hidden_size + idx]; -+ out[item_ct1.get_group(2) * hidden_size + idx] = -+ ((scalar_t)(x * (*s_variance))) * weight[idx]; -+ } -+} ++ # def compute_logits( ++ # self, ++ # hidden_states: torch.Tensor, ++ # ) -> Optional[torch.Tensor]: ++ # return self.language_model.compute_logits(hidden_states) + -+template -+void call_rms_norm_kernel( -+ torch::Tensor& out, -+ torch::Tensor& input, -+ torch::Tensor& weight, -+ float epsilon) { -+ using sycl_t = vllm::xpu::SyclTypeTrait::Type; -+ int hidden_size = input.size(-1); -+ int num_tokens = input.numel() / hidden_size; -+ auto out_ptr = out.data_ptr(); -+ auto input_ptr = input.data_ptr(); -+ auto weight_ptr = weight.data_ptr(); -+ sycl::range<3> grid(1, 1, num_tokens); -+ sycl::range<3> block(1, 1, std::min(hidden_size, 1024)); -+ auto& queue = vllm::xpu::vllmGetQueue(); -+ queue.submit([&](sycl::handler& cgh) { -+ sycl::local_accessor shared_vals( sycl::range<1>(32), cgh); -+ sycl::local_accessor s_variance( sycl::range<1>(1), cgh); -+ cgh.parallel_for( -+ sycl::nd_range<3>(grid * block, block), -+ [=](sycl::nd_item<3> item_ct1) [[intel::reqd_sub_group_size(32)]] { -+ rms_norm_kernel( -+ (sycl_t*)out_ptr, -+ (const sycl_t*)input_ptr, -+ (const sycl_t*)weight_ptr, -+ epsilon, -+ num_tokens, -+ hidden_size, -+ item_ct1, -+ s_variance.get_pointer(), -+ shared_vals.get_pointer()); -+ }); -+ }); -+} ++ from vllm.v1.sample.metadata import SamplingMetadata ++ def compute_logits( ++ self, ++ hidden_states: torch.Tensor, ++ sampling_metadata: Optional[SamplingMetadata] = None, ++ ) -> Optional[torch.Tensor]: ++ return self.language_model.compute_logits(hidden_states, sampling_metadata) + ++ def load_weights(self, weights: Iterable[tuple[str, ++ torch.Tensor]]) -> set[str]: ++ loader = AutoWeightsLoader(self) 
++ return loader.load_weights(weights, mapper=self.hf_to_vllm_mapper) +diff --git a/vllm/model_executor/models/ernie45_vl.py b/vllm/model_executor/models/ernie45_vl.py +index 97aace5a2..bcff65a71 100644 +--- a/vllm/model_executor/models/ernie45_vl.py ++++ b/vllm/model_executor/models/ernie45_vl.py +@@ -34,6 +34,7 @@ import torch.nn.functional as F + from einops import rearrange, repeat + from transformers import BatchFeature + ++from vllm.attention.layer import check_upstream_fa_availability + from vllm.config import VllmConfig + from vllm.distributed import parallel_state + from vllm.distributed import utils as dist_utils +@@ -170,7 +171,16 @@ class Ernie4_5_VisionAttention(nn.Module): + prefix=f"{prefix}.proj") + + # Detect attention implementation. +- self.attn_backend: _Backend = get_vit_attn_backend(support_fa=True) ++ self.attn_backend = get_vit_attn_backend( ++ head_size=self.hidden_size_per_attention_head, ++ dtype=torch.get_default_dtype()) + -+template -+void fused_add_rms_norm_kernel( -+ scalar_t* __restrict__ input, // [..., hidden_size] -+ scalar_t* __restrict__ residual, // [..., hidden_size] -+ const scalar_t* __restrict__ weight, // [hidden_size] -+ const float epsilon, -+ const int num_tokens, -+ const int hidden_size, -+ const sycl::nd_item<3>& item_ct1, -+ float* s_variance, -+ float* shared_vals) { -+ float variance = 0.0f; -+ -+ for (int idx = item_ct1.get_local_id(2); idx < hidden_size; -+ idx += item_ct1.get_local_range(2)) { -+ float x = (float)input[item_ct1.get_group(2) * hidden_size + idx]; -+ x+=(float)residual[item_ct1.get_group(2) * hidden_size + idx]; -+ variance += x * x; -+ residual[item_ct1.get_group(2) * hidden_size + idx] = (scalar_t)x; -+ } -+ -+ variance = blockReduceSum(variance, item_ct1, shared_vals); -+ if (item_ct1.get_local_id(2) == 0) { -+ *s_variance = sycl::rsqrt(variance / hidden_size + epsilon); -+ } -+ -+ // item_ct1.barrier(); -+ item_ct1.barrier(sycl::access::fence_space::local_space); -+ -+ for (int idx = item_ct1.get_local_id(2); idx < hidden_size; -+ idx += item_ct1.get_local_range(2)) { -+ float x = (float)residual[item_ct1.get_group(2) * hidden_size + idx]; -+ input[item_ct1.get_group(2) * hidden_size + idx] = -+ ((scalar_t)(x * (*s_variance))) * weight[idx]; -+ } -+} ++ self.use_upstream_fa = False ++ if self.attn_backend != _Backend.FLASH_ATTN and \ ++ check_upstream_fa_availability(torch.get_default_dtype()): ++ self.attn_backend = _Backend.FLASH_ATTN ++ self.use_upstream_fa = True + -+template -+void call_fused_add_rms_norm_kernel( -+ torch::Tensor& input, -+ torch::Tensor& residual, -+ torch::Tensor& weight, -+ float epsilon){ -+ using sycl_t = vllm::xpu::SyclTypeTrait::Type; -+ int hidden_size = input.size(-1); -+ int num_tokens = input.numel() / hidden_size; -+ auto input_ptr = input.data_ptr(); -+ auto residual_ptr = residual.data_ptr(); -+ auto weight_ptr = weight.data_ptr(); -+ sycl::range<3> grid(1, 1, num_tokens); -+ sycl::range<3> block(1, 1, std::min(hidden_size, 1024)); -+ auto& queue = vllm::xpu::vllmGetQueue(); -+ queue.submit([&](sycl::handler& cgh) { -+ sycl::local_accessor shared_vals( sycl::range<1>(32), cgh); -+ sycl::local_accessor s_variance( sycl::range<1>(1), cgh); -+ cgh.parallel_for( -+ sycl::nd_range<3>(grid * block, block), [=](sycl::nd_item<3> item_ct1)[[intel::reqd_sub_group_size(32)]] { -+ fused_add_rms_norm_kernel( -+ (sycl_t*)input_ptr, -+ (sycl_t*)residual_ptr, -+ (const sycl_t*)weight_ptr, -+ epsilon, -+ num_tokens, -+ hidden_size, -+ item_ct1, -+ s_variance.get_pointer(), -+ 
shared_vals.get_pointer()); -+ }); -+ }); -+} + if self.attn_backend not in { + _Backend.FLASH_ATTN, _Backend.TORCH_SDPA, _Backend.XFORMERS, + _Backend.ROCM_AITER_FA +@@ -233,7 +243,10 @@ class Ernie4_5_VisionAttention(nn.Module): + if self.attn_backend == _Backend.ROCM_AITER_FA: + from aiter import flash_attn_varlen_func + else: +- from flash_attn import flash_attn_varlen_func ++ if self.use_upstream_fa: ++ from flash_attn import flash_attn_varlen_func ++ else: ++ from vllm.vllm_flash_attn import flash_attn_varlen_func + + q, k, v = (rearrange(x, "b s ... -> (b s) ...") for x in [q, k, v]) + +@@ -457,7 +470,11 @@ class Ernie4_5_VisionTransformer(nn.Module): + ), "vit's config.hidden must be equal to config.embed_dim" + self.ln = nn.LayerNorm(hidden_size, eps=1e-6) + +- self.attn_backend: _Backend = get_vit_attn_backend(support_fa=True) ++ self.attn_backend = get_vit_attn_backend( ++ head_size=head_dim, dtype=torch.get_default_dtype()) ++ if self.attn_backend != _Backend.FLASH_ATTN and \ ++ check_upstream_fa_availability(torch.get_default_dtype()): ++ self.attn_backend = _Backend.FLASH_ATTN + + @property + def dtype(self) -> torch.dtype: +diff --git a/vllm/model_executor/models/glm4_1v.py b/vllm/model_executor/models/glm4_1v.py +index 539381b61..279f458df 100644 +--- a/vllm/model_executor/models/glm4_1v.py ++++ b/vllm/model_executor/models/glm4_1v.py +@@ -44,6 +44,7 @@ from transformers.models.glm4v.video_processing_glm4v import ( + Glm4vVideoProcessor) + from transformers.video_utils import VideoMetadata + ++from vllm.attention.layer import check_upstream_fa_availability + from vllm.config import VllmConfig + from vllm.distributed import (get_tensor_model_parallel_world_size, + parallel_state) +@@ -260,7 +261,15 @@ class Glm4vVisionAttention(nn.Module): + ) + + # Detect attention implementation. +- self.attn_backend: _Backend = get_vit_attn_backend(support_fa=True) ++ self.attn_backend = get_vit_attn_backend( ++ head_size=self.hidden_size_per_attention_head, ++ dtype=torch.get_default_dtype()) ++ self.use_upstream_fa = False ++ if self.attn_backend != _Backend.FLASH_ATTN and \ ++ check_upstream_fa_availability(torch.get_default_dtype()): ++ self.attn_backend = _Backend.FLASH_ATTN ++ self.use_upstream_fa = True + -+} // namespace vllm -+ -+void rms_norm( -+ torch::Tensor& out, -+ torch::Tensor& input, -+ torch::Tensor& weight, -+ float epsilon) { -+ VLLM_XPU_DISPATCH_FLOATING_TYPES( -+ input.scalar_type(), "call_rms_norm_kernel", [&] { -+ vllm::call_rms_norm_kernel(out, input, weight, epsilon); -+ }); -+} + if self.attn_backend not in { + _Backend.FLASH_ATTN, + _Backend.TORCH_SDPA, +@@ -310,7 +319,10 @@ class Glm4vVisionAttention(nn.Module): + if self.attn_backend == _Backend.FLASH_ATTN: + # from vllm_flash_attn.flash_attn_interface import ( + # flash_attn_varlen_func) +- from flash_attn import flash_attn_varlen_func ++ if self.use_upstream_fa: ++ from flash_attn import flash_attn_varlen_func ++ else: ++ from vllm.vllm_flash_attn import flash_attn_varlen_func + + q, k, v = (rearrange(x, "b s ... 
-> (b s) ...") for x in [q, k, v]) + +@@ -715,7 +727,11 @@ class Glm4vVisionTransformer(nn.Module): + self.post_layernorm = RMSNorm(vision_config.hidden_size, + eps=vision_config.rms_norm_eps) + +- self.attn_backend: _Backend = get_vit_attn_backend(support_fa=True) ++ self.attn_backend = get_vit_attn_backend( ++ head_size=head_dim, dtype=torch.get_default_dtype()) ++ if self.attn_backend != _Backend.FLASH_ATTN and \ ++ check_upstream_fa_availability(torch.get_default_dtype()): ++ self.attn_backend = _Backend.FLASH_ATTN + + @property + def dtype(self) -> torch.dtype: +diff --git a/vllm/model_executor/models/gpt_oss.py b/vllm/model_executor/models/gpt_oss.py +index e0b4df772..d85d30d91 100644 +--- a/vllm/model_executor/models/gpt_oss.py ++++ b/vllm/model_executor/models/gpt_oss.py +@@ -311,9 +311,6 @@ class GptOssModel(nn.Module): + if is_pp_missing_parameter(name, self): + continue + +- # FIXME(woosuk): Remove this after testing. +- weight = weight.cuda() +- + if ".w13_weight_scale" in name: + # Handle MLP gate and up projection weights scale + if use_ep: +diff --git a/vllm/model_executor/models/keye.py b/vllm/model_executor/models/keye.py +index 710b805ac..04824db1b 100644 +--- a/vllm/model_executor/models/keye.py ++++ b/vllm/model_executor/models/keye.py +@@ -17,6 +17,7 @@ from transformers.modeling_outputs import (BaseModelOutput, + BaseModelOutputWithPooling) + from transformers.utils import torch_int + ++from vllm.attention.layer import check_upstream_fa_availability + from vllm.config import VllmConfig + from vllm.distributed import get_tensor_model_parallel_world_size + from vllm.logger import init_logger +@@ -374,7 +375,16 @@ class KeyeSiglipAttention(nn.Module): + ) + + # Detect attention implementation. +- self.attn_backend: _Backend = get_vit_attn_backend(support_fa=True) ++ self.attn_backend = get_vit_attn_backend( ++ head_size=self.head_dim, dtype=torch.get_default_dtype()) + -+void fused_add_rms_norm( -+ torch::Tensor& input, -+ torch::Tensor& residual, -+ torch::Tensor& weight, -+ float epsilon) { -+ int hidden_size = input.size(-1); -+ int num_tokens = input.numel() / hidden_size; -+ -+ VLLM_XPU_DISPATCH_FLOATING_TYPES( -+ input.scalar_type(), "call_fused_add_rms_norm_kernel", [&] { -+ vllm::call_fused_add_rms_norm_kernel( -+ input, -+ residual, -+ weight, -+ epsilon); -+ }); -+} ++ self.use_upstream_fa = False ++ if self.attn_backend != _Backend.FLASH_ATTN and \ ++ check_upstream_fa_availability( ++ torch.get_default_dtype()): ++ self.attn_backend = _Backend.FLASH_ATTN ++ self.use_upstream_fa = True + -diff --git a/csrc/xpu/pos_encoding_xpu.cpp b/csrc/xpu/pos_encoding_xpu.cpp -new file mode 100644 -index 000000000..3232cacbc ---- /dev/null -+++ b/csrc/xpu/pos_encoding_xpu.cpp -@@ -0,0 +1,333 @@ -+// clang-format off -+#ifdef VLLM_DEV -+#undef __SYCL_DEVICE_ONLY__ -+#endif -+#include -+// clang-format on -+#include "xpu_types.h" -+ -+#include -+#include "utils.h" -+ -+template -+inline void apply_rotary_embedding( -+ scalar_t* __restrict__ arr, -+ const scalar_t* __restrict__ cos_ptr, -+ const scalar_t* __restrict__ sin_ptr, -+ int rot_offset, -+ int embed_dim) { -+ int x_index, y_index; -+ scalar_t cos, sin; -+ if (IS_NEOX) { -+ // GPT-NeoX style rotary embedding. -+ x_index = rot_offset; -+ y_index = embed_dim + rot_offset; -+ cos = VLLM_LDG(cos_ptr + x_index); -+ sin = VLLM_LDG(sin_ptr + x_index); -+ } else { -+ // GPT-J style rotary embedding. 
-+ x_index = 2 * rot_offset; -+ y_index = 2 * rot_offset + 1; -+ cos = VLLM_LDG(cos_ptr + x_index / 2); -+ sin = VLLM_LDG(sin_ptr + x_index / 2); -+ } -+ -+ const scalar_t x = arr[x_index]; -+ const scalar_t y = arr[y_index]; -+ arr[x_index] = x * cos - y * sin; -+ arr[y_index] = y * cos + x * sin; -+} -+ -+template -+void rotary_embedding_kernel( -+ const int64_t* __restrict__ positions, // [batch_size, seq_len] or -+ // [num_tokens] -+ scalar_t* __restrict__ query, // [batch_size, seq_len, num_heads, head_size] -+ // or [num_tokens, num_heads, head_size] -+ scalar_t* __restrict__ key, // [batch_size, seq_len, num_kv_heads, -+ // head_size] or [num_tokens, num_kv_heads, -+ // head_size] -+ const scalar_t* __restrict__ cos_sin_cache, // [max_position, 2, rot_dim // -+ // 2] -+ const int rot_dim, -+ const int query_stride, -+ const int key_stride, -+ const int num_heads, -+ const int num_kv_heads, -+ const int head_size, -+ const sycl::nd_item<3>& item_ct1) { -+ // Each thread block is responsible for one token. -+ const int token_idx = item_ct1.get_group(2); -+ int64_t pos = positions[token_idx]; -+ const scalar_t* cache_ptr = cos_sin_cache + pos * rot_dim; -+ -+ const int embed_dim = rot_dim / 2; -+ const scalar_t* cos_ptr = cache_ptr; -+ const scalar_t* sin_ptr = cache_ptr + embed_dim; -+ -+ const int nq = num_heads * embed_dim; -+ for (int i = item_ct1.get_local_id(2); i < nq; -+ i += item_ct1.get_local_range(2)) { -+ const int head_idx = i / embed_dim; -+ const int token_head = token_idx * query_stride + head_idx * head_size; -+ const int rot_offset = i % embed_dim; -+ apply_rotary_embedding( -+ query + token_head, cos_ptr, sin_ptr, rot_offset, embed_dim); -+ } -+ -+ const int nk = num_kv_heads * embed_dim; -+ for (int i = item_ct1.get_local_id(2); i < nk; -+ i += item_ct1.get_local_range(2)) { -+ const int head_idx = i / embed_dim; -+ const int token_head = token_idx * key_stride + head_idx * head_size; -+ const int rot_offset = i % embed_dim; -+ apply_rotary_embedding( -+ key + token_head, cos_ptr, sin_ptr, rot_offset, embed_dim); -+ } -+} -+ -+template -+void batched_rotary_embedding_kernel( -+ const int64_t* __restrict__ positions, // [batch_size, seq_len] or -+ // [num_tokens] -+ scalar_t* __restrict__ query, // [batch_size, seq_len, num_heads, head_size] -+ // or [num_tokens, num_heads, head_size] -+ scalar_t* __restrict__ key, // [batch_size, seq_len, num_kv_heads, -+ // head_size] or [num_tokens, num_kv_heads, -+ // head_size] -+ const scalar_t* __restrict__ cos_sin_cache, // [max_position, 2, rot_dim // -+ // 2] -+ const int64_t* __restrict__ cos_sin_cache_offsets, // [batch_size, seq_len] or [num_tokens] -+ const int rot_dim, -+ const int query_stride, -+ const int key_stride, -+ const int num_heads, -+ const int num_kv_heads, -+ const int head_size, -+ const sycl::nd_item<3>& item_ct1) { -+ // Each thread block is responsible for one token. 
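The removed kernel above applies rotary embeddings in either GPT-NeoX style (rotate the head's two halves against each other) or GPT-J style (rotate adjacent even/odd pairs); the per-token loop below repeats the same rotation for every query and key head. A torch reference of the two pairings, assuming cos/sin have already been gathered for the token's position:

    import torch

    def rotary_reference(x, cos, sin, is_neox: bool):
        # x: [..., rot_dim]; cos/sin: [..., rot_dim // 2]
        if is_neox:
            x1, x2 = x.chunk(2, dim=-1)          # pair i with i + rot_dim // 2
        else:
            x1, x2 = x[..., 0::2], x[..., 1::2]  # pair 2i with 2i + 1
        o1 = x1 * cos - x2 * sin
        o2 = x2 * cos + x1 * sin
        if is_neox:
            return torch.cat([o1, o2], dim=-1)
        return torch.stack([o1, o2], dim=-1).flatten(-2)

    x, cos, sin = torch.randn(2, 8), torch.randn(2, 4), torch.randn(2, 4)
    assert rotary_reference(x, cos, sin, True).shape == (2, 8)
    assert rotary_reference(x, cos, sin, False).shape == (2, 8)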
-+ const int token_idx = item_ct1.get_group(2); -+ int64_t cos_sin_cache_offset = cos_sin_cache_offsets[token_idx]; -+ int64_t pos = positions[token_idx]; -+ const scalar_t* cache_ptr = cos_sin_cache + (cos_sin_cache_offset + pos) * rot_dim; -+ -+ const int embed_dim = rot_dim / 2; -+ const scalar_t* cos_ptr = cache_ptr; -+ const scalar_t* sin_ptr = cache_ptr + embed_dim; -+ -+ const int nq = num_heads * embed_dim; -+ for (int i = item_ct1.get_local_id(2); i < nq; -+ i += item_ct1.get_local_range(2)) { -+ const int head_idx = i / embed_dim; -+ const int token_head = token_idx * query_stride + head_idx * head_size; -+ const int rot_offset = i % embed_dim; -+ apply_rotary_embedding( -+ query + token_head, cos_ptr, sin_ptr, rot_offset, embed_dim); -+ } -+ -+ const int nk = num_kv_heads * embed_dim; -+ for (int i = item_ct1.get_local_id(2); i < nk; -+ i += item_ct1.get_local_range(2)) { -+ const int head_idx = i / embed_dim; -+ const int token_head = token_idx * key_stride + head_idx * head_size; -+ const int rot_offset = i % embed_dim; -+ apply_rotary_embedding( -+ key + token_head, cos_ptr, sin_ptr, rot_offset, embed_dim); -+ } -+} -+ -+template -+void call_rotary_embedding_kernel( -+ const int64_t* __restrict__ positions, // [num_tokens] -+ scalar_t* __restrict__ query, // [num_tokens, num_heads, head_size] -+ scalar_t* __restrict__ key, // [num_tokens, num_kv_heads, head_size] -+ const scalar_t* __restrict__ cos_sin_cache, // [max_position, 2, rot_dim // -+ // 2] -+ const int rot_dim, -+ const int query_stride, -+ const int key_stride, -+ const int num_heads, -+ const int num_kv_heads, -+ const int head_size, -+ const int num_tokens, -+ const int sin_cos_dim, -+ bool is_neox) { -+ using sycl_t = vllm::xpu::SyclTypeTrait::Type; -+ sycl::range<3> grid(1, 1, num_tokens); -+ sycl::range<3> block(1, 1, std::min(num_heads * rot_dim / 2, 512)); -+ auto& queue = vllm::xpu::vllmGetQueue(); -+ if (is_neox) { -+ queue.submit([&](sycl::handler& cgh) { -+ cgh.parallel_for( -+ sycl::nd_range<3>(grid * block, block), -+ [=](sycl::nd_item<3> item_ct1) { -+ rotary_embedding_kernel( -+ positions, -+ (sycl_t* __restrict__)query, -+ (sycl_t* __restrict__)key, -+ (const sycl_t* __restrict__)cos_sin_cache, -+ rot_dim, -+ query_stride, -+ key_stride, -+ num_heads, -+ num_kv_heads, -+ head_size, -+ item_ct1); -+ }); -+ }); -+ } else { -+ queue.submit([&](sycl::handler& cgh) { -+ cgh.parallel_for( -+ sycl::nd_range<3>(grid * block, block), -+ [=](sycl::nd_item<3> item_ct1) { -+ rotary_embedding_kernel( -+ positions, -+ (sycl_t* __restrict__)query, -+ (sycl_t* __restrict__)key, -+ (const sycl_t* __restrict__)cos_sin_cache, -+ rot_dim, -+ query_stride, -+ key_stride, -+ num_heads, -+ num_kv_heads, -+ head_size, -+ item_ct1); -+ }); -+ }); -+ } -+} -+ -+template -+void call_batched_rotary_embedding_kernel( -+ const int64_t* __restrict__ positions, // [num_tokens] -+ scalar_t* __restrict__ query, // [num_tokens, num_heads, head_size] -+ scalar_t* __restrict__ key, // [num_tokens, num_kv_heads, head_size] -+ const scalar_t* __restrict__ cos_sin_cache, // [max_position, 2, rot_dim // -+ // 2] -+ const int64_t* __restrict__ cos_sin_cache_offsets, // [batch_size, seq_len] or [num_tokens] -+ const int rot_dim, -+ const int query_stride, -+ const int key_stride, -+ const int num_heads, -+ const int num_kv_heads, -+ const int head_size, -+ const int num_tokens, -+ const int sin_cos_dim, -+ bool is_neox) { -+ using sycl_t = vllm::xpu::SyclTypeTrait::Type; -+ sycl::range<3> grid(1, 1, num_tokens); -+ sycl::range<3> block(1, 1, 
std::min(num_heads * rot_dim / 2, 512)); -+ auto& queue = vllm::xpu::vllmGetQueue(); -+ if (is_neox) { -+ queue.submit([&](sycl::handler& cgh) { -+ cgh.parallel_for( -+ sycl::nd_range<3>(grid * block, block), -+ [=](sycl::nd_item<3> item_ct1) { -+ batched_rotary_embedding_kernel( -+ positions, -+ (sycl_t* __restrict__)query, -+ (sycl_t* __restrict__)key, -+ (const sycl_t* __restrict__)cos_sin_cache, -+ cos_sin_cache_offsets, -+ rot_dim, -+ query_stride, -+ key_stride, -+ num_heads, -+ num_kv_heads, -+ head_size, -+ item_ct1); -+ }); -+ }); -+ } else { -+ queue.submit([&](sycl::handler& cgh) { -+ cgh.parallel_for( -+ sycl::nd_range<3>(grid * block, block), -+ [=](sycl::nd_item<3> item_ct1) { -+ batched_rotary_embedding_kernel( -+ positions, -+ (sycl_t* __restrict__)query, -+ (sycl_t* __restrict__)key, -+ (const sycl_t* __restrict__)cos_sin_cache, -+ cos_sin_cache_offsets, -+ rot_dim, -+ query_stride, -+ key_stride, -+ num_heads, -+ num_kv_heads, -+ head_size, -+ item_ct1); -+ }); -+ }); -+ } -+} -+ -+void rotary_embedding( -+ torch::Tensor& positions, -+ torch::Tensor& query, -+ torch::Tensor& key, -+ int head_size, -+ torch::Tensor& cos_sin_cache, -+ bool is_neox) { -+ -+ int num_tokens = query.numel() / query.size(-1); -+ int rot_dim = cos_sin_cache.size(1); -+ int num_heads = query.size(-1) / head_size; -+ int num_kv_heads = key.size(-1) / head_size; -+ int key_stride = key.stride(-2); -+ int query_stride = query.stride(-2); -+ int cos_sin_dim = cos_sin_cache.size(0); -+ -+ VLLM_XPU_DISPATCH_FLOATING_TYPES( -+ query.scalar_type(), "call_rotary_embedding_kernel", [&] { -+ call_rotary_embedding_kernel( -+ positions.data_ptr(), -+ query.data_ptr(), -+ key.data_ptr(), -+ cos_sin_cache.data_ptr(), -+ rot_dim, -+ query_stride, -+ key_stride, -+ num_heads, -+ num_kv_heads, -+ head_size, -+ num_tokens, -+ cos_sin_dim, -+ is_neox); -+ }); -+} -+ -+void batched_rotary_embedding( -+ torch::Tensor& positions, -+ torch::Tensor& query, -+ torch::Tensor& key, -+ int head_size, -+ torch::Tensor& cos_sin_cache, -+ bool is_neox, -+ int rot_dim, -+ torch::Tensor& cos_sin_cache_offsets) { -+ int64_t num_tokens = cos_sin_cache_offsets.size(0); -+ int num_heads = query.size(-1) / head_size; -+ int num_kv_heads = key.size(-1) / head_size; -+ int key_stride = key.stride(-2); -+ int query_stride = query.stride(-2); -+ int cos_sin_dim = cos_sin_cache.size(0); -+ -+ VLLM_XPU_DISPATCH_FLOATING_TYPES( -+ query.scalar_type(), "call_batched_rotary_embedding_kernel", [&] { -+ call_batched_rotary_embedding_kernel( -+ positions.data_ptr(), -+ query.data_ptr(), -+ key.data_ptr(), -+ cos_sin_cache.data_ptr(), -+ cos_sin_cache_offsets.data_ptr(), -+ rot_dim, -+ query_stride, -+ key_stride, -+ num_heads, -+ num_kv_heads, -+ head_size, -+ num_tokens, -+ cos_sin_dim, -+ is_neox); -+ }); -+} -\ No newline at end of file -diff --git a/csrc/xpu/pybind.cpp b/csrc/xpu/pybind.cpp -new file mode 100644 -index 000000000..bf9e94612 ---- /dev/null -+++ b/csrc/xpu/pybind.cpp -@@ -0,0 +1,112 @@ -+// #include "cache.h" -+#include "xpu_ops.h" -+#include -+ -+PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) { -+ // vLLM custom ops -+ pybind11::module ops = m.def_submodule("ops", "vLLM custom operators"); -+ -+ // Attention ops -+ ops.def( -+ "paged_attention_v1", -+ &paged_attention_v1, -+ "Compute the attention between an input query and the cached keys/values using PagedAttention."); -+ ops.def( -+ "paged_attention_v2", -+ &paged_attention_v2, -+ "PagedAttention V2."); -+ -+ ops.def("context_attention_forward_v1", &context_attention_forward_v1, -+ 
"Context attention forward_v1"); -+ -+ ops.def("context_attention_forward_v2", &context_attention_forward_v2, -+ "Context attention forward_v2"); -+ -+ ops.def( -+ "paged_attention_gqa", -+ &paged_attention_gqa, -+ "PagedAttention GQA."); -+ -+ ops.def("paged_attention_gqa_fp8", &paged_attention_gqa_fp8, "PagedAttention GQA fp8."); -+ -+ // Activation ops -+ ops.def( -+ "silu_and_mul", -+ &silu_and_mul, -+ "Activation function used in SwiGLU."); -+ ops.def( -+ "gelu_and_mul", -+ &gelu_and_mul, -+ "Activation function used in GeGLU with `none` approximation."); -+ ops.def( -+ "gelu_tanh_and_mul", -+ &gelu_tanh_and_mul, -+ "Activation function used in GeGLU with `tanh` approximation."); -+ ops.def( -+ "gelu_new", -+ &gelu_new, -+ "GELU implementation used in GPT-2."); -+ ops.def( -+ "gelu_fast", -+ &gelu_fast, -+ "Approximate GELU implementation."); -+ -+ // Layernorm -+ ops.def( -+ "rms_norm", -+ &rms_norm, -+ "Apply Root Mean Square (RMS) Normalization to the input tensor."); -+ -+ ops.def( -+ "fused_add_rms_norm", -+ &fused_add_rms_norm, -+ "In-place fused Add and RMS Normalization"); -+ -+ // Rotary embedding -+ ops.def( -+ "rotary_embedding", -+ &rotary_embedding, -+ "Apply GPT-NeoX or GPT-J style rotary embedding to query and key"); -+ -+ // Cache ops -+ pybind11::module cache_ops = m.def_submodule("cache_ops", "vLLM cache ops"); -+ cache_ops.def( -+ "swap_blocks", -+ &swap_blocks, -+ "Swap in (out) the cache blocks from src to dst"); -+ cache_ops.def( -+ "copy_blocks", -+ ©_blocks, -+ "Copy the cache blocks from src to dst"); -+ cache_ops.def( -+ "reshape_and_cache", -+ &reshape_and_cache, -+ "Reshape the key and value tensors and cache them"); -+ cache_ops.def( -+ "reshape_and_cache_ipexllm", -+ &reshape_and_cache_ipexllm, -+ "Reshape the key and value tensors and cache them for ipex_llm"); -+ -+ cache_ops.def( -+ "reshape_and_cache_ipexllm_fp8", -+ &reshape_and_cache_ipexllm_fp8, -+ "Reshape the key and value tensors and cache them for ipex_llm with fp8"); -+ -+ // Quant -+ ops.def( -+ "awq_dequantize", -+ &awq_dequantize, -+ "dequant method for awq"); -+ -+ -+ ops.def( -+ "moe_forward", -+ &moe_forward, -+ "PagedAttention GQA."); -+ -+ ops.def( -+ "fused_moe_forward", -+ &fused_moe_forward, -+ "PagedAttention GQA."); -+ -+} -diff --git a/csrc/xpu/reduction_utils.h b/csrc/xpu/reduction_utils.h -new file mode 100644 -index 000000000..93c64d759 ---- /dev/null -+++ b/csrc/xpu/reduction_utils.h -@@ -0,0 +1,56 @@ -+/* -+ * Copyright (c) 2023, The vLLM team. -+ * -+ * Licensed under the Apache License, Version 2.0 (the "License"); -+ * you may not use this file except in compliance with the License. -+ * You may obtain a copy of the License at -+ * -+ * http://www.apache.org/licenses/LICENSE-2.0 -+ * -+ * Unless required by applicable law or agreed to in writing, software -+ * distributed under the License is distributed on an "AS IS" BASIS, -+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -+ * See the License for the specific language governing permissions and -+ * limitations under the License. 
-+ */ -+#pragma once -+ -+#include -+#include -+#include -+ -+namespace vllm { -+ -+template -+__inline__ T warpReduceSum(T val, const sycl::nd_item<3>& item_ct1) { -+#pragma unroll -+ for (int mask = 16; mask > 0; mask >>= 1) -+ val += dpct::permute_sub_group_by_xor( -+ item_ct1.get_sub_group(), val, mask, 32); -+ return val; -+} -+ -+/* Calculate the sum of all elements in a block */ -+template -+__inline__ T blockReduceSum(T val, const sycl::nd_item<3> &item_ct1, T *shared) { -+ -+ int lane = item_ct1.get_local_id(2) & 0x1f; -+ int wid = item_ct1.get_local_id(2) >> 5; -+ -+ val = warpReduceSum(val, item_ct1); -+ -+ if (lane == 0) { -+ shared[wid] = val; -+ } -+ item_ct1.barrier(); -+ -+ // Modify from blockDim.x << 5 to blockDim.x / 32. to prevent -+ // blockDim.x is not divided by 32 -+ val = (item_ct1.get_local_id(2) < (item_ct1.get_local_range(2) / 32.f)) -+ ? shared[lane] -+ : (T)(0.0f); -+ val = warpReduceSum(val, item_ct1); -+ return val; -+} -+ -+} // namespace vllm -\ No newline at end of file -diff --git a/csrc/xpu/utils.cpp b/csrc/xpu/utils.cpp -new file mode 100644 -index 000000000..5f613af55 ---- /dev/null -+++ b/csrc/xpu/utils.cpp -@@ -0,0 +1,34 @@ -+#include "utils.h" -+#include -+ -+sycl::half sycl_half_mul(sycl::half a, sycl::half b) { -+ return sycl::ext::intel::math::hmul(a, b); -+} -+sycl::half sycl_half_add(sycl::half a, sycl::half b) { -+ return sycl::ext::intel::math::hadd(a, b); -+} -+sycl::half sycl_half_sub(sycl::half a, sycl::half b) { -+ return sycl::ext::intel::math::hsub(a, b); -+} -+sycl::half sycl_half_fma(sycl::half a, sycl::half b, sycl::half c) { -+ return sycl::ext::intel::math::hfma(a, b, c); -+} -+ -+sycl::half2 sycl_half_mul2(sycl::half2 a, sycl::half2 b) { -+ return sycl::ext::intel::math::hmul2(a, b); -+} -+sycl::half2 sycl_half_add2(sycl::half2 a, sycl::half2 b) { -+ return sycl::ext::intel::math::hadd2(a, b); -+} -+sycl::half2 sycl_half_sub2(sycl::half2 a, sycl::half2 b) { -+ return sycl::ext::intel::math::hsub2(a, b); -+} -+ -+sycl::half2 sycl_half_fma2(sycl::half2 a, sycl::half2 b, sycl::half2 c) { -+ return sycl::ext::intel::math::hfma2(a, b, c); -+} -+ -+int get_max_shared_memory_per_block_device_attribute(int device_id) { -+ const sycl::device& device = vllm::xpu::vllmGetQueue().get_device(); -+ return device.get_info(); -+} -diff --git a/csrc/xpu/utils.h b/csrc/xpu/utils.h -new file mode 100644 -index 000000000..fa3ead51c ---- /dev/null -+++ b/csrc/xpu/utils.h -@@ -0,0 +1,82 @@ -+#pragma once -+ -+#include -+#include -+#include -+// #include -+#include -+#include -+ -+#if TORCH_VERSION_MAJOR >= 2 && TORCH_VERSION_MINOR >= 3 -+#include -+#endif -+ -+ -+#define VLLM_LDG(arg) *(arg) -+namespace vllm { -+namespace xpu { -+ -+static inline sycl::queue& vllmGetQueue() { -+ auto device_type = c10::DeviceType::XPU; -+ c10::impl::VirtualGuardImpl impl(device_type); -+ c10::Stream c10_stream = impl.getStream(c10::Device(device_type)); -+#if TORCH_VERSION_MAJOR >= 2 && TORCH_VERSION_MINOR >= 3 -+ return at::xpu::XPUStream(c10_stream).queue(); -+#else -+ return ::xpu::get_queue_from_stream(c10_stream); -+#endif -+} -+template -+struct SyclTypeTrait{ -+ using Type = T; -+}; -+ -+template <> -+struct SyclTypeTrait{ -+ using Type = sycl::half; -+}; -+ -+template <> -+struct SyclTypeTrait{ -+ using Type = sycl::ext::oneapi::bfloat16; -+}; -+ -+ -+} // namespace xpu -+ -+} // namespace vllm -+ -+SYCL_EXTERNAL sycl::half sycl_half_mul(sycl::half a, sycl::half b); -+SYCL_EXTERNAL sycl::half sycl_half_add(sycl::half a, sycl::half b); -+SYCL_EXTERNAL 
sycl::half sycl_half_sub(sycl::half a, sycl::half b); -+SYCL_EXTERNAL sycl::half sycl_half_fma(sycl::half a, sycl::half b, sycl::half c); -+ -+SYCL_EXTERNAL sycl::half2 sycl_half_mul2(sycl::half2 a, sycl::half2 b); -+SYCL_EXTERNAL sycl::half2 sycl_half_add2(sycl::half2 a, sycl::half2 b); -+SYCL_EXTERNAL sycl::half2 sycl_half_sub2(sycl::half2 a, sycl::half2 b); -+SYCL_EXTERNAL sycl::half2 sycl_half_fma2(sycl::half2 a, sycl::half2 b, sycl::half2 c); -+ -+int get_max_shared_memory_per_block_device_attribute(int device_id); -+ -+namespace utils { -+ static inline sycl::queue& get_queue(const at::Device& device) { -+ c10::impl::VirtualGuardImpl impl(device.type()); -+ c10::Stream c10_stream = impl.getStream(c10::Device(device)); -+#if TORCH_VERSION_MAJOR >= 2 && TORCH_VERSION_MINOR >= 3 -+ return at::xpu::XPUStream(c10_stream).queue(); -+#else -+ return ::xpu::get_queue_from_stream(c10_stream); -+#endif -+ } -+ -+ static inline sycl::event submit_kernel(std::function kernel, const at::Device& device, const char * desc) { -+ sycl::queue& queue = get_queue(device); -+ sycl::event event = queue.submit(kernel); -+#if TORCH_VERSION_MAJOR >= 2 && TORCH_VERSION_MINOR >= 3 -+ // xpu::profiler_record(desc, event); -+#else -+ ::xpu::profiler_record(desc, event); -+#endif -+ return event; -+ } -+} -diff --git a/csrc/xpu/xpu_ops.h b/csrc/xpu/xpu_ops.h -new file mode 100644 -index 000000000..603d4f23d ---- /dev/null -+++ b/csrc/xpu/xpu_ops.h -@@ -0,0 +1,194 @@ -+#pragma once -+#include -+ -+void rotary_embedding(torch::Tensor &positions, torch::Tensor &query, -+ torch::Tensor &key, int head_size, -+ torch::Tensor &cos_sin_cache, bool is_neox); -+void batched_rotary_embedding( -+ torch::Tensor& positions, -+ torch::Tensor& query, -+ torch::Tensor& key, -+ int head_size, -+ torch::Tensor& cos_sin_cache, -+ bool is_neox, -+ int rot_dim, -+ torch::Tensor& cos_sin_cache_offsets); -+ -+void silu_and_mul(torch::Tensor &out, torch::Tensor &input); -+void gelu_and_mul(torch::Tensor &out, torch::Tensor &input); -+ -+void gelu_new(torch::Tensor &out, torch::Tensor &input); -+ -+void gelu_fast(torch::Tensor &out, torch::Tensor &input); -+ -+ -+void gelu_tanh_and_mul( -+ torch::Tensor& out, -+ torch::Tensor& input); -+ -+void paged_attention_v1( -+ torch::Tensor &out, torch::Tensor &query, torch::Tensor &key_cache, -+ torch::Tensor &value_cache, int num_kv_heads, float scale, -+ torch::Tensor &block_tables, torch::Tensor &context_lens, int block_size, -+ int max_context_len, const c10::optional &alibi_slopes, -+ const std::string& kv_cache_dtype, const float kv_scale, const float attn_logit_softcapping); -+ -+void paged_attention_v2( -+ torch::Tensor &out, torch::Tensor &exp_sums, torch::Tensor &max_logits, -+ torch::Tensor &tmp_out, torch::Tensor &query, torch::Tensor &key_cache, -+ torch::Tensor &value_cache, int num_kv_heads, float scale, -+ torch::Tensor &block_tables, torch::Tensor &context_lens, int block_size, -+ int max_context_len, const c10::optional &alibi_slopes, -+ const std::string& kv_cache_dtype, const float kv_scale, const float attn_logit_softcapping); -+ -+torch::Tensor context_attention_forward_v1( -+ torch::Tensor query, // [num_tokens, num_kv_head, head_dim] -+ torch::Tensor key, // [num_tokens, num_kv_heads * head_size] -+ torch::Tensor value, // [num_tokens, num_kv_heads * head_size] -+ torch::Tensor block_tables, torch::Tensor query_start_loc, -+ torch::Tensor seq_lens, torch::Tensor context_lens, int max_input_length, -+ int max_context_length); -+ -+torch::Tensor context_attention_forward_v2( 
-+ torch::Tensor query, // [num_tokens, num_kv_head, head_dim] -+ torch::Tensor key, // [num_tokens, num_kv_heads * head_size] -+ torch::Tensor value, // [num_tokens, num_kv_heads * head_size] -+ torch::Tensor block_tables, torch::Tensor query_start_loc, -+ torch::Tensor seq_lens, torch::Tensor context_lens, int max_input_length, -+ int max_context_length, int max_q_length); -+ -+void copy_blocks( -+ std::vector &key_caches, -+ std::vector &value_caches, -+ const std::map> &block_mapping); -+ -+void reshape_and_cache(torch::Tensor &key, torch::Tensor &value, -+ torch::Tensor &key_cache, torch::Tensor &value_cache, -+ torch::Tensor &slot_mapping, -+ const std::string& kv_cache_dtype, const float kv_scale); -+void reshape_and_cache_ipexllm(torch::Tensor &key, torch::Tensor &value, -+ torch::Tensor &key_cache, torch::Tensor &value_cache, -+ torch::Tensor &slot_mapping, -+ const std::string& kv_cache_dtype, const float kv_scale); -+ -+void reshape_and_cache_ipexllm_fp8(torch::Tensor& key, torch::Tensor& value, -+ torch::Tensor& key_cache, -+ torch::Tensor& value_cache, -+ torch::Tensor& slot_mapping, -+ const std::string& kv_cache_dtype, -+ const float kv_scale); -+ -+void moe_align_block_size( -+ torch::Tensor topk_ids, -+ int num_experts, -+ int block_size, -+ torch::Tensor sorted_token_ids, -+ torch::Tensor experts_ids, -+ torch::Tensor num_tokens_post_pad) { -+ TORCH_CHECK(false, "moe_align_block_size is not supported on XPU."); -+} -+void swap_blocks(torch::Tensor &src, torch::Tensor &dst, -+ const std::map &block_mapping); -+ -+void gather_cached_kv(torch::Tensor &key, torch::Tensor &value, -+ torch::Tensor &key_cache, torch::Tensor &value_cache, -+ torch::Tensor &slot_mapping); -+ -+void convert_fp8_e5m2(torch::Tensor& src_cache, torch::Tensor& dst_cache) { -+ TORCH_CHECK(false, "Quantization is not supported on XPU."); -+} -+ -+void rms_norm(torch::Tensor &out, torch::Tensor &input, -+ torch::Tensor &weight, float epsilon); -+ -+void fused_add_rms_norm(torch::Tensor &input, torch::Tensor &residual, -+ torch::Tensor &weight, float epsilon); -+ -+torch::Tensor awq_gemm(torch::Tensor _in_feats, torch::Tensor _kernel, -+ torch::Tensor _scaling_factors, torch::Tensor _zeros, -+ int split_k_iters) { -+ TORCH_CHECK(false, "awq_gemm is not supported on XPU."); -+} -+ -+torch::Tensor marlin_gemm( -+ torch::Tensor& a, -+ torch::Tensor& b_q_weight, -+ torch::Tensor& b_scales, -+ torch::Tensor& workspace, -+ int64_t size_m, -+ int64_t size_n, -+ int64_t size_k) { -+ TORCH_CHECK(false, "marlin_gemm is not supported on XPU."); -+} -+ -+torch::Tensor awq_dequantize(torch::Tensor _kernel, -+ torch::Tensor _scaling_factors, -+ torch::Tensor _zeros, -+ int split_k_iters, -+ int thx, -+ int thy); -+ -+void squeezellm_gemm(torch::Tensor vec, torch::Tensor mat, -+ torch::Tensor mul, torch::Tensor lookup_table) { -+ TORCH_CHECK(false, "squeezellm_gemm is not supported on XPU."); -+} -+ -+torch::Tensor gptq_gemm( -+ torch::Tensor a, -+ torch::Tensor b_q_weight, -+ torch::Tensor b_gptq_qzeros, -+ torch::Tensor b_gptq_scales, -+ torch::Tensor b_g_idx, -+ bool use_exllama, -+ int bit) { -+ TORCH_CHECK(false, "gptq_gemm is not supported on XPU."); -+} -+ -+void gptq_shuffle( -+ torch::Tensor q_weight, -+ torch::Tensor q_perm, -+ int bit) { -+ TORCH_CHECK(false, "gptq_shuffle is not supported on XPU."); -+} -+ -+void paged_attention_gqa( -+ torch::Tensor output, -+ torch::Tensor query, -+ torch::Tensor key_cache, -+ torch::Tensor value_cache, -+ int64_t bsz, -+ int64_t num_heads, -+ int64_t num_kv_heads, -+ float 
scale, -+ torch::Tensor& block_tables, -+ torch::Tensor& context_lens, -+ int block_size, -+ int64_t head_dim, -+ int max_seq_len -+); -+ -+ -+torch::Tensor moe_forward( -+ torch::Tensor input, -+ torch::Tensor indexs, -+ torch::Tensor qweights_attr, -+ int64_t state_size, -+ int64_t output_size, -+ int64_t qtype -+); -+ -+torch::Tensor fused_moe_forward( -+ torch::Tensor input, -+ torch::Tensor indexs, -+ torch::Tensor qweights1_attr, -+ torch::Tensor qweights2_attr, -+ int64_t hidden_size, -+ int64_t intermediate_size, -+ int64_t qtype -+); -+void paged_attention_gqa_fp8(torch::Tensor output, torch::Tensor query, -+ torch::Tensor key_cache, torch::Tensor value_cache, -+ int64_t bsz, int64_t num_heads, int64_t num_kv_heads, -+ float scale, torch::Tensor& block_tables, -+ torch::Tensor& context_lens, int block_size, -+ int64_t head_dim, int max_seq_len); -diff --git a/csrc/xpu/xpu_types.h b/csrc/xpu/xpu_types.h -new file mode 100644 -index 000000000..23f5b805c ---- /dev/null -+++ b/csrc/xpu/xpu_types.h -@@ -0,0 +1,25 @@ -+ -+#ifndef XPU_TYPES_H -+#define XPU_TYPES_H -+ -+#include -+ -+// FIXME: FP16 is not fully supported in Torch-CPU -+#define VLLM_XPU_DISPATCH_CASE_FLOATING_TYPES(...) \ -+ AT_DISPATCH_CASE(at::ScalarType::Float, __VA_ARGS__) \ -+ AT_DISPATCH_CASE(at::ScalarType::Half, __VA_ARGS__) \ -+ AT_DISPATCH_CASE(at::ScalarType::BFloat16, __VA_ARGS__) -+ -+#define VLLM_XPU_DISPATCH_CASE_FLOATING_TYPES_FLOAT_ONLY(...) \ -+ AT_DISPATCH_CASE(at::ScalarType::Float, __VA_ARGS__) \ -+ AT_DISPATCH_CASE(at::ScalarType::Half, __VA_ARGS__) -+ -+#define VLLM_XPU_DISPATCH_FLOATING_TYPES(TYPE, NAME, ...) \ -+ AT_DISPATCH_SWITCH( \ -+ TYPE, NAME, VLLM_XPU_DISPATCH_CASE_FLOATING_TYPES(__VA_ARGS__)) -+ -+#define VLLM_XPU_DISPATCH_FLOATING_TYPES_FLOAT_ONLY(TYPE, NAME, ...) \ -+ AT_DISPATCH_SWITCH( \ -+ TYPE, NAME, VLLM_XPU_DISPATCH_CASE_FLOATING_TYPES_FLOAT_ONLY(__VA_ARGS__)) -+ -+#endif -\ No newline at end of file -diff --git a/docker/Dockerfile.xpu b/docker/Dockerfile.xpu -index 7d5a589eb..25a9fd7cd 100644 ---- a/docker/Dockerfile.xpu -+++ b/docker/Dockerfile.xpu -@@ -1,9 +1,10 @@ --# oneapi 2025.0.2 docker base image use rolling 2448 package. https://dgpu-docs.intel.com/releases/packages.html?release=Rolling+2448.13&os=Ubuntu+22.04, and we don't need install driver manually. 
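Before the Dockerfile.xpu hunk continues below, here is an illustrative, hedged sketch of how the XPU custom ops declared in `csrc/xpu/xpu_ops.h` and registered in `csrc/xpu/pybind.cpp` above would typically be driven from Python. The import path `vllm._C` and the `xpu` device string are assumptions for illustration; only the `rotary_embedding` signature and the tensor layouts come from the declarations above.

```python
import torch

# Assumed import path for the extension built from csrc/xpu/pybind.cpp,
# which registers the kernels under an `ops` submodule. Adjust to the actual
# extension name produced by the build.
from vllm._C import ops  # assumption

num_tokens, num_heads, num_kv_heads, head_size = 4, 8, 2, 64
rot_dim = head_size  # cos_sin_cache.size(1) in the host wrapper above

positions = torch.arange(num_tokens, dtype=torch.long, device="xpu")
query = torch.randn(num_tokens, num_heads * head_size,
                    dtype=torch.float16, device="xpu")
key = torch.randn(num_tokens, num_kv_heads * head_size,
                  dtype=torch.float16, device="xpu")
# One row of precomputed cos/sin values per position, rot_dim wide.
cos_sin_cache = torch.randn(4096, rot_dim, dtype=torch.float16, device="xpu")

# In-place GPT-NeoX style rotary embedding applied to query and key.
ops.rotary_embedding(positions, query, key, head_size, cos_sin_cache, True)
```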
--FROM intel/deep-learning-essentials:2025.0.2-0-devel-ubuntu22.04 AS vllm-base -+FROM intel/deep-learning-essentials:2025.1.3-0-devel-ubuntu24.04 AS vllm-base - --RUN rm /etc/apt/sources.list.d/intel-graphics.list -+RUN wget -O- https://apt.repos.intel.com/intel-gpg-keys/GPG-PUB-KEY-INTEL-SW-PRODUCTS.PUB | gpg --dearmor | tee /usr/share/keyrings/oneapi-archive-keyring.gpg > /dev/null && \ -+ echo "deb [signed-by=/usr/share/keyrings/oneapi-archive-keyring.gpg] https://apt.repos.intel.com/oneapi all main" | tee /etc/apt/sources.list.d/oneAPI.list && \ -+ add-apt-repository -y ppa:kobuk-team/intel-graphics + if self.attn_backend not in {_Backend.FLASH_ATTN, _Backend.XFORMERS}: + raise RuntimeError( + f"Keye-VL does not support {self.attn_backend} backend now.") +@@ -428,7 +438,10 @@ class KeyeSiglipAttention(nn.Module): + ) --RUN apt-get update -y && \ -+RUN apt clean && apt-get update -y && \ - apt-get install -y --no-install-recommends --fix-missing \ - curl \ - ffmpeg \ -@@ -14,15 +15,29 @@ RUN apt-get update -y && \ - libgl1 \ - lsb-release \ - numactl \ -- python3 \ -- python3-dev \ -- python3-pip \ -- wget -+ wget \ -+ vim \ -+ python3.12 \ -+ python3.12-dev \ -+ python3-pip -+ -+RUN update-alternatives --install /usr/bin/python3 python3 /usr/bin/python3.12 1 -+RUN update-alternatives --install /usr/bin/python python /usr/bin/python3.12 1 -+ -+RUN apt install -y libze1=1.23.1-1~24.04~ppa1 libze-dev=1.23.1-1~24.04~ppa1 libze-intel-gpu1=25.27.34303.9-1~24.04~ppa1 intel-opencl-icd=25.27.34303.9-1~24.04~ppa1 libze-intel-gpu-raytracing=1.1.0-114~u24.04 -+ -+RUN wget https://github.com/uxlfoundation/oneCCL/releases/download/2021.15.4/intel-oneccl-2021.15.4.11_offline.sh -+RUN bash intel-oneccl-2021.15.4.11_offline.sh -a --silent --eula accept && echo "source /opt/intel/oneapi/setvars.sh --force" >> /root/.bashrc -+SHELL ["bash", "-c"] -+CMD ["bash", "-c", "source /root/.bashrc && exec bash"] + if self.attn_backend == _Backend.FLASH_ATTN: +- from flash_attn import flash_attn_varlen_func ++ if self.use_upstream_fa: ++ from flash_attn import flash_attn_varlen_func ++ else: ++ from vllm.vllm_flash_attn import flash_attn_varlen_func - WORKDIR /workspace/vllm - COPY requirements/xpu.txt /workspace/vllm/requirements/xpu.txt - COPY requirements/common.txt /workspace/vllm/requirements/common.txt + q, k, v = (rearrange(x, "b s ... 
-> (b s) ...") for x in [q, k, v]) -+# suppress the python externally managed environment error -+RUN python3 -m pip config set global.break-system-packages true -+ - RUN --mount=type=cache,target=/root/.cache/pip \ - pip install --no-cache-dir \ - -r requirements/xpu.txt -@@ -47,10 +62,11 @@ FROM vllm-base AS vllm-openai +diff --git a/vllm/model_executor/models/phi4mm_audio.py b/vllm/model_executor/models/phi4mm_audio.py +index a1c452053..a74e8cdb7 100644 +--- a/vllm/model_executor/models/phi4mm_audio.py ++++ b/vllm/model_executor/models/phi4mm_audio.py +@@ -550,10 +550,11 @@ class TransformerEncoderBase(abc.ABC, nn.Module): + enc_streaming_mask = self._streaming_mask(seq_len, batch_size, + self.chunk_size, + self.left_chunk) +- +- if xs_pad.is_cuda: +- enc_streaming_mask = enc_streaming_mask.cuda() +- xs_pad = xs_pad.cuda() ++ ++ device = xs_pad.device ++ if device.type != "cpu": ++ enc_streaming_mask = enc_streaming_mask.to(device) ++ xs_pad = xs_pad.to(device) - # install additional dependencies for openai api server - RUN --mount=type=cache,target=/root/.cache/pip \ -- pip install accelerate hf_transfer pytest pytest_asyncio lm_eval[api] modelscope -+ pip install accelerate hf_transfer pytest pytest_asyncio lm_eval[api] 'modelscope!=1.15.0' -+ -+RUN --mount=type=cache,target=/root/.cache/pip \ -+ pip uninstall oneccl oneccl-devel -y + input_tensor = xs_pad + input_tensor, masks = self._forward_embeddings_core( +@@ -570,8 +571,8 @@ class TransformerEncoderBase(abc.ABC, nn.Module): + if chunk_size_nc is not None: + enc_streaming_mask_nc = self._streaming_mask( + seq_len, batch_size, chunk_size_nc, left_chunk_nc) +- if xs_pad.is_cuda: +- enc_streaming_mask_nc = enc_streaming_mask_nc.cuda() ++ if device.type != "cpu": ++ enc_streaming_mask_nc = enc_streaming_mask_nc.to(device) + if masks is not None: + hs_mask_nc = masks & enc_streaming_mask_nc + else: +diff --git a/vllm/model_executor/models/qwen2.py b/vllm/model_executor/models/qwen2.py +index 54dc0bebd..e13e87b93 100644 +--- a/vllm/model_executor/models/qwen2.py ++++ b/vllm/model_executor/models/qwen2.py +@@ -285,7 +285,7 @@ class Qwen2Model(nn.Module): + decoder_layer_type: type[nn.Module] = Qwen2DecoderLayer): + super().__init__() --ENV VLLM_USAGE_SOURCE production-docker-image \ -- TRITON_XPU_PROFILE 1 - # install development dependencies (for testing) - RUN python3 -m pip install -e tests/vllm_test_utils - ENTRYPOINT ["python3", "-m", "vllm.entrypoints.openai.api_server"] -diff --git a/docs/features/quantization/fp8.md b/docs/features/quantization/fp8.md -index 0661933ac..469d88a05 100644 ---- a/docs/features/quantization/fp8.md -+++ b/docs/features/quantization/fp8.md -@@ -134,4 +134,4 @@ print(result[0].outputs[0].text) - ``` +- config = vllm_config.model_config.hf_config ++ config = vllm_config.model_config.hf_config.get_text_config() + cache_config = vllm_config.cache_config + quant_config = vllm_config.quant_config - !!! warning -- Currently, we load the model at original precision before quantizing down to 8-bits, so you need enough memory to load the whole model. -+ Currently, by default we load the model at original precision before quantizing down to 8-bits, so you need enough memory to load the whole model. To avoid this, adding `VLLM_OFFLOAD_WEIGHTS_BEFORE_QUANT=1` can allow offloading weights to cpu before quantization and quantized weights will be kept in device. 
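The FP8 note above introduces `VLLM_OFFLOAD_WEIGHTS_BEFORE_QUANT=1` for staging full-precision weights on the CPU before online quantization. A minimal usage sketch follows; the model name is a placeholder, and the flag must be set before vLLM initializes the model.

```python
import os

# Per the note above: offload weights to CPU before in-place FP8 quantization,
# so only the quantized weights stay resident on the device.
os.environ["VLLM_OFFLOAD_WEIGHTS_BEFORE_QUANT"] = "1"

from vllm import LLM, SamplingParams

# Placeholder model; any checkpoint served with quantization="fp8" applies.
llm = LLM(model="meta-llama/Meta-Llama-3-8B-Instruct", quantization="fp8")
outputs = llm.generate(["Hello, my name is"],
                       SamplingParams(temperature=0.0, max_tokens=16))
print(outputs[0].outputs[0].text)
```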
-diff --git a/docs/models/supported_models.md b/docs/models/supported_models.md -index c8b6c6c86..404045306 100644 ---- a/docs/models/supported_models.md -+++ b/docs/models/supported_models.md -@@ -592,7 +592,8 @@ Specified using `--task generate`. - | `GraniteSpeechForConditionalGeneration` | Granite Speech | T + A | `ibm-granite/granite-speech-3.3-8b` | ✅︎ | ✅︎ | ✅︎ | - | `H2OVLChatModel` | H2OVL | T + IE+ | `h2oai/h2ovl-mississippi-800m`, `h2oai/h2ovl-mississippi-2b`, etc. | | ✅︎ | ✅︎ | - | `Idefics3ForConditionalGeneration` | Idefics3 | T + I | `HuggingFaceM4/Idefics3-8B-Llama3`, etc. | ✅︎ | | ✅︎ | --| `InternVLChatModel` | InternVL 3.0, InternVideo 2.5, InternVL 2.5, Mono-InternVL, InternVL 2.0 | T + IE+ + (VE+) | `OpenGVLab/InternVL3-9B`, `OpenGVLab/InternVideo2_5_Chat_8B`, `OpenGVLab/InternVL2_5-4B`, `OpenGVLab/Mono-InternVL-2B`, `OpenGVLab/InternVL2-4B`, etc. | ✅︎ | ✅︎ | ✅︎ | -+| `InternS1ForConditionalGeneration` | Intern-S1 | T + IE+ + VE+ | `internlm/Intern-S1`, etc. | ✅︎ | ✅︎ | ✅︎ | -+| `InternVLChatModel` | InternVL 3.5, InternVL 3.0, InternVideo 2.5, InternVL 2.5, Mono-InternVL, InternVL 2.0 | T + IE+ + (VE+) | `OpenGVLab/InternVL3_5-14B`, `OpenGVLab/InternVL3-9B`, `OpenGVLab/InternVideo2_5_Chat_8B`, `OpenGVLab/InternVL2_5-4B`, `OpenGVLab/Mono-InternVL-2B`, `OpenGVLab/InternVL2-4B`, etc. | ✅︎ | ✅︎ | ✅︎ | - | `KeyeForConditionalGeneration` | Keye-VL-8B-Preview | T + IE+ + VE+ | `Kwai-Keye/Keye-VL-8B-Preview` | | | ✅︎ | - | `KimiVLForConditionalGeneration` | Kimi-VL-A3B-Instruct, Kimi-VL-A3B-Thinking | T + I+ | `moonshotai/Kimi-VL-A3B-Instruct`, `moonshotai/Kimi-VL-A3B-Thinking` | | | ✅︎ | - | `Llama4ForConditionalGeneration` | Llama 4 | T + I+ | `meta-llama/Llama-4-Scout-17B-16E-Instruct`, `meta-llama/Llama-4-Maverick-17B-128E-Instruct-FP8`, `meta-llama/Llama-4-Maverick-17B-128E-Instruct`, etc. | | ✅︎ | ✅︎ | -@@ -602,7 +603,7 @@ Specified using `--task generate`. - | `LlavaNextVideoForConditionalGeneration` | LLaVA-NeXT-Video | T + V | `llava-hf/LLaVA-NeXT-Video-7B-hf`, etc. | | ✅︎ | ✅︎ | - | `LlavaOnevisionForConditionalGeneration` | LLaVA-Onevision | T + I+ + V+ | `llava-hf/llava-onevision-qwen2-7b-ov-hf`, `llava-hf/llava-onevision-qwen2-0.5b-ov-hf`, etc. | | ✅︎ | ✅︎ | - | `MiniCPMO` | MiniCPM-O | T + IE+ + VE+ + AE+ | `openbmb/MiniCPM-o-2_6`, etc. | ✅︎ | ✅︎ | ✅︎ | --| `MiniCPMV` | MiniCPM-V | T + IE+ + VE+ | `openbmb/MiniCPM-V-2` (see note), `openbmb/MiniCPM-Llama3-V-2_5`, `openbmb/MiniCPM-V-2_6`, etc. | ✅︎ | | ✅︎ | -+| `MiniCPMV` | MiniCPM-V | T + IE+ + VE+ | `openbmb/MiniCPM-V-2` (see note), `openbmb/MiniCPM-Llama3-V-2_5`, `openbmb/MiniCPM-V-2_6`, `openbmb/MiniCPM-V-4`, `openbmb/MiniCPM-V-4_5`, etc. | ✅︎ | | ✅︎ | - | `MiniMaxVL01ForConditionalGeneration` | MiniMax-VL | T + IE+ | `MiniMaxAI/MiniMax-VL-01`, etc. | | ✅︎ | ✅︎ | - | `Mistral3ForConditionalGeneration` | Mistral3 (HF Transformers) | T + I+ | `mistralai/Mistral-Small-3.1-24B-Instruct-2503`, etc. | ✅︎ | ✅︎ | ✅︎ | - | `MllamaForConditionalGeneration` | Llama 3.2 | T + I+ | `meta-llama/Llama-3.2-90B-Vision-Instruct`, `meta-llama/Llama-3.2-11B-Vision`, etc. | | | | -@@ -646,6 +647,15 @@ Specified using `--task generate`. 
+diff --git a/vllm/model_executor/models/qwen2_5_vl.py b/vllm/model_executor/models/qwen2_5_vl.py +index 8aa777557..429516cce 100644 +--- a/vllm/model_executor/models/qwen2_5_vl.py ++++ b/vllm/model_executor/models/qwen2_5_vl.py +@@ -38,6 +38,7 @@ from transformers.models.qwen2_5_vl import Qwen2_5_VLProcessor + from transformers.models.qwen2_5_vl.configuration_qwen2_5_vl import ( + Qwen2_5_VLConfig, Qwen2_5_VLVisionConfig) - This limitation exists because the model's mixed attention pattern (bidirectional for images, causal otherwise) is not yet supported by vLLM's attention backends. ++from vllm.attention.layer import check_upstream_fa_availability + from vllm.config import VllmConfig + from vllm.distributed import parallel_state + from vllm.distributed import utils as dist_utils +@@ -298,10 +299,19 @@ class Qwen2_5_VisionAttention(nn.Module): + disable_tp=use_data_parallel) -+!!! note -+ `Gemma3nForConditionalGeneration` is only supported on V1 due to shared KV caching and it depends on `timm>=1.0.17` to make use of its -+ MobileNet-v5 vision backbone. -+ -+ Performance is not yet fully optimized mainly due to: -+ -+ - Both audio and vision MM encoders use `transformers.AutoModel` implementation. -+ - There's no PLE caching or out-of-memory swapping support, as described in [Google's blog](https://developers.googleblog.com/en/introducing-gemma-3n/). These features might be too model-specific for vLLM, and swapping in particular may be better suited for constrained setups. + # Detect attention implementation. +- self.attn_backend: _Backend = get_vit_attn_backend(support_fa=True) ++ self.attn_backend = get_vit_attn_backend( ++ head_size=self.hidden_size_per_attention_head, ++ dtype=torch.get_default_dtype()) ++ self.use_upstream_fa = False ++ if self.attn_backend != _Backend.FLASH_ATTN and \ ++ check_upstream_fa_availability( ++ torch.get_default_dtype()): ++ self.attn_backend = _Backend.FLASH_ATTN ++ self.use_upstream_fa = True + - !!! note - Only `InternVLChatModel` with Qwen2.5 text backbone (`OpenGVLab/InternVL3-2B`, `OpenGVLab/InternVL2.5-1B` etc) has video inputs support currently. + if self.attn_backend not in { + _Backend.FLASH_ATTN, _Backend.TORCH_SDPA, _Backend.XFORMERS, +- _Backend.ROCM_AITER_FA ++ _Backend.ROCM_AITER_FA, _Backend.IPEX + }: + raise RuntimeError( + f"Qwen2.5-VL does not support {self.attn_backend} backend now." 
+@@ -359,7 +369,10 @@ class Qwen2_5_VisionAttention(nn.Module): + if self.attn_backend == _Backend.ROCM_AITER_FA: + from aiter import flash_attn_varlen_func + else: +- from flash_attn import flash_attn_varlen_func ++ if self.use_upstream_fa: ++ from flash_attn import flash_attn_varlen_func ++ else: ++ from vllm.vllm_flash_attn import flash_attn_varlen_func -diff --git a/examples/offline_inference/basic/reward.py b/examples/offline_inference/basic/reward.py -new file mode 100644 -index 000000000..aec3481d2 ---- /dev/null -+++ b/examples/offline_inference/basic/reward.py -@@ -0,0 +1,55 @@ -+# SPDX-License-Identifier: Apache-2.0 -+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project -+ -+from argparse import Namespace -+ -+from vllm import LLM, EngineArgs -+from vllm.utils import FlexibleArgumentParser -+ -+ -+def parse_args(): -+ parser = FlexibleArgumentParser() -+ parser = EngineArgs.add_cli_args(parser) -+ # Set example specific arguments -+ parser.set_defaults( -+ model="internlm/internlm2-1_8b-reward", -+ #runner="pooling", -+ task="reward", -+ enforce_eager=True, -+ max_model_len=1024, -+ trust_remote_code=True, -+ ) -+ return parser.parse_args() -+ -+ -+def main(args: Namespace): -+ # Sample prompts. -+ prompts = [ -+ "Hello, my name is", -+ "The president of the United States is", -+ "The capital of France is", -+ "The future of AI is", -+ ] -+ -+ # Create an LLM. -+ # You should pass runner="pooling" for reward models -+ llm = LLM(**vars(args)) + q, k, v = (rearrange(x, "b s ... -> (b s) ...") for x in [q, k, v]) + +@@ -376,6 +389,38 @@ class Qwen2_5_VisionAttention(nn.Module): + context_layer = rearrange(output, + "(b s) ... -> b s ...", + b=batch_size) ++ elif self.attn_backend == _Backend.IPEX: ++ from vllm._ipex_ops import ipex_ops + -+ # Generate rewards. The output is a list of PoolingRequestOutput. -+ outputs = llm.reward(prompts) ++ q, k, v = (rearrange(x, "b s ... -> (b s) ...") for x in [q, k, v]) + -+ # Print the outputs. -+ print("\nGenerated Outputs:\n" + "-" * 60) -+ for prompt, output in zip(prompts, outputs): -+ rewards = output.outputs.data -+ rewards_trimmed = ( -+ (str(rewards[:16])[:-1] + ", ...]") if len(rewards) > 16 else rewards -+ ) -+ print(f"Prompt: {prompt!r} \nReward: {rewards_trimmed} (size={len(rewards)})") -+ print("-" * 60) -+ -+ -+if __name__ == "__main__": -+ args = parse_args() -+ main(args) -+ -diff --git a/examples/offline_inference/multilora_inference.py b/examples/offline_inference/multilora_inference.py -index f0c00bcaa..c8fa36295 100644 ---- a/examples/offline_inference/multilora_inference.py -+++ b/examples/offline_inference/multilora_inference.py -@@ -30,7 +30,7 @@ def create_test_prompts( - ( - "A robot may not injure a human being", - SamplingParams( -- temperature=0.0, logprobs=1, prompt_logprobs=1, max_tokens=128 -+ temperature=0.0, logprobs=1, max_tokens=128 - ), - None, - ), -@@ -46,7 +46,7 @@ def create_test_prompts( - SamplingParams( - temperature=0.0, - logprobs=1, -- prompt_logprobs=1, -+ #prompt_logprobs=1, - max_tokens=128, - stop_token_ids=[32003], - ), -@@ -57,7 +57,7 @@ def create_test_prompts( - SamplingParams( - temperature=0.0, - logprobs=1, -- prompt_logprobs=1, -+ #prompt_logprobs=1, - max_tokens=128, - stop_token_ids=[32003], - ), -@@ -99,14 +99,14 @@ def initialize_engine() -> LLMEngine: - # numbers will cause higher memory usage. If you know that all LoRAs will - # use the same rank, it is recommended to set this as low as possible. - # max_cpu_loras: controls the size of the CPU LoRA cache. 
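(The multi-LoRA example resumes just below.) The Qwen2.5-VL vision-attention hunks above select an attention backend and optionally fall back to the upstream `flash_attn` package or the IPEX ops. The snippet below is a simplified, standalone stand-in for that selection order, not the patch's actual `get_vit_attn_backend`/`check_upstream_fa_availability` logic.

```python
from enum import Enum, auto


class Backend(Enum):
    FLASH_ATTN = auto()
    IPEX = auto()
    TORCH_SDPA = auto()


def pick_vit_backend() -> tuple[Backend, bool]:
    """Return (backend, use_upstream_fa), probing by import availability."""
    try:
        from vllm.vllm_flash_attn import flash_attn_varlen_func  # noqa: F401
        return Backend.FLASH_ATTN, False  # vLLM's bundled flash-attn
    except ImportError:
        pass
    try:
        from flash_attn import flash_attn_varlen_func  # noqa: F401
        return Backend.FLASH_ATTN, True  # upstream flash-attn fallback
    except ImportError:
        pass
    try:
        from vllm._ipex_ops import ipex_ops  # noqa: F401
        return Backend.IPEX, False  # IPEX varlen attention path
    except ImportError:
        return Backend.TORCH_SDPA, False  # last-resort SDPA


print(pick_vit_backend())
```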
-- engine_args = EngineArgs( -- model="meta-llama/Llama-2-7b-hf", -- enable_lora=True, -- max_loras=1, -- max_lora_rank=8, -- max_cpu_loras=2, -- max_num_seqs=256, -- ) -+ engine_args = EngineArgs(model="meta-llama/Llama-2-7b-hf", -+ enable_lora=True, -+ max_loras=1, -+ max_lora_rank=8, -+ max_cpu_loras=2, -+ max_num_seqs=256, -+ enforce_eager=True, -+ block_size=64) - return LLMEngine.from_engine_args(engine_args) ++ output = torch.empty( ++ q.shape, ++ dtype=q.dtype, ++ device=q.device) ++ ipex_ops.varlen_attention( ++ q, ++ k, ++ v, ++ output, ++ cu_seqlens, ++ cu_seqlens, ++ None, ++ max_seqlen, ++ max_seqlen, ++ pdropout=0.0, ++ softmax_scale=1.0/(q.shape[-1] ** 0.5), ++ zero_tensors=False, ++ is_causal=False, ++ return_softmax=False, ++ gen_=None, ++ window_size_left=-1, ++ window_size_right=-1, ++ logits_soft_cap=-1, ++ ) ++ context_layer = rearrange(output, ++ "(b s) ... -> b s ...", ++ b=batch_size) + elif self.attn_backend == _Backend.TORCH_SDPA: + # Execute attention entry by entry for speed & less VRAM. + outputs = [] +@@ -628,7 +673,12 @@ class Qwen2_5_VisionTransformer(nn.Module): + prefix=f"{prefix}.merger", + use_data_parallel=use_data_parallel, + ) +- self.attn_backend: _Backend = get_vit_attn_backend(support_fa=True) ++ self.attn_backend = get_vit_attn_backend( ++ head_size=head_dim, dtype=torch.get_default_dtype()) ++ if self.attn_backend != _Backend.FLASH_ATTN and \ ++ check_upstream_fa_availability( ++ torch.get_default_dtype()): ++ self.attn_backend = _Backend.FLASH_ATTN + + @property + def dtype(self) -> torch.dtype: +@@ -714,6 +764,8 @@ class Qwen2_5_VisionTransformer(nn.Module): + max_seqlen = (cu_seqlens[1:] - cu_seqlens[:-1]).max().item() + elif self.attn_backend == _Backend.XFORMERS: + seqlens = (cu_seqlens[1:] - cu_seqlens[:-1]).tolist() ++ elif self.attn_backend == _Backend.IPEX: ++ max_seqlen = (cu_seqlens[1:] - cu_seqlens[:-1]).max().item() + return max_seqlen, seqlens + + @staticmethod +@@ -1210,10 +1262,10 @@ class Qwen2_5_VLForConditionalGeneration(nn.Module, SupportsMultiModal, + if image_input is None and video_input is None: + inputs_embeds = None + else: +- if uses_mrope(self.config): +- assert positions.ndim == 2 and positions.size(0) == 3, ( +- "multimodal section rotary embedding requires " +- f"(3, seq_len) positions, but got {positions.size()}") ++ # if uses_mrope(self.config): ++ # assert positions.ndim == 2 and positions.size(0) == 3, ( ++ # "multimodal section rotary embedding requires " ++ # f"(3, seq_len) positions, but got {positions.size()}") + inputs_embeds = self.get_input_embeddings_v0( + input_ids, + image_input=image_input, +diff --git a/vllm/model_executor/models/qwen2_vl.py b/vllm/model_executor/models/qwen2_vl.py +index 90a1ad2a6..e6da04df4 100644 +--- a/vllm/model_executor/models/qwen2_vl.py ++++ b/vllm/model_executor/models/qwen2_vl.py +@@ -41,6 +41,7 @@ from transformers.models.qwen2_vl.image_processing_qwen2_vl import smart_resize + from transformers.models.qwen2_vl.video_processing_qwen2_vl import ( + Qwen2VLVideoProcessor) ++from vllm.attention.layer import check_upstream_fa_availability + from vllm.config import VllmConfig + from vllm.distributed import parallel_state, tensor_model_parallel_all_gather + from vllm.distributed import utils as dist_utils +@@ -82,7 +83,7 @@ from .vision import get_vit_attn_backend + logger = init_logger(__name__) -diff --git a/examples/offline_inference/prithvi_geospatial_mae.py b/examples/offline_inference/prithvi_geospatial_mae.py -index 4fdc7a3cf..b6007b9f4 100644 ---- 
a/examples/offline_inference/prithvi_geospatial_mae.py -+++ b/examples/offline_inference/prithvi_geospatial_mae.py -@@ -3,12 +3,12 @@ - import argparse - import datetime - import os --import re - from typing import Union + # For profile run +-_MAX_FRAMES_PER_VIDEO = 16 ++_MAX_FRAMES_PER_VIDEO = 600 - import albumentations - import numpy as np - import rasterio -+import regex as re - import torch - from einops import rearrange - from terratorch.datamodules import Sen1Floods11NonGeoDataModule -diff --git a/examples/offline_inference/vision_language.py b/examples/offline_inference/vision_language.py -index e4811c023..fe4393bcf 100644 ---- a/examples/offline_inference/vision_language.py -+++ b/examples/offline_inference/vision_language.py -@@ -389,6 +389,39 @@ def run_tarsier(questions: list[str], modality: str) -> ModelRequestData: - ) + # === Vision Inputs === # +@@ -314,10 +315,19 @@ class Qwen2VisionAttention(nn.Module): + prefix=f"{prefix}.proj") -+# Intern-S1 -+def run_interns1(questions: list[str], modality: str) -> ModelRequestData: -+ model_name = "internlm/Intern-S1" -+ -+ engine_args = EngineArgs( -+ model=model_name, -+ trust_remote_code=True, -+ max_model_len=8192, -+ max_num_seqs=2, -+ limit_mm_per_prompt={modality: 1}, -+ enforce_eager=True, -+ ) -+ -+ if modality == "image": -+ placeholder = "" -+ elif modality == "video": -+ placeholder = "