diff --git a/vllm/Miner-U/README.md b/vllm/Miner-U/README.md
index 79e2c86..d79bb03 100644
--- a/vllm/Miner-U/README.md
+++ b/vllm/Miner-U/README.md
@@ -53,3 +53,5 @@ mineru-gradio --server-name 0.0.0.0 --server-port 7860
```
Refer to [here](https://opendatalab.github.io/MinerU/zh/usage/quick_usage/#_2) for more details.
+
+### Refer to [here](https://github.com/intel/llm-scaler/tree/main/vllm#243-mineru-26-support) for the new mineru-vllm version 2.6.1, which includes performance improvements.
diff --git a/vllm/README.md b/vllm/README.md
index f81ab0c..36e81d2 100644
--- a/vllm/README.md
+++ b/vllm/README.md
@@ -2278,16 +2278,9 @@ TORCH_LLM_ALLREDUCE=1 VLLM_USE_V1=1 CCL_ZE_IPC_EXCHANGE=pidfd VLLM_ALLOW_LONG_M
---
-### 2.4.3 MinerU 2.5 Support
+### 2.4.3 MinerU 2.6 Support
-This guide shows how to launch the MinerU 2.5 model using the vLLM inference backend.
-
-#### Install MinerU Core
-
-First, install the core MinerU package:
-```bash
-pip install mineru[core]
-```
+This guide shows how to launch the MinerU 2.6 model using the vLLM inference backend.
#### Start the MinerU Service
@@ -2307,7 +2300,10 @@ python3 -m vllm.entrypoints.openai.api_server \
--trust-remote-code \
--gpu-memory-util 0.85 \
--no-enable-prefix-caching \
+ --max-num-batched-tokens=32768 \
+ --max-model-len=32768 \
--block-size 64 \
+ --max-num-seqs 256 \
--served-model-name MinerU \
--tensor-parallel-size 1 \
--pipeline-parallel-size 1 \
@@ -2328,6 +2324,31 @@ To verify mineru
mineru -p /llm/MinerU/demo/pdfs/small_ocr.pdf -o ./ -b vlm-http-client -u http://127.0.0.1:8000
```
+2. Using Gradio
+
+```bash
+mineru-gradio --server-name 0.0.0.0 --server-port 8002
+```
+
+```python
+from gradio_client import Client, handle_file
+
+client = Client("http://localhost:8002/")
+result = client.predict(
+ file_path=handle_file('/llm/MinerU/demo/pdfs/small_ocr.pdf'),
+ end_pages=500,
+ is_ocr=False,
+ formula_enable=True,
+ table_enable=True,
+ language="ch",
+ backend="vlm-http-client",
+ url="http://localhost:8000",
+ api_name="/to_markdown"
+)
+print(result)
+```
+For more details, refer to Gradio's [API guide](http://your_ip:8002/?view=api).
+
---
### 2.5 Omni Model Support
@@ -2362,7 +2383,7 @@ python3 -m vllm.entrypoints.openai.api_server \
After starting the vLLM service, you can follow this link to use it
-#### [Qwen2.5-Omni input](https://github.com/QwenLM/Qwen2.5-Omni?tab=readme-ov-file#vllm-serve-usage)
+#### [Qwen-Omni input](https://github.com/QwenLM/Qwen2.5-Omni?tab=readme-ov-file#vllm-serve-usage)
```bash
curl http://localhost:8000/v1/chat/completions \
@@ -2383,6 +2404,25 @@ An example responce is listed below:
```json
{"id":"chatcmpl-xxx","object":"chat.completion","model":"Qwen2.5-Omni-7B","choices":[{"index":0,"message":{"role":"assistant","reasoning_content":null,"content":"The text in the image is \"TONGYI Qwen\". The sound in the audio is a cough.","tool_calls":[]},"logprobs":null,"finish_reason":"stop","stop_reason":null}],"usage":{"prompt_tokens":156,"total_tokens":180,"completion_tokens":24,"prompt_tokens_details":null},"prompt_logprobs":null,"kv_transfer_params":null}
```
+
+For video input, you can send a request like this:
+
+```bash
+curl -sS http://localhost:8000/v1/chat/completions -H "Content-Type: application/json" -d '{
+ "model": "Qwen3-Omni-30B-A3B-Instruct",
+ "temperature": 0,
+ "max_tokens": 1024,
+ "messages": [{
+ "role": "user",
+ "content": [
+ { "type": "text", "text": "Please describe the video comprehensively as much as possible." },
+ { "type": "video_url", "video_url": { "url": "https://raw.githubusercontent.com/EvolvingLMMs-Lab/sglang/dev/onevision_local/assets/jobs.mp4" } }
+ ]
+ }]
+ }'
+```
+
+
---
### 2.6 Data Parallelism (DP)
diff --git a/vllm/docker/Dockerfile b/vllm/docker/Dockerfile
index 2ba001c..2018d05 100644
--- a/vllm/docker/Dockerfile
+++ b/vllm/docker/Dockerfile
@@ -16,7 +16,6 @@ RUN wget -O- https://apt.repos.intel.com/intel-gpg-keys/GPG-PUB-KEY-INTEL-SW-PRO
RUN apt-get update -y && \
# apt-get install -y software-properties-common && \
# add-apt-repository ppa:deadsnakes/ppa && \
- apt-get update -y && \
apt-get install -y python3.12 python3.12-dev python3-pip && \
update-alternatives --install /usr/bin/python3 python3 /usr/bin/python3.12 1 && \
update-alternatives --install /usr/bin/python python /usr/bin/python3.12 1 && \
@@ -34,7 +33,6 @@ RUN apt-get update -y && \
vim \
linux-libc-dev && \
# Install Intel GPU runtime packages
- apt-get update -y && \
apt-get install -y intel-oneapi-dpcpp-ct=2025.1.0-452 && \
apt-get clean && rm -rf /var/lib/apt/lists/*
@@ -51,7 +49,7 @@ ENV VLLM_WORKER_MULTIPROC_METHOD=spawn
RUN python3 -m pip config set global.break-system-packages true
# Clone + patch vllm
-RUN git clone -b v0.10.0 https://github.com/vllm-project/vllm.git && \
+RUN git clone -b v0.10.2 https://github.com/vllm-project/vllm.git && \
cd vllm && \
git apply /tmp/vllm_for_multi_arc.patch && \
pip install --no-cache-dir -r requirements/xpu.txt && \
@@ -59,12 +57,12 @@ RUN git clone -b v0.10.0 https://github.com/vllm-project/vllm.git && \
python3 setup.py install
# Clone + patch miner-U
-RUN git clone https://github.com/opendatalab/MinerU.git && \
+RUN git clone -b release-2.6.2 https://github.com/opendatalab/MinerU.git && \
cd MinerU && \
- git checkout de41fa58590263e43b783fe224b6d07cae290a33 && \
- git apply /tmp/miner-u.patch && \
- pip install -e .[core] && \
- sed -i 's/select_device(self.args.device, verbose=verbose)/torch.device(self.args.device)/' /usr/local/lib/python3.12/dist-packages/ultralytics/engine/predictor.py
+ pip install -e .[core] --no-deps && \
+ pip install mineru_vl_utils==0.1.14 gradio gradio-client gradio-pdf && \
+ sed -i 's/kwargs.get("max_concurrency", 100)/kwargs.get("max_concurrency", 200)/' /llm/MinerU/mineru/backend/vlm/vlm_analyze.py && \
+ sed -i 's/kwargs.get("http_timeout", 600)/kwargs.get("http_timeout", 1200)/' /llm/MinerU/mineru/backend/vlm/vlm_analyze.py
# ======= Add oneCCL build =======
@@ -120,7 +118,9 @@ RUN pip install accelerate hf_transfer 'modelscope!=1.15.0'
# Pin transformers version to avoid conflict in vLLM
-RUN pip install "transformers<4.54.0"
+RUN pip install "transformers==4.57.0" && \
+ pip install librosa soundfile && \
+ pip install mineru[core]==2.5.4
# Set additional environment for production usage
diff --git a/vllm/patches/vllm_for_multi_arc.patch b/vllm/patches/vllm_for_multi_arc.patch
index e961e2a..c35c5b4 100644
--- a/vllm/patches/vllm_for_multi_arc.patch
+++ b/vllm/patches/vllm_for_multi_arc.patch
@@ -1,5 +1,5 @@
diff --git a/.buildkite/lm-eval-harness/run-lm-eval-gsm-vllm-baseline.sh b/.buildkite/lm-eval-harness/run-lm-eval-gsm-vllm-baseline.sh
-index b98d42aa7..b2a1ebef2 100644
+index 792f355c4..af2c24c4c 100644
--- a/.buildkite/lm-eval-harness/run-lm-eval-gsm-vllm-baseline.sh
+++ b/.buildkite/lm-eval-harness/run-lm-eval-gsm-vllm-baseline.sh
@@ -46,6 +46,6 @@ while getopts "m:b:l:f:t:" OPT; do
@@ -10,6 +10,107 @@ index b98d42aa7..b2a1ebef2 100644
+ --model_args "pretrained=$MODEL,tensor_parallel_size=$TP_SIZE,add_bos_token=true,distributed_executor_backend=mp,trust_remote_code=true,max_model_len=4096,enforce_eager=true,max_num_batched_tokens=4096" \
--tasks gsm8k --num_fewshot "$FEWSHOT" --limit "$LIMIT" \
--batch_size "$BATCH_SIZE"
+diff --git a/.buildkite/release-pipeline.yaml b/.buildkite/release-pipeline.yaml
+index 8c6ef7817..a1de41652 100644
+--- a/.buildkite/release-pipeline.yaml
++++ b/.buildkite/release-pipeline.yaml
+@@ -1,22 +1,24 @@
+ steps:
+ # aarch64 + CUDA builds. PyTorch 2.8 aarch64 + CUDA wheel is only available on CUDA 12.9
+ - label: "Build arm64 wheel - CUDA 12.9"
+- depends_on: ~
+ id: build-wheel-arm64-cuda-12-9
+ agents:
+ queue: arm64_cpu_queue_postmerge
+ commands:
+ # #NOTE: torch_cuda_arch_list is derived from upstream PyTorch build files here:
+ # https://github.com/pytorch/pytorch/blob/main/.ci/aarch64_linux/aarch64_ci_build.sh#L7
+- - "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --build-arg CUDA_VERSION=12.9.1 --build-arg VLLM_MAIN_CUDA_VERSION=12.9 --build-arg torch_cuda_arch_list='8.7 9.0 10.0+PTX 12.0' --tag vllm-ci:build-image --target build --progress plain -f docker/Dockerfile ."
++ - "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --build-arg CUDA_VERSION=12.9.1 --build-arg torch_cuda_arch_list='8.7 9.0 10.0+PTX 12.0' --tag vllm-ci:build-image --target build --progress plain -f docker/Dockerfile ."
+ - "mkdir artifacts"
+ - "docker run --rm -v $(pwd)/artifacts:/artifacts_host vllm-ci:build-image bash -c 'cp -r dist /artifacts_host && chmod -R a+rw /artifacts_host'"
+ - "bash .buildkite/scripts/upload-wheels.sh"
+ env:
+ DOCKER_BUILDKIT: "1"
+
++ - block: "Build CUDA 12.8 wheel"
++ key: block-build-cu128-wheel
++
+ - label: "Build wheel - CUDA 12.8"
+- depends_on: ~
++ depends_on: block-build-cu128-wheel
+ id: build-wheel-cuda-12-8
+ agents:
+ queue: cpu_queue_postmerge
+@@ -28,8 +30,12 @@ steps:
+ env:
+ DOCKER_BUILDKIT: "1"
+
+- - label: "Build wheel - CUDA 12.6"
++ - block: "Build CUDA 12.6 wheel"
++ key: block-build-cu126-wheel
+ depends_on: ~
++
++ - label: "Build wheel - CUDA 12.6"
++ depends_on: block-build-cu126-wheel
+ id: build-wheel-cuda-12-6
+ agents:
+ queue: cpu_queue_postmerge
+@@ -96,6 +102,8 @@ steps:
+ depends_on:
+ - create-multi-arch-manifest
+ - build-wheel-cuda-12-8
++ - build-wheel-cuda-12-6
++ - build-wheel-cuda-12-9
+ id: annotate-release-workflow
+ agents:
+ queue: cpu_queue_postmerge
+diff --git a/.buildkite/scripts/annotate-release.sh b/.buildkite/scripts/annotate-release.sh
+index fde48603a..94e0ac239 100755
+--- a/.buildkite/scripts/annotate-release.sh
++++ b/.buildkite/scripts/annotate-release.sh
+@@ -14,33 +14,18 @@ buildkite-agent annotate --style 'info' --context 'release-workflow' << EOF
+ To download the wheel:
+ \`\`\`
+ aws s3 cp s3://vllm-wheels/${RELEASE_VERSION}/vllm-${RELEASE_VERSION}-cp38-abi3-manylinux1_x86_64.whl .
+-aws s3 cp s3://vllm-wheels/${RELEASE_VERSION}/vllm-${RELEASE_VERSION}-cp38-abi3-manylinux2014_aarch64.whl .
+-
+ aws s3 cp s3://vllm-wheels/${RELEASE_VERSION}+cu126/vllm-${RELEASE_VERSION}+cu126-cp38-abi3-manylinux1_x86_64.whl .
+-aws s3 cp s3://vllm-wheels/${RELEASE_VERSION}+cu129/vllm-${RELEASE_VERSION}+cu129-cp38-abi3-manylinux1_x86_64.whl .
++aws s3 cp s3://vllm-wheels/${RELEASE_VERSION}+cu118/vllm-${RELEASE_VERSION}+cu118-cp38-abi3-manylinux1_x86_64.whl .
+ \`\`\`
+
+ To download and upload the image:
+
+ \`\`\`
+-docker pull public.ecr.aws/q9t5s3a7/vllm-release-repo:${BUILDKITE_COMMIT}-x86_64
+-docker pull public.ecr.aws/q9t5s3a7/vllm-release-repo:${BUILDKITE_COMMIT}-aarch64
+-
+-docker tag public.ecr.aws/q9t5s3a7/vllm-release-repo:${BUILDKITE_COMMIT}-x86_64 vllm/vllm-openai:x86_64
+-docker tag vllm/vllm-openai:x86_64 vllm/vllm-openai:latest-x86_64
+-docker tag vllm/vllm-openai:x86_64 vllm/vllm-openai:v${RELEASE_VERSION}-x86_64
+-docker push vllm/vllm-openai:latest-x86_64
+-docker push vllm/vllm-openai:v${RELEASE_VERSION}-x86_64
+-
+-docker tag public.ecr.aws/q9t5s3a7/vllm-release-repo:${BUILDKITE_COMMIT}-aarch64 vllm/vllm-openai:aarch64
+-docker tag vllm/vllm-openai:aarch64 vllm/vllm-openai:latest-aarch64
+-docker tag vllm/vllm-openai:aarch64 vllm/vllm-openai:v${RELEASE_VERSION}-aarch64
+-docker push vllm/vllm-openai:latest-aarch64
+-docker push vllm/vllm-openai:v${RELEASE_VERSION}-aarch64
+-
+-docker manifest create vllm/vllm-openai:latest vllm/vllm-openai:latest-x86_64 vllm/vllm-openai:latest-aarch64 --amend
+-docker manifest create vllm/vllm-openai:v${RELEASE_VERSION} vllm/vllm-openai:v${RELEASE_VERSION}-x86_64 vllm/vllm-openai:v${RELEASE_VERSION}-aarch64 --amend
+-docker manifest push vllm/vllm-openai:latest
+-docker manifest push vllm/vllm-openai:v${RELEASE_VERSION}
++docker pull public.ecr.aws/q9t5s3a7/vllm-release-repo:${BUILDKITE_COMMIT}
++docker tag public.ecr.aws/q9t5s3a7/vllm-release-repo:${BUILDKITE_COMMIT} vllm/vllm-openai
++docker tag vllm/vllm-openai vllm/vllm-openai:latest
++docker tag vllm/vllm-openai vllm/vllm-openai:v${RELEASE_VERSION}
++docker push vllm/vllm-openai:latest
++docker push vllm/vllm-openai:v${RELEASE_VERSION}
+ \`\`\`
+ EOF
+\ No newline at end of file
diff --git a/.github/workflows/ci.yaml b/.github/workflows/ci.yaml
new file mode 100644
index 000000000..aef250abe
@@ -388,78 +489,8 @@ index 000000000..eaa2f332a
+ else
+ echo "✅ All benchmarks passed"
+ fi
-diff --git a/.github/workflows/cleanup_pr_body.yml b/.github/workflows/cleanup_pr_body.yml
-deleted file mode 100644
-index d5c6b8d43..000000000
---- a/.github/workflows/cleanup_pr_body.yml
-+++ /dev/null
-@@ -1,31 +0,0 @@
--name: Cleanup PR Body
--
--on:
-- pull_request_target:
-- types: [opened, reopened, edited]
--
--permissions:
-- pull-requests: write
--
--jobs:
-- update-description:
-- runs-on: ubuntu-latest
--
-- steps:
-- - name: Checkout repository
-- uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2
--
-- - name: Set up Python
-- uses: actions/setup-python@42375524e23c412d93fb67b49958b491fce71c38 # v5.4.0
-- with:
-- python-version: '3.12'
--
-- - name: Install Python dependencies
-- run: |
-- python3 -m pip install --upgrade pip
-- python3 -m pip install regex
--
-- - name: Update PR description
-- env:
-- GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
-- run: bash .github/scripts/cleanup_pr_body.sh "${{ github.event.number }}"
-diff --git a/.github/workflows/reminder_comment.yml b/.github/workflows/reminder_comment.yml
-deleted file mode 100644
-index 16ae1aadb..000000000
---- a/.github/workflows/reminder_comment.yml
-+++ /dev/null
-@@ -1,27 +0,0 @@
--name: PR Reminder Comment Bot
--permissions:
-- pull-requests: write
--on:
-- pull_request_target:
-- types: [opened]
--jobs:
-- pr_reminder:
-- runs-on: ubuntu-latest
-- steps:
-- - name: Remind to run full CI on PR
-- uses: actions/github-script@60a0d83039c74a4aee543508d2ffcb1c3799cdea # v7.0.1
-- with:
-- script: |
-- github.rest.issues.createComment({
-- owner: context.repo.owner,
-- repo: context.repo.repo,
-- issue_number: context.issue.number,
-- body: '👋 Hi! Thank you for contributing to the vLLM project.\n\n' +
-- '💬 Join our developer Slack at https://slack.vllm.ai to discuss your PR in #pr-reviews, coordinate on features in #feat- channels, or join special interest groups in #sig- channels.\n\n' +
-- 'Just a reminder: PRs would not trigger full CI run by default. Instead, it would only run `fastcheck` CI which starts running only a small and essential subset of CI tests to quickly catch errors. You can run other CI tests on top of those by going to your `fastcheck` build on Buildkite UI (linked in the PR checks section) and unblock them. If you do not have permission to unblock, ping `simon-mo` or `khluu` to add you in our Buildkite org.\n\n' +
-- 'Once the PR is approved and ready to go, your PR reviewer(s) can run CI to test the changes comprehensively before merging.\n\n' +
-- 'To run CI, PR reviewers can either: Add `ready` label to the PR or enable auto-merge.\n\n' +
-- '🚀'
-- })
-- env:
-- GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
diff --git a/CMakeLists.txt b/CMakeLists.txt
-index 98ed682fe..5dd6e907c 100644
+index 3f1f9a781..fef10e2ca 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -95,6 +95,10 @@ if (NOT VLLM_TARGET_DEVICE STREQUAL "cuda" AND
@@ -474,7 +505,7 @@ index 98ed682fe..5dd6e907c 100644
return()
endif()
diff --git a/benchmarks/backend_request_func.py b/benchmarks/backend_request_func.py
-index c7229dbb8..72531f3fc 100644
+index ba7c733be..61a9eeb91 100644
--- a/benchmarks/backend_request_func.py
+++ b/benchmarks/backend_request_func.py
@@ -18,7 +18,7 @@ from transformers import AutoTokenizer, PreTrainedTokenizer, PreTrainedTokenizer
@@ -486,71 +517,8 @@ index c7229dbb8..72531f3fc 100644
@dataclass
-diff --git a/benchmarks/benchmark_serving.py b/benchmarks/benchmark_serving.py
-index c597fb106..5bad6645b 100644
---- a/benchmarks/benchmark_serving.py
-+++ b/benchmarks/benchmark_serving.py
-@@ -256,10 +256,11 @@ async def benchmark(
- raise ValueError(f"Unknown backend: {backend}")
-
- print("Starting initial single prompt test run...")
-+ # set test_output_len=10 to avoid long prompt test run
- test_prompt, test_prompt_len, test_output_len, test_mm_content = (
- input_requests[0].prompt,
- input_requests[0].prompt_len,
-- input_requests[0].expected_output_len,
-+ 10,
- input_requests[0].multi_modal_data,
- )
-
-diff --git a/benchmarks/benchmark_throughput.py b/benchmarks/benchmark_throughput.py
-index 14461121f..e9b9f0b77 100644
---- a/benchmarks/benchmark_throughput.py
-+++ b/benchmarks/benchmark_throughput.py
-@@ -44,6 +44,7 @@ def run_vllm(
- n: int,
- engine_args: EngineArgs,
- disable_detokenize: bool = False,
-+ do_profile: bool = False,
- ) -> tuple[float, Optional[list[RequestOutput]]]:
- from vllm import LLM, SamplingParams
-
-@@ -89,10 +90,14 @@ def run_vllm(
- outputs = None
- if not use_beam_search:
- start = time.perf_counter()
-+ if do_profile:
-+ llm.start_profile()
- outputs = llm.generate(
- prompts, sampling_params, lora_request=lora_requests, use_tqdm=True
- )
- end = time.perf_counter()
-+ if do_profile:
-+ llm.stop_profile()
- else:
- assert lora_requests is None, "BeamSearch API does not support LoRA"
- prompts = [request.prompt for request in requests]
-@@ -410,6 +415,7 @@ def main(args: argparse.Namespace):
- args.n,
- EngineArgs.from_cli_args(args),
- args.disable_detokenize,
-+ args.profile
- )
- elif args.backend == "hf":
- assert args.tensor_parallel_size == 1
-@@ -647,6 +653,10 @@ def create_argument_parser():
- parser.add_argument(
- "--num-prompts", type=int, default=1000, help="Number of prompts to process."
- )
-+ parser.add_argument("--profile",
-+ action='store_true',
-+ default=False,
-+ help="whether run with profiler.")
- parser.add_argument(
- "--hf-max-batch-size",
- type=int,
diff --git a/cmake/utils.cmake b/cmake/utils.cmake
-index 621179a70..9e1f4e9c7 100644
+index 9c0ed1d09..a21fb37f1 100644
--- a/cmake/utils.cmake
+++ b/cmake/utils.cmake
@@ -445,7 +445,7 @@ function (define_gpu_extension_target GPU_MOD_NAME)
@@ -562,7 +530,7 @@ index 621179a70..9e1f4e9c7 100644
# Add hipify preprocessing step when building with HIP/ROCm.
if (GPU_LANGUAGE STREQUAL "HIP")
-@@ -487,6 +487,11 @@ function (define_gpu_extension_target GPU_MOD_NAME)
+@@ -491,6 +491,11 @@ function (define_gpu_extension_target GPU_MOD_NAME)
target_link_libraries(${GPU_MOD_NAME} PRIVATE torch ${GPU_LIBRARIES})
@@ -7954,76 +7922,41 @@ index 000000000..23f5b805c
+
+#endif
\ No newline at end of file
+diff --git a/docker/Dockerfile b/docker/Dockerfile
+index d4761e84f..307e9658f 100644
+--- a/docker/Dockerfile
++++ b/docker/Dockerfile
+@@ -196,7 +196,6 @@ ARG SCCACHE_S3_NO_CREDENTIALS=0
+
+ # Flag to control whether to use pre-built vLLM wheels
+ ARG VLLM_USE_PRECOMPILED=""
+-ARG VLLM_MAIN_CUDA_VERSION=""
+
+ # if USE_SCCACHE is set, use sccache to speed up compilation
+ RUN --mount=type=cache,target=/root/.cache/uv \
+@@ -214,7 +213,6 @@ RUN --mount=type=cache,target=/root/.cache/uv \
+ && export SCCACHE_IDLE_TIMEOUT=0 \
+ && export CMAKE_BUILD_TYPE=Release \
+ && export VLLM_USE_PRECOMPILED="${VLLM_USE_PRECOMPILED}" \
+- && export VLLM_MAIN_CUDA_VERSION="${VLLM_MAIN_CUDA_VERSION}" \
+ && export VLLM_DOCKER_BUILD_CONTEXT=1 \
+ && sccache --show-stats \
+ && python3 setup.py bdist_wheel --dist-dir=dist --py-limited-api=cp38 \
diff --git a/docker/Dockerfile.xpu b/docker/Dockerfile.xpu
-index 7d5a589eb..25a9fd7cd 100644
+index ef4223525..ffa7c6ea7 100644
--- a/docker/Dockerfile.xpu
+++ b/docker/Dockerfile.xpu
-@@ -1,9 +1,10 @@
--# oneapi 2025.0.2 docker base image use rolling 2448 package. https://dgpu-docs.intel.com/releases/packages.html?release=Rolling+2448.13&os=Ubuntu+22.04, and we don't need install driver manually.
--FROM intel/deep-learning-essentials:2025.0.2-0-devel-ubuntu22.04 AS vllm-base
-+FROM intel/deep-learning-essentials:2025.1.3-0-devel-ubuntu24.04 AS vllm-base
-
--RUN rm /etc/apt/sources.list.d/intel-graphics.list
-+RUN wget -O- https://apt.repos.intel.com/intel-gpg-keys/GPG-PUB-KEY-INTEL-SW-PRODUCTS.PUB | gpg --dearmor | tee /usr/share/keyrings/oneapi-archive-keyring.gpg > /dev/null && \
-+ echo "deb [signed-by=/usr/share/keyrings/oneapi-archive-keyring.gpg] https://apt.repos.intel.com/oneapi all main" | tee /etc/apt/sources.list.d/oneAPI.list && \
-+ add-apt-repository -y ppa:kobuk-team/intel-graphics
-
--RUN apt-get update -y && \
-+RUN apt clean && apt-get update -y && \
- apt-get install -y --no-install-recommends --fix-missing \
- curl \
- ffmpeg \
-@@ -14,15 +15,29 @@ RUN apt-get update -y && \
- libgl1 \
- lsb-release \
- numactl \
-- python3 \
-- python3-dev \
-- python3-pip \
-- wget
-+ wget \
-+ vim \
-+ python3.12 \
-+ python3.12-dev \
-+ python3-pip
-+
-+RUN update-alternatives --install /usr/bin/python3 python3 /usr/bin/python3.12 1
-+RUN update-alternatives --install /usr/bin/python python /usr/bin/python3.12 1
-+
-+RUN apt install -y libze1=1.23.1-1~24.04~ppa1 libze-dev=1.23.1-1~24.04~ppa1 libze-intel-gpu1=25.27.34303.9-1~24.04~ppa1 intel-opencl-icd=25.27.34303.9-1~24.04~ppa1 libze-intel-gpu-raytracing=1.1.0-114~u24.04
-+
-+RUN wget https://github.com/uxlfoundation/oneCCL/releases/download/2021.15.4/intel-oneccl-2021.15.4.11_offline.sh
-+RUN bash intel-oneccl-2021.15.4.11_offline.sh -a --silent --eula accept && echo "source /opt/intel/oneapi/setvars.sh --force" >> /root/.bashrc
-+SHELL ["bash", "-c"]
-+CMD ["bash", "-c", "source /root/.bashrc && exec bash"]
-
- WORKDIR /workspace/vllm
- COPY requirements/xpu.txt /workspace/vllm/requirements/xpu.txt
- COPY requirements/common.txt /workspace/vllm/requirements/common.txt
-
-+# suppress the python externally managed environment error
-+RUN python3 -m pip config set global.break-system-packages true
-+
- RUN --mount=type=cache,target=/root/.cache/pip \
- pip install --no-cache-dir \
- -r requirements/xpu.txt
-@@ -47,10 +62,11 @@ FROM vllm-base AS vllm-openai
+@@ -62,7 +62,7 @@ FROM vllm-base AS vllm-openai
# install additional dependencies for openai api server
RUN --mount=type=cache,target=/root/.cache/pip \
- pip install accelerate hf_transfer pytest pytest_asyncio lm_eval[api] modelscope
+ pip install accelerate hf_transfer pytest pytest_asyncio lm_eval[api] 'modelscope!=1.15.0'
-+
-+RUN --mount=type=cache,target=/root/.cache/pip \
-+ pip uninstall oneccl oneccl-devel -y
--ENV VLLM_USAGE_SOURCE production-docker-image \
-- TRITON_XPU_PROFILE 1
- # install development dependencies (for testing)
- RUN python3 -m pip install -e tests/vllm_test_utils
- ENTRYPOINT ["python3", "-m", "vllm.entrypoints.openai.api_server"]
+ RUN --mount=type=cache,target=/root/.cache/pip \
+ pip uninstall oneccl oneccl-devel -y
diff --git a/docs/features/quantization/fp8.md b/docs/features/quantization/fp8.md
-index 0661933ac..469d88a05 100644
+index 834c03cbe..439e1e0d7 100644
--- a/docs/features/quantization/fp8.md
+++ b/docs/features/quantization/fp8.md
@@ -134,4 +134,4 @@ print(result[0].outputs[0].text)
@@ -8033,210 +7966,173 @@ index 0661933ac..469d88a05 100644
- Currently, we load the model at original precision before quantizing down to 8-bits, so you need enough memory to load the whole model.
+ Currently, by default we load the model at original precision before quantizing down to 8-bits, so you need enough memory to load the whole model. To avoid this, adding `VLLM_OFFLOAD_WEIGHTS_BEFORE_QUANT=1` can allow offloading weights to cpu before quantization and quantized weights will be kept in device.
diff --git a/docs/models/supported_models.md b/docs/models/supported_models.md
-index c8b6c6c86..404045306 100644
+index db3dd2c25..7d3577b14 100644
--- a/docs/models/supported_models.md
+++ b/docs/models/supported_models.md
-@@ -592,7 +592,8 @@ Specified using `--task generate`.
- | `GraniteSpeechForConditionalGeneration` | Granite Speech | T + A | `ibm-granite/granite-speech-3.3-8b` | ✅︎ | ✅︎ | ✅︎ |
- | `H2OVLChatModel` | H2OVL | T + IE+ | `h2oai/h2ovl-mississippi-800m`, `h2oai/h2ovl-mississippi-2b`, etc. | | ✅︎ | ✅︎ |
- | `Idefics3ForConditionalGeneration` | Idefics3 | T + I | `HuggingFaceM4/Idefics3-8B-Llama3`, etc. | ✅︎ | | ✅︎ |
--| `InternVLChatModel` | InternVL 3.0, InternVideo 2.5, InternVL 2.5, Mono-InternVL, InternVL 2.0 | T + IE+ + (VE+) | `OpenGVLab/InternVL3-9B`, `OpenGVLab/InternVideo2_5_Chat_8B`, `OpenGVLab/InternVL2_5-4B`, `OpenGVLab/Mono-InternVL-2B`, `OpenGVLab/InternVL2-4B`, etc. | ✅︎ | ✅︎ | ✅︎ |
-+| `InternS1ForConditionalGeneration` | Intern-S1 | T + IE+ + VE+ | `internlm/Intern-S1`, etc. | ✅︎ | ✅︎ | ✅︎ |
-+| `InternVLChatModel` | InternVL 3.5, InternVL 3.0, InternVideo 2.5, InternVL 2.5, Mono-InternVL, InternVL 2.0 | T + IE+ + (VE+) | `OpenGVLab/InternVL3_5-14B`, `OpenGVLab/InternVL3-9B`, `OpenGVLab/InternVideo2_5_Chat_8B`, `OpenGVLab/InternVL2_5-4B`, `OpenGVLab/Mono-InternVL-2B`, `OpenGVLab/InternVL2-4B`, etc. | ✅︎ | ✅︎ | ✅︎ |
- | `KeyeForConditionalGeneration` | Keye-VL-8B-Preview | T + IE+ + VE+ | `Kwai-Keye/Keye-VL-8B-Preview` | | | ✅︎ |
- | `KimiVLForConditionalGeneration` | Kimi-VL-A3B-Instruct, Kimi-VL-A3B-Thinking | T + I+ | `moonshotai/Kimi-VL-A3B-Instruct`, `moonshotai/Kimi-VL-A3B-Thinking` | | | ✅︎ |
- | `Llama4ForConditionalGeneration` | Llama 4 | T + I+ | `meta-llama/Llama-4-Scout-17B-16E-Instruct`, `meta-llama/Llama-4-Maverick-17B-128E-Instruct-FP8`, `meta-llama/Llama-4-Maverick-17B-128E-Instruct`, etc. | | ✅︎ | ✅︎ |
-@@ -602,7 +603,7 @@ Specified using `--task generate`.
- | `LlavaNextVideoForConditionalGeneration` | LLaVA-NeXT-Video | T + V | `llava-hf/LLaVA-NeXT-Video-7B-hf`, etc. | | ✅︎ | ✅︎ |
- | `LlavaOnevisionForConditionalGeneration` | LLaVA-Onevision | T + I+ + V+ | `llava-hf/llava-onevision-qwen2-7b-ov-hf`, `llava-hf/llava-onevision-qwen2-0.5b-ov-hf`, etc. | | ✅︎ | ✅︎ |
- | `MiniCPMO` | MiniCPM-O | T + IE+ + VE+ + AE+ | `openbmb/MiniCPM-o-2_6`, etc. | ✅︎ | ✅︎ | ✅︎ |
--| `MiniCPMV` | MiniCPM-V | T + IE+ + VE+ | `openbmb/MiniCPM-V-2` (see note), `openbmb/MiniCPM-Llama3-V-2_5`, `openbmb/MiniCPM-V-2_6`, etc. | ✅︎ | | ✅︎ |
-+| `MiniCPMV` | MiniCPM-V | T + IE+ + VE+ | `openbmb/MiniCPM-V-2` (see note), `openbmb/MiniCPM-Llama3-V-2_5`, `openbmb/MiniCPM-V-2_6`, `openbmb/MiniCPM-V-4`, `openbmb/MiniCPM-V-4_5`, etc. | ✅︎ | | ✅︎ |
- | `MiniMaxVL01ForConditionalGeneration` | MiniMax-VL | T + IE+ | `MiniMaxAI/MiniMax-VL-01`, etc. | | ✅︎ | ✅︎ |
- | `Mistral3ForConditionalGeneration` | Mistral3 (HF Transformers) | T + I+ | `mistralai/Mistral-Small-3.1-24B-Instruct-2503`, etc. | ✅︎ | ✅︎ | ✅︎ |
- | `MllamaForConditionalGeneration` | Llama 3.2 | T + I+ | `meta-llama/Llama-3.2-90B-Vision-Instruct`, `meta-llama/Llama-3.2-11B-Vision`, etc. | | | |
-@@ -646,6 +647,15 @@ Specified using `--task generate`.
+@@ -340,6 +340,7 @@ th {
+ | `DeepseekV2ForCausalLM` | DeepSeek-V2 | `deepseek-ai/DeepSeek-V2`, `deepseek-ai/DeepSeek-V2-Chat`, etc. | ✅︎ | ✅︎ | ✅︎ |
+ | `DeepseekV3ForCausalLM` | DeepSeek-V3 | `deepseek-ai/DeepSeek-V3`, `deepseek-ai/DeepSeek-R1`, `deepseek-ai/DeepSeek-V3.1`, etc. | ✅︎ | ✅︎ | ✅︎ |
+ | `Dots1ForCausalLM` | dots.llm1 | `rednote-hilab/dots.llm1.base`, `rednote-hilab/dots.llm1.inst`, etc. | | ✅︎ | ✅︎ |
++| `DotsOCRForCausalLM` | dots_ocr | `rednote-hilab/dots.ocr` | | ✅︎ | ✅︎ |
+ | `Ernie4_5ForCausalLM` | Ernie4.5 | `baidu/ERNIE-4.5-0.3B-PT`, etc. | ✅︎ | ✅︎ | ✅︎ |
+ | `Ernie4_5_MoeForCausalLM` | Ernie4.5MoE | `baidu/ERNIE-4.5-21B-A3B-PT`, `baidu/ERNIE-4.5-300B-A47B-PT`, etc. |✅︎| ✅︎ | ✅︎ |
+ | `ExaoneForCausalLM` | EXAONE-3 | `LGAI-EXAONE/EXAONE-3.0-7.8B-Instruct`, etc. | ✅︎ | ✅︎ | ✅︎ |
+@@ -667,6 +668,9 @@ These models primarily accept the [`LLM.generate`](./generative_models.md#llmgen
+ | `Qwen2VLForConditionalGeneration` | QVQ, Qwen2-VL | T + IE+ + VE+ | `Qwen/QVQ-72B-Preview`, `Qwen/Qwen2-VL-7B-Instruct`, `Qwen/Qwen2-VL-72B-Instruct`, etc. | ✅︎ | ✅︎ | ✅︎ |
+ | `Qwen2_5_VLForConditionalGeneration` | Qwen2.5-VL | T + IE+ + VE+ | `Qwen/Qwen2.5-VL-3B-Instruct`, `Qwen/Qwen2.5-VL-72B-Instruct`, etc. | ✅︎ | ✅︎ | ✅︎ |
+ | `Qwen2_5OmniThinkerForConditionalGeneration` | Qwen2.5-Omni | T + IE+ + VE+ + A+ | `Qwen/Qwen2.5-Omni-3B`, `Qwen/Qwen2.5-Omni-7B` | ✅︎ | ✅︎ | ✅︎ |
++| `Qwen3VLForConditionalGeneration` | Qwen3-VL | T + IE+ + VE+ | `Qwen/Qwen3-VL-4B-Instruct`, etc. | ✅︎ | ✅︎ | ✅︎ |
++| `Qwen3VLMoeForConditionalGeneration` | Qwen3-VL-MOE | T + IE+ + VE+ | `Qwen/Qwen3-VL-30B-A3B-Instruct`, etc. | ✅︎ | ✅︎ | ✅︎ |
++| `Qwen3OmniMoeThinkerForConditionalGeneration` | Qwen3-Omni | T + IE+ + VE+ + A+ | `Qwen/Qwen3-Omni-30B-A3B-Instruct`, `Qwen/Qwen3-Omni-30B-A3B-Thinking` | ✅︎ | ✅︎ | ✅︎ |
+ | `RForConditionalGeneration` | R-VL-4B | T + IE+ | `YannQi/R-4B` | | ✅︎ | ✅︎ |
+ | `SkyworkR1VChatModel` | Skywork-R1V-38B | T + I | `Skywork/Skywork-R1V-38B` | | ✅︎ | ✅︎ |
+ | `SmolVLMForConditionalGeneration` | SmolVLM2 | T + I | `SmolVLM2-2.2B-Instruct` | ✅︎ | | ✅︎ |
+@@ -757,8 +761,7 @@ Some models are supported only via the [Transformers backend](#transformers). Th
+ Our PaliGemma implementations have the same problem as Gemma 3 (see above) for both V0 and V1.
- This limitation exists because the model's mixed attention pattern (bidirectional for images, causal otherwise) is not yet supported by vLLM's attention backends.
-
-+!!! note
-+ `Gemma3nForConditionalGeneration` is only supported on V1 due to shared KV caching and it depends on `timm>=1.0.17` to make use of its
-+ MobileNet-v5 vision backbone.
-+
-+ Performance is not yet fully optimized mainly due to:
-+
-+ - Both audio and vision MM encoders use `transformers.AutoModel` implementation.
-+ - There's no PLE caching or out-of-memory swapping support, as described in [Google's blog](https://developers.googleblog.com/en/introducing-gemma-3n/). These features might be too model-specific for vLLM, and swapping in particular may be better suited for constrained setups.
-+
!!! note
- Only `InternVLChatModel` with Qwen2.5 text backbone (`OpenGVLab/InternVL3-2B`, `OpenGVLab/InternVL2.5-1B` etc) has video inputs support currently.
+- For Qwen2.5-Omni, reading audio from video pre-processing (`--mm-processor-kwargs '{"use_audio_in_video": true}'`)
+- is currently supported on V0 (but not V1), because overlapping modalities is not yet supported in V1.
++ For Qwen2.5-Omni and Qwen3-Omni, reading audio from video pre-processing (`--mm-processor-kwargs '{"use_audio_in_video": true}'`) is currently work in progress and not yet supported.
-diff --git a/examples/offline_inference/basic/reward.py b/examples/offline_inference/basic/reward.py
+ #### Transcription
+
+diff --git a/examples/bmg/reasoning.py b/examples/bmg/reasoning.py
new file mode 100644
-index 000000000..aec3481d2
+index 000000000..04f91786e
--- /dev/null
-+++ b/examples/offline_inference/basic/reward.py
-@@ -0,0 +1,55 @@
-+# SPDX-License-Identifier: Apache-2.0
-+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
++++ b/examples/bmg/reasoning.py
+@@ -0,0 +1,27 @@
++from openai import OpenAI
+
-+from argparse import Namespace
++# Modify OpenAI's API key and API base to use vLLM's API server.
++openai_api_key = "EMPTY"
++openai_api_base = "http://0.0.0.0:8000/v1"
+
-+from vllm import LLM, EngineArgs
-+from vllm.utils import FlexibleArgumentParser
++client = OpenAI(
++ api_key=openai_api_key,
++ base_url=openai_api_base,
++)
+
++models = client.models.list()
++model = models.data[0].id
+
-+def parse_args():
-+ parser = FlexibleArgumentParser()
-+ parser = EngineArgs.add_cli_args(parser)
-+ # Set example specific arguments
-+ parser.set_defaults(
-+ model="internlm/internlm2-1_8b-reward",
-+ #runner="pooling",
-+ task="reward",
-+ enforce_eager=True,
-+ max_model_len=1024,
-+ trust_remote_code=True,
-+ )
-+ return parser.parse_args()
++# Round 1
++messages = [{"role": "user", "content": "9.11 and 9.8, which is greater?"}]
++# For granite, add: `extra_body={"chat_template_kwargs": {"thinking": True}}`
++# For Qwen3 series, if you want to disable thinking in reasoning mode, add:
++# extra_body={"chat_template_kwargs": {"enable_thinking": False}}
++response = client.chat.completions.create(model=model, messages=messages)
+
++reasoning_content = response.choices[0].message.reasoning_content
++content = response.choices[0].message.content
+
-+def main(args: Namespace):
-+ # Sample prompts.
-+ prompts = [
-+ "Hello, my name is",
-+ "The president of the United States is",
-+ "The capital of France is",
-+ "The future of AI is",
-+ ]
++print("reasoning_content:", reasoning_content)
++print("content:", content)
+
-+ # Create an LLM.
-+ # You should pass runner="pooling" for reward models
-+ llm = LLM(**vars(args))
+diff --git a/examples/bmg/tooling.py b/examples/bmg/tooling.py
+new file mode 100644
+index 000000000..bf8375831
+--- /dev/null
++++ b/examples/bmg/tooling.py
+@@ -0,0 +1,38 @@
++import json
++from openai import OpenAI
+
-+ # Generate rewards. The output is a list of PoolingRequestOutput.
-+ outputs = llm.reward(prompts)
++client = OpenAI(base_url="http://0.0.0.0:8000/v1", api_key="dummy")
++
++def get_weather(location: str, unit: str):
++ return f"Getting the weather for {location} in {unit}..."
++tool_functions = {"get_weather": get_weather}
++
++tools = [{
++ "type": "function",
++ "function": {
++ "name": "get_weather",
++ "description": "Get the current weather in a given location",
++ "parameters": {
++ "type": "object",
++ "properties": {
++ "location": {"type": "string", "description": "City and state, e.g., 'San Francisco, CA'"},
++ "unit": {"type": "string", "enum": ["celsius", "fahrenheit"]}
++ },
++ "required": ["location", "unit"]
++ }
++ }
++}]
++
++response = client.chat.completions.create(
++ model=client.models.list().data[0].id,
++ messages=[{"role": "user", "content": "What's the weather like in San Francisco?"}],
++ tools=tools,
++ temperature=0,
++ tool_choice="auto"
++)
+
-+ # Print the outputs.
-+ print("\nGenerated Outputs:\n" + "-" * 60)
-+ for prompt, output in zip(prompts, outputs):
-+ rewards = output.outputs.data
-+ rewards_trimmed = (
-+ (str(rewards[:16])[:-1] + ", ...]") if len(rewards) > 16 else rewards
-+ )
-+ print(f"Prompt: {prompt!r} \nReward: {rewards_trimmed} (size={len(rewards)})")
-+ print("-" * 60)
-+
-+
-+if __name__ == "__main__":
-+ args = parse_args()
-+ main(args)
-+
-diff --git a/examples/offline_inference/multilora_inference.py b/examples/offline_inference/multilora_inference.py
-index f0c00bcaa..c8fa36295 100644
---- a/examples/offline_inference/multilora_inference.py
-+++ b/examples/offline_inference/multilora_inference.py
-@@ -30,7 +30,7 @@ def create_test_prompts(
- (
- "A robot may not injure a human being",
- SamplingParams(
-- temperature=0.0, logprobs=1, prompt_logprobs=1, max_tokens=128
-+ temperature=0.0, logprobs=1, max_tokens=128
- ),
- None,
- ),
-@@ -46,7 +46,7 @@ def create_test_prompts(
- SamplingParams(
- temperature=0.0,
- logprobs=1,
-- prompt_logprobs=1,
-+ #prompt_logprobs=1,
- max_tokens=128,
- stop_token_ids=[32003],
- ),
-@@ -57,7 +57,7 @@ def create_test_prompts(
- SamplingParams(
- temperature=0.0,
- logprobs=1,
-- prompt_logprobs=1,
-+ #prompt_logprobs=1,
- max_tokens=128,
- stop_token_ids=[32003],
- ),
-@@ -99,14 +99,14 @@ def initialize_engine() -> LLMEngine:
- # numbers will cause higher memory usage. If you know that all LoRAs will
- # use the same rank, it is recommended to set this as low as possible.
- # max_cpu_loras: controls the size of the CPU LoRA cache.
-- engine_args = EngineArgs(
-- model="meta-llama/Llama-2-7b-hf",
-- enable_lora=True,
-- max_loras=1,
-- max_lora_rank=8,
-- max_cpu_loras=2,
-- max_num_seqs=256,
-- )
-+ engine_args = EngineArgs(model="meta-llama/Llama-2-7b-hf",
-+ enable_lora=True,
-+ max_loras=1,
-+ max_lora_rank=8,
-+ max_cpu_loras=2,
-+ max_num_seqs=256,
-+ enforce_eager=True,
-+ block_size=64)
- return LLMEngine.from_engine_args(engine_args)
-
-
-diff --git a/examples/offline_inference/prithvi_geospatial_mae.py b/examples/offline_inference/prithvi_geospatial_mae.py
-index 4fdc7a3cf..b6007b9f4 100644
---- a/examples/offline_inference/prithvi_geospatial_mae.py
-+++ b/examples/offline_inference/prithvi_geospatial_mae.py
-@@ -3,12 +3,12 @@
- import argparse
- import datetime
- import os
--import re
- from typing import Union
-
- import albumentations
- import numpy as np
- import rasterio
-+import regex as re
- import torch
- from einops import rearrange
- from terratorch.datamodules import Sen1Floods11NonGeoDataModule
++tool_call = response.choices[0].message.tool_calls[0].function
++print(f"Function called: {tool_call.name}")
++print(f"Arguments: {tool_call.arguments}")
++print(f"Result: {tool_functions[tool_call.name](**json.loads(tool_call.arguments))}") 30,22 Bot
++
+diff --git a/examples/offline_inference/data_parallel.py b/examples/offline_inference/data_parallel.py
+index 36d805a32..2a4233b6a 100644
+--- a/examples/offline_inference/data_parallel.py
++++ b/examples/offline_inference/data_parallel.py
+@@ -96,6 +96,13 @@ def parse_args():
+ "--quantization",
+ type=str,
+ )
++ parser.add_argument(
++ "--disable-expert-parallel",
++ dest="enable_expert_parallel",
++ action="store_false",
++ help="Disable expert parallel (default: enabled).",
++ )
++ parser.set_defaults(enable_expert_parallel=True)
+ return parser.parse_args()
+
+
+@@ -108,6 +115,7 @@ def main(
+ dp_master_port,
+ GPUs_per_dp_rank,
+ enforce_eager,
++ enable_expert_parallel,
+ trust_remote_code,
+ max_num_seqs,
+ max_model_len,
+@@ -162,7 +170,7 @@ def main(
+ model=model,
+ tensor_parallel_size=GPUs_per_dp_rank,
+ enforce_eager=enforce_eager,
+- enable_expert_parallel=True,
++ enable_expert_parallel=enable_expert_parallel,
+ trust_remote_code=trust_remote_code,
+ max_num_seqs=max_num_seqs,
+ max_model_len=max_model_len,
+@@ -222,6 +230,7 @@ if __name__ == "__main__":
+ dp_master_port,
+ tp_size,
+ args.enforce_eager,
++ args.enable_expert_parallel,
+ args.trust_remote_code,
+ args.max_num_seqs,
+ args.max_model_len,
diff --git a/examples/offline_inference/vision_language.py b/examples/offline_inference/vision_language.py
-index e4811c023..fe4393bcf 100644
+index b104113b8..58fb423e8 100644
--- a/examples/offline_inference/vision_language.py
+++ b/examples/offline_inference/vision_language.py
-@@ -389,6 +389,39 @@ def run_tarsier(questions: list[str], modality: str) -> ModelRequestData:
+@@ -126,6 +126,23 @@ def run_chameleon(questions: list[str], modality: str) -> ModelRequestData:
)
-+# Intern-S1
-+def run_interns1(questions: list[str], modality: str) -> ModelRequestData:
-+ model_name = "internlm/Intern-S1"
++# Dots-OCR
++def run_dots_ocr(questions: list[str], modality: str) -> ModelRequestData:
++ assert modality == "image"
+
++ prompts = [f"<|img|><|imgpad|><|endofimg|>{question}" for question in questions]
+ engine_args = EngineArgs(
-+ model=model_name,
-+ trust_remote_code=True,
-+ max_model_len=8192,
-+ max_num_seqs=2,
++ model="rednote-hilab/dots.ocr",
+ limit_mm_per_prompt={modality: 1},
-+ enforce_eager=True,
-+ )
-+
-+ if modality == "image":
-+ placeholder = ""
-+ elif modality == "video":
-+ placeholder = "