diff --git a/vllm/Miner-U/README.md b/vllm/Miner-U/README.md
index 79e2c86..d79bb03 100644
--- a/vllm/Miner-U/README.md
+++ b/vllm/Miner-U/README.md
@@ -53,3 +53,5 @@ mineru-gradio --server-name 0.0.0.0 --server-port 7860
```
Refer to [here](https://opendatalab.github.io/MinerU/zh/usage/quick_usage/#_2) for more details.
+
+### Refer to [here](https://github.com/intel/llm-scaler/tree/main/vllm#243-mineru-26-support) for the new mineru-vllm version 2.6.1, which includes performance improvements.
diff --git a/vllm/README.md b/vllm/README.md
index f81ab0c..36e81d2 100644
--- a/vllm/README.md
+++ b/vllm/README.md
@@ -2278,16 +2278,9 @@ TORCH_LLM_ALLREDUCE=1 VLLM_USE_V1=1 CCL_ZE_IPC_EXCHANGE=pidfd VLLM_ALLOW_LONG_M
---
-### 2.4.3 MinerU 2.5 Support
+### 2.4.3 MinerU 2.6 Support
-This guide shows how to launch the MinerU 2.5 model using the vLLM inference backend.
-
-#### Install MinerU Core
-
-First, install the core MinerU package:
-```bash
-pip install mineru[core]
-```
+This guide shows how to launch the MinerU 2.6 model using the vLLM inference backend.
#### Start the MinerU Service
@@ -2307,7 +2300,10 @@ python3 -m vllm.entrypoints.openai.api_server \
--trust-remote-code \
--gpu-memory-util 0.85 \
--no-enable-prefix-caching \
+ --max-num-batched-tokens=32768 \
+ --max-model-len=32768 \
--block-size 64 \
+ --max-num-seqs 256 \
--served-model-name MinerU \
--tensor-parallel-size 1 \
--pipeline-parallel-size 1 \
@@ -2328,6 +2324,31 @@ To verify mineru
mineru -p /llm/MinerU/demo/pdfs/small_ocr.pdf -o ./ -b vlm-http-client -u http://127.0.0.1:8000
```
+2. Using Gradio
+
+```bash
+mineru-gradio --server-name 0.0.0.0 --server-port 8002
+```
+
+```python
+from gradio_client import Client, handle_file
+
+client = Client("http://localhost:8002/")
+result = client.predict(
+ file_path=handle_file('/llm/MinerU/demo/pdfs/small_ocr.pdf'),
+ end_pages=500,
+ is_ocr=False,
+ formula_enable=True,
+ table_enable=True,
+ language="ch",
+ backend="vlm-http-client",
+ url="http://localhost:8000",
+ api_name="/to_markdown"
+)
+print(result)
+```
+For more details, refer to Gradio's [API guide](http://your_ip:8002/?view=api).
+
---
### 2.5 Omni Model Support
@@ -2362,7 +2383,7 @@ python3 -m vllm.entrypoints.openai.api_server \
After starting the vLLM service, you can follow this link to use it
-#### [Qwen2.5-Omni input](https://github.com/QwenLM/Qwen2.5-Omni?tab=readme-ov-file#vllm-serve-usage)
+#### [Qwen-Omni input](https://github.com/QwenLM/Qwen2.5-Omni?tab=readme-ov-file#vllm-serve-usage)
```bash
curl http://localhost:8000/v1/chat/completions \
@@ -2383,6 +2404,25 @@ An example responce is listed below:
```json
{"id":"chatcmpl-xxx","object":"chat.completion","model":"Qwen2.5-Omni-7B","choices":[{"index":0,"message":{"role":"assistant","reasoning_content":null,"content":"The text in the image is \"TONGYI Qwen\". The sound in the audio is a cough.","tool_calls":[]},"logprobs":null,"finish_reason":"stop","stop_reason":null}],"usage":{"prompt_tokens":156,"total_tokens":180,"completion_tokens":24,"prompt_tokens_details":null},"prompt_logprobs":null,"kv_transfer_params":null}
```
+
+For video input, you can send a request like this:
+
+```bash
+curl -sS http://localhost:8000/v1/chat/completions -H "Content-Type: application/json" -d '{
+ "model": "Qwen3-Omni-30B-A3B-Instruct",
+ "temperature": 0,
+ "max_tokens": 1024,
+ "messages": [{
+ "role": "user",
+ "content": [
+ { "type": "text", "text": "Please describe the video comprehensively as much as possible." },
+ { "type": "video_url", "video_url": { "url": "https://raw.githubusercontent.com/EvolvingLMMs-Lab/sglang/dev/onevision_local/assets/jobs.mp4" } }
+ ]
+ }]
+ }'
+```
+
+
---
### 2.6 Data Parallelism (DP)
diff --git a/vllm/docker/Dockerfile b/vllm/docker/Dockerfile
index 2ba001c..2018d05 100644
--- a/vllm/docker/Dockerfile
+++ b/vllm/docker/Dockerfile
@@ -16,7 +16,6 @@ RUN wget -O- https://apt.repos.intel.com/intel-gpg-keys/GPG-PUB-KEY-INTEL-SW-PRO
RUN apt-get update -y && \
# apt-get install -y software-properties-common && \
# add-apt-repository ppa:deadsnakes/ppa && \
- apt-get update -y && \
apt-get install -y python3.12 python3.12-dev python3-pip && \
update-alternatives --install /usr/bin/python3 python3 /usr/bin/python3.12 1 && \
update-alternatives --install /usr/bin/python python /usr/bin/python3.12 1 && \
@@ -34,7 +33,6 @@ RUN apt-get update -y && \
vim \
linux-libc-dev && \
# Install Intel GPU runtime packages
- apt-get update -y && \
apt-get install -y intel-oneapi-dpcpp-ct=2025.1.0-452 && \
apt-get clean && rm -rf /var/lib/apt/lists/*
@@ -51,7 +49,7 @@ ENV VLLM_WORKER_MULTIPROC_METHOD=spawn
RUN python3 -m pip config set global.break-system-packages true
# Clone + patch vllm
-RUN git clone -b v0.10.0 https://github.com/vllm-project/vllm.git && \
+RUN git clone -b v0.10.2 https://github.com/vllm-project/vllm.git && \
cd vllm && \
git apply /tmp/vllm_for_multi_arc.patch && \
pip install --no-cache-dir -r requirements/xpu.txt && \
@@ -59,12 +57,12 @@ RUN git clone -b v0.10.0 https://github.com/vllm-project/vllm.git && \
python3 setup.py install
# Clone + patch miner-U
-RUN git clone https://github.com/opendatalab/MinerU.git && \
+RUN git clone -b release-2.6.2 https://github.com/opendatalab/MinerU.git && \
cd MinerU && \
- git checkout de41fa58590263e43b783fe224b6d07cae290a33 && \
- git apply /tmp/miner-u.patch && \
- pip install -e .[core] && \
- sed -i 's/select_device(self.args.device, verbose=verbose)/torch.device(self.args.device)/' /usr/local/lib/python3.12/dist-packages/ultralytics/engine/predictor.py
+ pip install -e .[core] --no-deps && \
+ pip install mineru_vl_utils==0.1.14 gradio gradio-client gradio-pdf && \
+ sed -i 's/kwargs.get("max_concurrency", 100)/kwargs.get("max_concurrency", 200)/' /llm/MinerU/mineru/backend/vlm/vlm_analyze.py && \
+ sed -i 's/kwargs.get("http_timeout", 600)/kwargs.get("http_timeout", 1200)/' /llm/MinerU/mineru/backend/vlm/vlm_analyze.py
# ======= Add oneCCL build =======
@@ -120,7 +118,9 @@ RUN pip install accelerate hf_transfer 'modelscope!=1.15.0'
# Pin transformers version to avoid conflict in vLLM
-RUN pip install "transformers<4.54.0"
+RUN pip install "transformers==4.57.0" && \
+ pip install librosa soundfile && \
+ pip install mineru[core]==2.5.4
# Set additional environment for production usage
diff --git a/vllm/patches/vllm_for_multi_arc.patch b/vllm/patches/vllm_for_multi_arc.patch
index e961e2a..c35c5b4 100644
--- a/vllm/patches/vllm_for_multi_arc.patch
+++ b/vllm/patches/vllm_for_multi_arc.patch
@@ -1,5 +1,5 @@
diff --git a/.buildkite/lm-eval-harness/run-lm-eval-gsm-vllm-baseline.sh b/.buildkite/lm-eval-harness/run-lm-eval-gsm-vllm-baseline.sh
-index b98d42aa7..b2a1ebef2 100644
+index 792f355c4..af2c24c4c 100644
--- a/.buildkite/lm-eval-harness/run-lm-eval-gsm-vllm-baseline.sh
+++ b/.buildkite/lm-eval-harness/run-lm-eval-gsm-vllm-baseline.sh
@@ -46,6 +46,6 @@ while getopts "m:b:l:f:t:" OPT; do
@@ -10,6 +10,107 @@ index b98d42aa7..b2a1ebef2 100644
+ --model_args "pretrained=$MODEL,tensor_parallel_size=$TP_SIZE,add_bos_token=true,distributed_executor_backend=mp,trust_remote_code=true,max_model_len=4096,enforce_eager=true,max_num_batched_tokens=4096" \
--tasks gsm8k --num_fewshot "$FEWSHOT" --limit "$LIMIT" \
--batch_size "$BATCH_SIZE"
+diff --git a/.buildkite/release-pipeline.yaml b/.buildkite/release-pipeline.yaml
+index 8c6ef7817..a1de41652 100644
+--- a/.buildkite/release-pipeline.yaml
++++ b/.buildkite/release-pipeline.yaml
+@@ -1,22 +1,24 @@
+ steps:
+ # aarch64 + CUDA builds. PyTorch 2.8 aarch64 + CUDA wheel is only available on CUDA 12.9
+ - label: "Build arm64 wheel - CUDA 12.9"
+- depends_on: ~
+ id: build-wheel-arm64-cuda-12-9
+ agents:
+ queue: arm64_cpu_queue_postmerge
+ commands:
+ # #NOTE: torch_cuda_arch_list is derived from upstream PyTorch build files here:
+ # https://github.com/pytorch/pytorch/blob/main/.ci/aarch64_linux/aarch64_ci_build.sh#L7
+- - "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --build-arg CUDA_VERSION=12.9.1 --build-arg VLLM_MAIN_CUDA_VERSION=12.9 --build-arg torch_cuda_arch_list='8.7 9.0 10.0+PTX 12.0' --tag vllm-ci:build-image --target build --progress plain -f docker/Dockerfile ."
++ - "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --build-arg CUDA_VERSION=12.9.1 --build-arg torch_cuda_arch_list='8.7 9.0 10.0+PTX 12.0' --tag vllm-ci:build-image --target build --progress plain -f docker/Dockerfile ."
+ - "mkdir artifacts"
+ - "docker run --rm -v $(pwd)/artifacts:/artifacts_host vllm-ci:build-image bash -c 'cp -r dist /artifacts_host && chmod -R a+rw /artifacts_host'"
+ - "bash .buildkite/scripts/upload-wheels.sh"
+ env:
+ DOCKER_BUILDKIT: "1"
+
++ - block: "Build CUDA 12.8 wheel"
++ key: block-build-cu128-wheel
++
+ - label: "Build wheel - CUDA 12.8"
+- depends_on: ~
++ depends_on: block-build-cu128-wheel
+ id: build-wheel-cuda-12-8
+ agents:
+ queue: cpu_queue_postmerge
+@@ -28,8 +30,12 @@ steps:
+ env:
+ DOCKER_BUILDKIT: "1"
+
+- - label: "Build wheel - CUDA 12.6"
++ - block: "Build CUDA 12.6 wheel"
++ key: block-build-cu126-wheel
+ depends_on: ~
++
++ - label: "Build wheel - CUDA 12.6"
++ depends_on: block-build-cu126-wheel
+ id: build-wheel-cuda-12-6
+ agents:
+ queue: cpu_queue_postmerge
+@@ -96,6 +102,8 @@ steps:
+ depends_on:
+ - create-multi-arch-manifest
+ - build-wheel-cuda-12-8
++ - build-wheel-cuda-12-6
++ - build-wheel-cuda-12-9
+ id: annotate-release-workflow
+ agents:
+ queue: cpu_queue_postmerge
+diff --git a/.buildkite/scripts/annotate-release.sh b/.buildkite/scripts/annotate-release.sh
+index fde48603a..94e0ac239 100755
+--- a/.buildkite/scripts/annotate-release.sh
++++ b/.buildkite/scripts/annotate-release.sh
+@@ -14,33 +14,18 @@ buildkite-agent annotate --style 'info' --context 'release-workflow' << EOF
+ To download the wheel:
+ \`\`\`
+ aws s3 cp s3://vllm-wheels/${RELEASE_VERSION}/vllm-${RELEASE_VERSION}-cp38-abi3-manylinux1_x86_64.whl .
+-aws s3 cp s3://vllm-wheels/${RELEASE_VERSION}/vllm-${RELEASE_VERSION}-cp38-abi3-manylinux2014_aarch64.whl .
+-
+ aws s3 cp s3://vllm-wheels/${RELEASE_VERSION}+cu126/vllm-${RELEASE_VERSION}+cu126-cp38-abi3-manylinux1_x86_64.whl .
+-aws s3 cp s3://vllm-wheels/${RELEASE_VERSION}+cu129/vllm-${RELEASE_VERSION}+cu129-cp38-abi3-manylinux1_x86_64.whl .
++aws s3 cp s3://vllm-wheels/${RELEASE_VERSION}+cu118/vllm-${RELEASE_VERSION}+cu118-cp38-abi3-manylinux1_x86_64.whl .
+ \`\`\`
+
+ To download and upload the image:
+
+ \`\`\`
+-docker pull public.ecr.aws/q9t5s3a7/vllm-release-repo:${BUILDKITE_COMMIT}-x86_64
+-docker pull public.ecr.aws/q9t5s3a7/vllm-release-repo:${BUILDKITE_COMMIT}-aarch64
+-
+-docker tag public.ecr.aws/q9t5s3a7/vllm-release-repo:${BUILDKITE_COMMIT}-x86_64 vllm/vllm-openai:x86_64
+-docker tag vllm/vllm-openai:x86_64 vllm/vllm-openai:latest-x86_64
+-docker tag vllm/vllm-openai:x86_64 vllm/vllm-openai:v${RELEASE_VERSION}-x86_64
+-docker push vllm/vllm-openai:latest-x86_64
+-docker push vllm/vllm-openai:v${RELEASE_VERSION}-x86_64
+-
+-docker tag public.ecr.aws/q9t5s3a7/vllm-release-repo:${BUILDKITE_COMMIT}-aarch64 vllm/vllm-openai:aarch64
+-docker tag vllm/vllm-openai:aarch64 vllm/vllm-openai:latest-aarch64
+-docker tag vllm/vllm-openai:aarch64 vllm/vllm-openai:v${RELEASE_VERSION}-aarch64
+-docker push vllm/vllm-openai:latest-aarch64
+-docker push vllm/vllm-openai:v${RELEASE_VERSION}-aarch64
+-
+-docker manifest create vllm/vllm-openai:latest vllm/vllm-openai:latest-x86_64 vllm/vllm-openai:latest-aarch64 --amend
+-docker manifest create vllm/vllm-openai:v${RELEASE_VERSION} vllm/vllm-openai:v${RELEASE_VERSION}-x86_64 vllm/vllm-openai:v${RELEASE_VERSION}-aarch64 --amend
+-docker manifest push vllm/vllm-openai:latest
+-docker manifest push vllm/vllm-openai:v${RELEASE_VERSION}
++docker pull public.ecr.aws/q9t5s3a7/vllm-release-repo:${BUILDKITE_COMMIT}
++docker tag public.ecr.aws/q9t5s3a7/vllm-release-repo:${BUILDKITE_COMMIT} vllm/vllm-openai
++docker tag vllm/vllm-openai vllm/vllm-openai:latest
++docker tag vllm/vllm-openai vllm/vllm-openai:v${RELEASE_VERSION}
++docker push vllm/vllm-openai:latest
++docker push vllm/vllm-openai:v${RELEASE_VERSION}
+ \`\`\`
+ EOF
+\ No newline at end of file
diff --git a/.github/workflows/ci.yaml b/.github/workflows/ci.yaml
new file mode 100644
index 000000000..aef250abe
@@ -388,78 +489,8 @@ index 000000000..eaa2f332a
+ else
+ echo "✅ All benchmarks passed"
+ fi
-diff --git a/.github/workflows/cleanup_pr_body.yml b/.github/workflows/cleanup_pr_body.yml
-deleted file mode 100644
-index d5c6b8d43..000000000
---- a/.github/workflows/cleanup_pr_body.yml
-+++ /dev/null
-@@ -1,31 +0,0 @@
--name: Cleanup PR Body
--
--on:
-- pull_request_target:
-- types: [opened, reopened, edited]
--
--permissions:
-- pull-requests: write
--
--jobs:
-- update-description:
-- runs-on: ubuntu-latest
--
-- steps:
-- - name: Checkout repository
-- uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2
--
-- - name: Set up Python
-- uses: actions/setup-python@42375524e23c412d93fb67b49958b491fce71c38 # v5.4.0
-- with:
-- python-version: '3.12'
--
-- - name: Install Python dependencies
-- run: |
-- python3 -m pip install --upgrade pip
-- python3 -m pip install regex
--
-- - name: Update PR description
-- env:
-- GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
-- run: bash .github/scripts/cleanup_pr_body.sh "${{ github.event.number }}"
-diff --git a/.github/workflows/reminder_comment.yml b/.github/workflows/reminder_comment.yml
-deleted file mode 100644
-index 16ae1aadb..000000000
---- a/.github/workflows/reminder_comment.yml
-+++ /dev/null
-@@ -1,27 +0,0 @@
--name: PR Reminder Comment Bot
--permissions:
-- pull-requests: write
--on:
-- pull_request_target:
-- types: [opened]
--jobs:
-- pr_reminder:
-- runs-on: ubuntu-latest
-- steps:
-- - name: Remind to run full CI on PR
-- uses: actions/github-script@60a0d83039c74a4aee543508d2ffcb1c3799cdea # v7.0.1
-- with:
-- script: |
-- github.rest.issues.createComment({
-- owner: context.repo.owner,
-- repo: context.repo.repo,
-- issue_number: context.issue.number,
-- body: '👋 Hi! Thank you for contributing to the vLLM project.\n\n' +
-- '💬 Join our developer Slack at https://slack.vllm.ai to discuss your PR in #pr-reviews, coordinate on features in #feat- channels, or join special interest groups in #sig- channels.\n\n' +
-- 'Just a reminder: PRs would not trigger full CI run by default. Instead, it would only run `fastcheck` CI which starts running only a small and essential subset of CI tests to quickly catch errors. You can run other CI tests on top of those by going to your `fastcheck` build on Buildkite UI (linked in the PR checks section) and unblock them. If you do not have permission to unblock, ping `simon-mo` or `khluu` to add you in our Buildkite org.\n\n' +
-- 'Once the PR is approved and ready to go, your PR reviewer(s) can run CI to test the changes comprehensively before merging.\n\n' +
-- 'To run CI, PR reviewers can either: Add `ready` label to the PR or enable auto-merge.\n\n' +
-- '🚀'
-- })
-- env:
-- GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
diff --git a/CMakeLists.txt b/CMakeLists.txt
-index 98ed682fe..5dd6e907c 100644
+index 3f1f9a781..fef10e2ca 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -95,6 +95,10 @@ if (NOT VLLM_TARGET_DEVICE STREQUAL "cuda" AND
@@ -474,7 +505,7 @@ index 98ed682fe..5dd6e907c 100644
return()
endif()
diff --git a/benchmarks/backend_request_func.py b/benchmarks/backend_request_func.py
-index c7229dbb8..72531f3fc 100644
+index ba7c733be..61a9eeb91 100644
--- a/benchmarks/backend_request_func.py
+++ b/benchmarks/backend_request_func.py
@@ -18,7 +18,7 @@ from transformers import AutoTokenizer, PreTrainedTokenizer, PreTrainedTokenizer
@@ -486,71 +517,8 @@ index c7229dbb8..72531f3fc 100644
@dataclass
-diff --git a/benchmarks/benchmark_serving.py b/benchmarks/benchmark_serving.py
-index c597fb106..5bad6645b 100644
---- a/benchmarks/benchmark_serving.py
-+++ b/benchmarks/benchmark_serving.py
-@@ -256,10 +256,11 @@ async def benchmark(
- raise ValueError(f"Unknown backend: {backend}")
-
- print("Starting initial single prompt test run...")
-+ # set test_output_len=10 to avoid long prompt test run
- test_prompt, test_prompt_len, test_output_len, test_mm_content = (
- input_requests[0].prompt,
- input_requests[0].prompt_len,
-- input_requests[0].expected_output_len,
-+ 10,
- input_requests[0].multi_modal_data,
- )
-
-diff --git a/benchmarks/benchmark_throughput.py b/benchmarks/benchmark_throughput.py
-index 14461121f..e9b9f0b77 100644
---- a/benchmarks/benchmark_throughput.py
-+++ b/benchmarks/benchmark_throughput.py
-@@ -44,6 +44,7 @@ def run_vllm(
- n: int,
- engine_args: EngineArgs,
- disable_detokenize: bool = False,
-+ do_profile: bool = False,
- ) -> tuple[float, Optional[list[RequestOutput]]]:
- from vllm import LLM, SamplingParams
-
-@@ -89,10 +90,14 @@ def run_vllm(
- outputs = None
- if not use_beam_search:
- start = time.perf_counter()
-+ if do_profile:
-+ llm.start_profile()
- outputs = llm.generate(
- prompts, sampling_params, lora_request=lora_requests, use_tqdm=True
- )
- end = time.perf_counter()
-+ if do_profile:
-+ llm.stop_profile()
- else:
- assert lora_requests is None, "BeamSearch API does not support LoRA"
- prompts = [request.prompt for request in requests]
-@@ -410,6 +415,7 @@ def main(args: argparse.Namespace):
- args.n,
- EngineArgs.from_cli_args(args),
- args.disable_detokenize,
-+ args.profile
- )
- elif args.backend == "hf":
- assert args.tensor_parallel_size == 1
-@@ -647,6 +653,10 @@ def create_argument_parser():
- parser.add_argument(
- "--num-prompts", type=int, default=1000, help="Number of prompts to process."
- )
-+ parser.add_argument("--profile",
-+ action='store_true',
-+ default=False,
-+ help="whether run with profiler.")
- parser.add_argument(
- "--hf-max-batch-size",
- type=int,
diff --git a/cmake/utils.cmake b/cmake/utils.cmake
-index 621179a70..9e1f4e9c7 100644
+index 9c0ed1d09..a21fb37f1 100644
--- a/cmake/utils.cmake
+++ b/cmake/utils.cmake
@@ -445,7 +445,7 @@ function (define_gpu_extension_target GPU_MOD_NAME)
@@ -562,7 +530,7 @@ index 621179a70..9e1f4e9c7 100644
# Add hipify preprocessing step when building with HIP/ROCm.
if (GPU_LANGUAGE STREQUAL "HIP")
-@@ -487,6 +487,11 @@ function (define_gpu_extension_target GPU_MOD_NAME)
+@@ -491,6 +491,11 @@ function (define_gpu_extension_target GPU_MOD_NAME)
target_link_libraries(${GPU_MOD_NAME} PRIVATE torch ${GPU_LIBRARIES})
@@ -7954,76 +7922,41 @@ index 000000000..23f5b805c
+
+#endif
\ No newline at end of file
+diff --git a/docker/Dockerfile b/docker/Dockerfile
+index d4761e84f..307e9658f 100644
+--- a/docker/Dockerfile
++++ b/docker/Dockerfile
+@@ -196,7 +196,6 @@ ARG SCCACHE_S3_NO_CREDENTIALS=0
+
+ # Flag to control whether to use pre-built vLLM wheels
+ ARG VLLM_USE_PRECOMPILED=""
+-ARG VLLM_MAIN_CUDA_VERSION=""
+
+ # if USE_SCCACHE is set, use sccache to speed up compilation
+ RUN --mount=type=cache,target=/root/.cache/uv \
+@@ -214,7 +213,6 @@ RUN --mount=type=cache,target=/root/.cache/uv \
+ && export SCCACHE_IDLE_TIMEOUT=0 \
+ && export CMAKE_BUILD_TYPE=Release \
+ && export VLLM_USE_PRECOMPILED="${VLLM_USE_PRECOMPILED}" \
+- && export VLLM_MAIN_CUDA_VERSION="${VLLM_MAIN_CUDA_VERSION}" \
+ && export VLLM_DOCKER_BUILD_CONTEXT=1 \
+ && sccache --show-stats \
+ && python3 setup.py bdist_wheel --dist-dir=dist --py-limited-api=cp38 \
diff --git a/docker/Dockerfile.xpu b/docker/Dockerfile.xpu
-index 7d5a589eb..25a9fd7cd 100644
+index ef4223525..ffa7c6ea7 100644
--- a/docker/Dockerfile.xpu
+++ b/docker/Dockerfile.xpu
-@@ -1,9 +1,10 @@
--# oneapi 2025.0.2 docker base image use rolling 2448 package. https://dgpu-docs.intel.com/releases/packages.html?release=Rolling+2448.13&os=Ubuntu+22.04, and we don't need install driver manually.
--FROM intel/deep-learning-essentials:2025.0.2-0-devel-ubuntu22.04 AS vllm-base
-+FROM intel/deep-learning-essentials:2025.1.3-0-devel-ubuntu24.04 AS vllm-base
-
--RUN rm /etc/apt/sources.list.d/intel-graphics.list
-+RUN wget -O- https://apt.repos.intel.com/intel-gpg-keys/GPG-PUB-KEY-INTEL-SW-PRODUCTS.PUB | gpg --dearmor | tee /usr/share/keyrings/oneapi-archive-keyring.gpg > /dev/null && \
-+ echo "deb [signed-by=/usr/share/keyrings/oneapi-archive-keyring.gpg] https://apt.repos.intel.com/oneapi all main" | tee /etc/apt/sources.list.d/oneAPI.list && \
-+ add-apt-repository -y ppa:kobuk-team/intel-graphics
-
--RUN apt-get update -y && \
-+RUN apt clean && apt-get update -y && \
- apt-get install -y --no-install-recommends --fix-missing \
- curl \
- ffmpeg \
-@@ -14,15 +15,29 @@ RUN apt-get update -y && \
- libgl1 \
- lsb-release \
- numactl \
-- python3 \
-- python3-dev \
-- python3-pip \
-- wget
-+ wget \
-+ vim \
-+ python3.12 \
-+ python3.12-dev \
-+ python3-pip
-+
-+RUN update-alternatives --install /usr/bin/python3 python3 /usr/bin/python3.12 1
-+RUN update-alternatives --install /usr/bin/python python /usr/bin/python3.12 1
-+
-+RUN apt install -y libze1=1.23.1-1~24.04~ppa1 libze-dev=1.23.1-1~24.04~ppa1 libze-intel-gpu1=25.27.34303.9-1~24.04~ppa1 intel-opencl-icd=25.27.34303.9-1~24.04~ppa1 libze-intel-gpu-raytracing=1.1.0-114~u24.04
-+
-+RUN wget https://github.com/uxlfoundation/oneCCL/releases/download/2021.15.4/intel-oneccl-2021.15.4.11_offline.sh
-+RUN bash intel-oneccl-2021.15.4.11_offline.sh -a --silent --eula accept && echo "source /opt/intel/oneapi/setvars.sh --force" >> /root/.bashrc
-+SHELL ["bash", "-c"]
-+CMD ["bash", "-c", "source /root/.bashrc && exec bash"]
-
- WORKDIR /workspace/vllm
- COPY requirements/xpu.txt /workspace/vllm/requirements/xpu.txt
- COPY requirements/common.txt /workspace/vllm/requirements/common.txt
-
-+# suppress the python externally managed environment error
-+RUN python3 -m pip config set global.break-system-packages true
-+
- RUN --mount=type=cache,target=/root/.cache/pip \
- pip install --no-cache-dir \
- -r requirements/xpu.txt
-@@ -47,10 +62,11 @@ FROM vllm-base AS vllm-openai
+@@ -62,7 +62,7 @@ FROM vllm-base AS vllm-openai
# install additional dependencies for openai api server
RUN --mount=type=cache,target=/root/.cache/pip \
- pip install accelerate hf_transfer pytest pytest_asyncio lm_eval[api] modelscope
+ pip install accelerate hf_transfer pytest pytest_asyncio lm_eval[api] 'modelscope!=1.15.0'
-+
-+RUN --mount=type=cache,target=/root/.cache/pip \
-+ pip uninstall oneccl oneccl-devel -y
--ENV VLLM_USAGE_SOURCE production-docker-image \
-- TRITON_XPU_PROFILE 1
- # install development dependencies (for testing)
- RUN python3 -m pip install -e tests/vllm_test_utils
- ENTRYPOINT ["python3", "-m", "vllm.entrypoints.openai.api_server"]
+ RUN --mount=type=cache,target=/root/.cache/pip \
+ pip uninstall oneccl oneccl-devel -y
diff --git a/docs/features/quantization/fp8.md b/docs/features/quantization/fp8.md
-index 0661933ac..469d88a05 100644
+index 834c03cbe..439e1e0d7 100644
--- a/docs/features/quantization/fp8.md
+++ b/docs/features/quantization/fp8.md
@@ -134,4 +134,4 @@ print(result[0].outputs[0].text)
@@ -8033,210 +7966,173 @@ index 0661933ac..469d88a05 100644
- Currently, we load the model at original precision before quantizing down to 8-bits, so you need enough memory to load the whole model.
+ Currently, by default we load the model at original precision before quantizing down to 8-bits, so you need enough memory to load the whole model. To avoid this, adding `VLLM_OFFLOAD_WEIGHTS_BEFORE_QUANT=1` can allow offloading weights to cpu before quantization and quantized weights will be kept in device.
diff --git a/docs/models/supported_models.md b/docs/models/supported_models.md
-index c8b6c6c86..404045306 100644
+index db3dd2c25..7d3577b14 100644
--- a/docs/models/supported_models.md
+++ b/docs/models/supported_models.md
-@@ -592,7 +592,8 @@ Specified using `--task generate`.
- | `GraniteSpeechForConditionalGeneration` | Granite Speech | T + A | `ibm-granite/granite-speech-3.3-8b` | ✅︎ | ✅︎ | ✅︎ |
- | `H2OVLChatModel` | H2OVL | T + IE+ | `h2oai/h2ovl-mississippi-800m`, `h2oai/h2ovl-mississippi-2b`, etc. | | ✅︎ | ✅︎ |
- | `Idefics3ForConditionalGeneration` | Idefics3 | T + I | `HuggingFaceM4/Idefics3-8B-Llama3`, etc. | ✅︎ | | ✅︎ |
--| `InternVLChatModel` | InternVL 3.0, InternVideo 2.5, InternVL 2.5, Mono-InternVL, InternVL 2.0 | T + IE+ + (VE+) | `OpenGVLab/InternVL3-9B`, `OpenGVLab/InternVideo2_5_Chat_8B`, `OpenGVLab/InternVL2_5-4B`, `OpenGVLab/Mono-InternVL-2B`, `OpenGVLab/InternVL2-4B`, etc. | ✅︎ | ✅︎ | ✅︎ |
-+| `InternS1ForConditionalGeneration` | Intern-S1 | T + IE+ + VE+ | `internlm/Intern-S1`, etc. | ✅︎ | ✅︎ | ✅︎ |
-+| `InternVLChatModel` | InternVL 3.5, InternVL 3.0, InternVideo 2.5, InternVL 2.5, Mono-InternVL, InternVL 2.0 | T + IE+ + (VE+) | `OpenGVLab/InternVL3_5-14B`, `OpenGVLab/InternVL3-9B`, `OpenGVLab/InternVideo2_5_Chat_8B`, `OpenGVLab/InternVL2_5-4B`, `OpenGVLab/Mono-InternVL-2B`, `OpenGVLab/InternVL2-4B`, etc. | ✅︎ | ✅︎ | ✅︎ |
- | `KeyeForConditionalGeneration` | Keye-VL-8B-Preview | T + IE+ + VE+ | `Kwai-Keye/Keye-VL-8B-Preview` | | | ✅︎ |
- | `KimiVLForConditionalGeneration` | Kimi-VL-A3B-Instruct, Kimi-VL-A3B-Thinking | T + I+ | `moonshotai/Kimi-VL-A3B-Instruct`, `moonshotai/Kimi-VL-A3B-Thinking` | | | ✅︎ |
- | `Llama4ForConditionalGeneration` | Llama 4 | T + I+ | `meta-llama/Llama-4-Scout-17B-16E-Instruct`, `meta-llama/Llama-4-Maverick-17B-128E-Instruct-FP8`, `meta-llama/Llama-4-Maverick-17B-128E-Instruct`, etc. | | ✅︎ | ✅︎ |
-@@ -602,7 +603,7 @@ Specified using `--task generate`.
- | `LlavaNextVideoForConditionalGeneration` | LLaVA-NeXT-Video | T + V | `llava-hf/LLaVA-NeXT-Video-7B-hf`, etc. | | ✅︎ | ✅︎ |
- | `LlavaOnevisionForConditionalGeneration` | LLaVA-Onevision | T + I+ + V+ | `llava-hf/llava-onevision-qwen2-7b-ov-hf`, `llava-hf/llava-onevision-qwen2-0.5b-ov-hf`, etc. | | ✅︎ | ✅︎ |
- | `MiniCPMO` | MiniCPM-O | T + IE+ + VE+ + AE+ | `openbmb/MiniCPM-o-2_6`, etc. | ✅︎ | ✅︎ | ✅︎ |
--| `MiniCPMV` | MiniCPM-V | T + IE+ + VE+ | `openbmb/MiniCPM-V-2` (see note), `openbmb/MiniCPM-Llama3-V-2_5`, `openbmb/MiniCPM-V-2_6`, etc. | ✅︎ | | ✅︎ |
-+| `MiniCPMV` | MiniCPM-V | T + IE+ + VE+ | `openbmb/MiniCPM-V-2` (see note), `openbmb/MiniCPM-Llama3-V-2_5`, `openbmb/MiniCPM-V-2_6`, `openbmb/MiniCPM-V-4`, `openbmb/MiniCPM-V-4_5`, etc. | ✅︎ | | ✅︎ |
- | `MiniMaxVL01ForConditionalGeneration` | MiniMax-VL | T + IE+ | `MiniMaxAI/MiniMax-VL-01`, etc. | | ✅︎ | ✅︎ |
- | `Mistral3ForConditionalGeneration` | Mistral3 (HF Transformers) | T + I+ | `mistralai/Mistral-Small-3.1-24B-Instruct-2503`, etc. | ✅︎ | ✅︎ | ✅︎ |
- | `MllamaForConditionalGeneration` | Llama 3.2 | T + I+ | `meta-llama/Llama-3.2-90B-Vision-Instruct`, `meta-llama/Llama-3.2-11B-Vision`, etc. | | | |
-@@ -646,6 +647,15 @@ Specified using `--task generate`.
+@@ -340,6 +340,7 @@ th {
+ | `DeepseekV2ForCausalLM` | DeepSeek-V2 | `deepseek-ai/DeepSeek-V2`, `deepseek-ai/DeepSeek-V2-Chat`, etc. | ✅︎ | ✅︎ | ✅︎ |
+ | `DeepseekV3ForCausalLM` | DeepSeek-V3 | `deepseek-ai/DeepSeek-V3`, `deepseek-ai/DeepSeek-R1`, `deepseek-ai/DeepSeek-V3.1`, etc. | ✅︎ | ✅︎ | ✅︎ |
+ | `Dots1ForCausalLM` | dots.llm1 | `rednote-hilab/dots.llm1.base`, `rednote-hilab/dots.llm1.inst`, etc. | | ✅︎ | ✅︎ |
++| `DotsOCRForCausalLM` | dots_ocr | `rednote-hilab/dots.ocr` | | ✅︎ | ✅︎ |
+ | `Ernie4_5ForCausalLM` | Ernie4.5 | `baidu/ERNIE-4.5-0.3B-PT`, etc. | ✅︎ | ✅︎ | ✅︎ |
+ | `Ernie4_5_MoeForCausalLM` | Ernie4.5MoE | `baidu/ERNIE-4.5-21B-A3B-PT`, `baidu/ERNIE-4.5-300B-A47B-PT`, etc. |✅︎| ✅︎ | ✅︎ |
+ | `ExaoneForCausalLM` | EXAONE-3 | `LGAI-EXAONE/EXAONE-3.0-7.8B-Instruct`, etc. | ✅︎ | ✅︎ | ✅︎ |
+@@ -667,6 +668,9 @@ These models primarily accept the [`LLM.generate`](./generative_models.md#llmgen
+ | `Qwen2VLForConditionalGeneration` | QVQ, Qwen2-VL | T + IE+ + VE+ | `Qwen/QVQ-72B-Preview`, `Qwen/Qwen2-VL-7B-Instruct`, `Qwen/Qwen2-VL-72B-Instruct`, etc. | ✅︎ | ✅︎ | ✅︎ |
+ | `Qwen2_5_VLForConditionalGeneration` | Qwen2.5-VL | T + IE+ + VE+ | `Qwen/Qwen2.5-VL-3B-Instruct`, `Qwen/Qwen2.5-VL-72B-Instruct`, etc. | ✅︎ | ✅︎ | ✅︎ |
+ | `Qwen2_5OmniThinkerForConditionalGeneration` | Qwen2.5-Omni | T + IE+ + VE+ + A+ | `Qwen/Qwen2.5-Omni-3B`, `Qwen/Qwen2.5-Omni-7B` | ✅︎ | ✅︎ | ✅︎ |
++| `Qwen3VLForConditionalGeneration` | Qwen3-VL | T + IE+ + VE+ | `Qwen/Qwen3-VL-4B-Instruct`, etc. | ✅︎ | ✅︎ | ✅︎ |
++| `Qwen3VLMoeForConditionalGeneration` | Qwen3-VL-MOE | T + IE+ + VE+ | `Qwen/Qwen3-VL-30B-A3B-Instruct`, etc. | ✅︎ | ✅︎ | ✅︎ |
++| `Qwen3OmniMoeThinkerForConditionalGeneration` | Qwen3-Omni | T + IE+ + VE+ + A+ | `Qwen/Qwen3-Omni-30B-A3B-Instruct`, `Qwen/Qwen3-Omni-30B-A3B-Thinking` | ✅︎ | ✅︎ | ✅︎ |
+ | `RForConditionalGeneration` | R-VL-4B | T + IE+ | `YannQi/R-4B` | | ✅︎ | ✅︎ |
+ | `SkyworkR1VChatModel` | Skywork-R1V-38B | T + I | `Skywork/Skywork-R1V-38B` | | ✅︎ | ✅︎ |
+ | `SmolVLMForConditionalGeneration` | SmolVLM2 | T + I | `SmolVLM2-2.2B-Instruct` | ✅︎ | | ✅︎ |
+@@ -757,8 +761,7 @@ Some models are supported only via the [Transformers backend](#transformers). Th
+ Our PaliGemma implementations have the same problem as Gemma 3 (see above) for both V0 and V1.
- This limitation exists because the model's mixed attention pattern (bidirectional for images, causal otherwise) is not yet supported by vLLM's attention backends.
-
-+!!! note
-+ `Gemma3nForConditionalGeneration` is only supported on V1 due to shared KV caching and it depends on `timm>=1.0.17` to make use of its
-+ MobileNet-v5 vision backbone.
-+
-+ Performance is not yet fully optimized mainly due to:
-+
-+ - Both audio and vision MM encoders use `transformers.AutoModel` implementation.
-+ - There's no PLE caching or out-of-memory swapping support, as described in [Google's blog](https://developers.googleblog.com/en/introducing-gemma-3n/). These features might be too model-specific for vLLM, and swapping in particular may be better suited for constrained setups.
-+
!!! note
- Only `InternVLChatModel` with Qwen2.5 text backbone (`OpenGVLab/InternVL3-2B`, `OpenGVLab/InternVL2.5-1B` etc) has video inputs support currently.
+- For Qwen2.5-Omni, reading audio from video pre-processing (`--mm-processor-kwargs '{"use_audio_in_video": true}'`)
+- is currently supported on V0 (but not V1), because overlapping modalities is not yet supported in V1.
++ For Qwen2.5-Omni and Qwen3-Omni, reading audio from video pre-processing (`--mm-processor-kwargs '{"use_audio_in_video": true}'`) is currently work in progress and not yet supported.
-diff --git a/examples/offline_inference/basic/reward.py b/examples/offline_inference/basic/reward.py
+ #### Transcription
+
+diff --git a/examples/bmg/reasoning.py b/examples/bmg/reasoning.py
new file mode 100644
-index 000000000..aec3481d2
+index 000000000..04f91786e
--- /dev/null
-+++ b/examples/offline_inference/basic/reward.py
-@@ -0,0 +1,55 @@
-+# SPDX-License-Identifier: Apache-2.0
-+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
++++ b/examples/bmg/reasoning.py
+@@ -0,0 +1,27 @@
++from openai import OpenAI
+
-+from argparse import Namespace
++# Modify OpenAI's API key and API base to use vLLM's API server.
++openai_api_key = "EMPTY"
++openai_api_base = "http://0.0.0.0:8000/v1"
+
-+from vllm import LLM, EngineArgs
-+from vllm.utils import FlexibleArgumentParser
++client = OpenAI(
++ api_key=openai_api_key,
++ base_url=openai_api_base,
++)
+
++models = client.models.list()
++model = models.data[0].id
+
-+def parse_args():
-+ parser = FlexibleArgumentParser()
-+ parser = EngineArgs.add_cli_args(parser)
-+ # Set example specific arguments
-+ parser.set_defaults(
-+ model="internlm/internlm2-1_8b-reward",
-+ #runner="pooling",
-+ task="reward",
-+ enforce_eager=True,
-+ max_model_len=1024,
-+ trust_remote_code=True,
-+ )
-+ return parser.parse_args()
++# Round 1
++messages = [{"role": "user", "content": "9.11 and 9.8, which is greater?"}]
++# For granite, add: `extra_body={"chat_template_kwargs": {"thinking": True}}`
++# For Qwen3 series, if you want to disable thinking in reasoning mode, add:
++# extra_body={"chat_template_kwargs": {"enable_thinking": False}}
++response = client.chat.completions.create(model=model, messages=messages)
+
++reasoning_content = response.choices[0].message.reasoning_content
++content = response.choices[0].message.content
+
-+def main(args: Namespace):
-+ # Sample prompts.
-+ prompts = [
-+ "Hello, my name is",
-+ "The president of the United States is",
-+ "The capital of France is",
-+ "The future of AI is",
-+ ]
++print("reasoning_content:", reasoning_content)
++print("content:", content)
+
-+ # Create an LLM.
-+ # You should pass runner="pooling" for reward models
-+ llm = LLM(**vars(args))
+diff --git a/examples/bmg/tooling.py b/examples/bmg/tooling.py
+new file mode 100644
+index 000000000..bf8375831
+--- /dev/null
++++ b/examples/bmg/tooling.py
+@@ -0,0 +1,38 @@
++import json
++from openai import OpenAI
+
-+ # Generate rewards. The output is a list of PoolingRequestOutput.
-+ outputs = llm.reward(prompts)
++client = OpenAI(base_url="http://0.0.0.0:8000/v1", api_key="dummy")
++
++def get_weather(location: str, unit: str):
++ return f"Getting the weather for {location} in {unit}..."
++tool_functions = {"get_weather": get_weather}
++
++tools = [{
++ "type": "function",
++ "function": {
++ "name": "get_weather",
++ "description": "Get the current weather in a given location",
++ "parameters": {
++ "type": "object",
++ "properties": {
++ "location": {"type": "string", "description": "City and state, e.g., 'San Francisco, CA'"},
++ "unit": {"type": "string", "enum": ["celsius", "fahrenheit"]}
++ },
++ "required": ["location", "unit"]
++ }
++ }
++}]
++
++response = client.chat.completions.create(
++ model=client.models.list().data[0].id,
++ messages=[{"role": "user", "content": "What's the weather like in San Francisco?"}],
++ tools=tools,
++ temperature=0,
++ tool_choice="auto"
++)
+
-+ # Print the outputs.
-+ print("\nGenerated Outputs:\n" + "-" * 60)
-+ for prompt, output in zip(prompts, outputs):
-+ rewards = output.outputs.data
-+ rewards_trimmed = (
-+ (str(rewards[:16])[:-1] + ", ...]") if len(rewards) > 16 else rewards
-+ )
-+ print(f"Prompt: {prompt!r} \nReward: {rewards_trimmed} (size={len(rewards)})")
-+ print("-" * 60)
-+
-+
-+if __name__ == "__main__":
-+ args = parse_args()
-+ main(args)
-+
-diff --git a/examples/offline_inference/multilora_inference.py b/examples/offline_inference/multilora_inference.py
-index f0c00bcaa..c8fa36295 100644
---- a/examples/offline_inference/multilora_inference.py
-+++ b/examples/offline_inference/multilora_inference.py
-@@ -30,7 +30,7 @@ def create_test_prompts(
- (
- "A robot may not injure a human being",
- SamplingParams(
-- temperature=0.0, logprobs=1, prompt_logprobs=1, max_tokens=128
-+ temperature=0.0, logprobs=1, max_tokens=128
- ),
- None,
- ),
-@@ -46,7 +46,7 @@ def create_test_prompts(
- SamplingParams(
- temperature=0.0,
- logprobs=1,
-- prompt_logprobs=1,
-+ #prompt_logprobs=1,
- max_tokens=128,
- stop_token_ids=[32003],
- ),
-@@ -57,7 +57,7 @@ def create_test_prompts(
- SamplingParams(
- temperature=0.0,
- logprobs=1,
-- prompt_logprobs=1,
-+ #prompt_logprobs=1,
- max_tokens=128,
- stop_token_ids=[32003],
- ),
-@@ -99,14 +99,14 @@ def initialize_engine() -> LLMEngine:
- # numbers will cause higher memory usage. If you know that all LoRAs will
- # use the same rank, it is recommended to set this as low as possible.
- # max_cpu_loras: controls the size of the CPU LoRA cache.
-- engine_args = EngineArgs(
-- model="meta-llama/Llama-2-7b-hf",
-- enable_lora=True,
-- max_loras=1,
-- max_lora_rank=8,
-- max_cpu_loras=2,
-- max_num_seqs=256,
-- )
-+ engine_args = EngineArgs(model="meta-llama/Llama-2-7b-hf",
-+ enable_lora=True,
-+ max_loras=1,
-+ max_lora_rank=8,
-+ max_cpu_loras=2,
-+ max_num_seqs=256,
-+ enforce_eager=True,
-+ block_size=64)
- return LLMEngine.from_engine_args(engine_args)
-
-
-diff --git a/examples/offline_inference/prithvi_geospatial_mae.py b/examples/offline_inference/prithvi_geospatial_mae.py
-index 4fdc7a3cf..b6007b9f4 100644
---- a/examples/offline_inference/prithvi_geospatial_mae.py
-+++ b/examples/offline_inference/prithvi_geospatial_mae.py
-@@ -3,12 +3,12 @@
- import argparse
- import datetime
- import os
--import re
- from typing import Union
-
- import albumentations
- import numpy as np
- import rasterio
-+import regex as re
- import torch
- from einops import rearrange
- from terratorch.datamodules import Sen1Floods11NonGeoDataModule
++tool_call = response.choices[0].message.tool_calls[0].function
++print(f"Function called: {tool_call.name}")
++print(f"Arguments: {tool_call.arguments}")
++print(f"Result: {tool_functions[tool_call.name](**json.loads(tool_call.arguments))}") 30,22 Bot
++
+diff --git a/examples/offline_inference/data_parallel.py b/examples/offline_inference/data_parallel.py
+index 36d805a32..2a4233b6a 100644
+--- a/examples/offline_inference/data_parallel.py
++++ b/examples/offline_inference/data_parallel.py
+@@ -96,6 +96,13 @@ def parse_args():
+ "--quantization",
+ type=str,
+ )
++ parser.add_argument(
++ "--disable-expert-parallel",
++ dest="enable_expert_parallel",
++ action="store_false",
++ help="Disable expert parallel (default: enabled).",
++ )
++ parser.set_defaults(enable_expert_parallel=True)
+ return parser.parse_args()
+
+
+@@ -108,6 +115,7 @@ def main(
+ dp_master_port,
+ GPUs_per_dp_rank,
+ enforce_eager,
++ enable_expert_parallel,
+ trust_remote_code,
+ max_num_seqs,
+ max_model_len,
+@@ -162,7 +170,7 @@ def main(
+ model=model,
+ tensor_parallel_size=GPUs_per_dp_rank,
+ enforce_eager=enforce_eager,
+- enable_expert_parallel=True,
++ enable_expert_parallel=enable_expert_parallel,
+ trust_remote_code=trust_remote_code,
+ max_num_seqs=max_num_seqs,
+ max_model_len=max_model_len,
+@@ -222,6 +230,7 @@ if __name__ == "__main__":
+ dp_master_port,
+ tp_size,
+ args.enforce_eager,
++ args.enable_expert_parallel,
+ args.trust_remote_code,
+ args.max_num_seqs,
+ args.max_model_len,
diff --git a/examples/offline_inference/vision_language.py b/examples/offline_inference/vision_language.py
-index e4811c023..fe4393bcf 100644
+index b104113b8..58fb423e8 100644
--- a/examples/offline_inference/vision_language.py
+++ b/examples/offline_inference/vision_language.py
-@@ -389,6 +389,39 @@ def run_tarsier(questions: list[str], modality: str) -> ModelRequestData:
+@@ -126,6 +126,23 @@ def run_chameleon(questions: list[str], modality: str) -> ModelRequestData:
)
-+# Intern-S1
-+def run_interns1(questions: list[str], modality: str) -> ModelRequestData:
-+ model_name = "internlm/Intern-S1"
++# Dots-OCR
++def run_dots_ocr(questions: list[str], modality: str) -> ModelRequestData:
++ assert modality == "image"
+
++ prompts = [f"<|img|><|imgpad|><|endofimg|>{question}" for question in questions]
+ engine_args = EngineArgs(
-+ model=model_name,
-+ trust_remote_code=True,
-+ max_model_len=8192,
-+ max_num_seqs=2,
++ model="rednote-hilab/dots.ocr",
+ limit_mm_per_prompt={modality: 1},
-+ enforce_eager=True,
-+ )
-+
-+ if modality == "image":
-+ placeholder = ""
-+ elif modality == "video":
-+ placeholder = "