diff --git a/vllm/Miner-U/README.md b/vllm/Miner-U/README.md
index 79e2c86..d79bb03 100644
--- a/vllm/Miner-U/README.md
+++ b/vllm/Miner-U/README.md
@@ -53,3 +53,5 @@ mineru-gradio --server-name 0.0.0.0 --server-port 7860
 ```
 Refer to [here](https://opendatalab.github.io/MinerU/zh/usage/quick_usage/#_2) for more details.
+
+### Refer to [here](https://github.com/intel/llm-scaler/tree/main/vllm#243-mineru-26-support) for the new mineru-vllm 2.6.1, which brings performance improvements.
diff --git a/vllm/README.md b/vllm/README.md
index f81ab0c..36e81d2 100644
--- a/vllm/README.md
+++ b/vllm/README.md
@@ -2278,16 +2278,9 @@ TORCH_LLM_ALLREDUCE=1 VLLM_USE_V1=1 CCL_ZE_IPC_EXCHANGE=pidfd VLLM_ALLOW_LONG_M
 
 ---
 
-### 2.4.3 MinerU 2.5 Support
+### 2.4.3 MinerU 2.6 Support
 
-This guide shows how to launch the MinerU 2.5 model using the vLLM inference backend.
-
-#### Install MinerU Core
-
-First, install the core MinerU package:
-```bash
-pip install mineru[core]
-```
+This guide shows how to launch the MinerU 2.6 model using the vLLM inference backend.
 
 #### Start the MinerU Service
 
@@ -2307,7 +2300,10 @@ python3 -m vllm.entrypoints.openai.api_server \
   --trust-remote-code \
   --gpu-memory-util 0.85 \
   --no-enable-prefix-caching \
+  --max-num-batched-tokens=32768 \
+  --max-model-len=32768 \
   --block-size 64 \
+  --max-num-seqs 256 \
   --served-model-name MinerU \
   --tensor-parallel-size 1 \
   --pipeline-parallel-size 1 \
@@ -2328,6 +2324,31 @@ To verify mineru
 mineru -p /llm/MinerU/demo/pdfs/small_ocr.pdf -o ./ -b vlm-http-client -u http://127.0.0.1:8000
 ```
 
+2. Using Gradio
+
+```bash
+mineru-gradio --server-name 0.0.0.0 --server-port 8002
+```
+
+```python
+from gradio_client import Client, handle_file
+
+client = Client("http://localhost:8002/")
+result = client.predict(
+    file_path=handle_file('/llm/MinerU/demo/pdfs/small_ocr.pdf'),
+    end_pages=500,
+    is_ocr=False,
+    formula_enable=True,
+    table_enable=True,
+    language="ch",
+    backend="vlm-http-client",
+    url="http://localhost:8000",
+    api_name="/to_markdown"
+)
+print(result)
+```
+For more details, refer to Gradio's [API guide](http://your_ip:8002/?view=api).
+
 ---
 
 ### 2.5 Omni Model Support
@@ -2362,7 +2383,7 @@ python3 -m vllm.entrypoints.openai.api_server \
 
 After starting the vLLM service, you can follow this link to use it
 
-#### [Qwen2.5-Omni input](https://github.com/QwenLM/Qwen2.5-Omni?tab=readme-ov-file#vllm-serve-usage)
+#### [Qwen-Omni input](https://github.com/QwenLM/Qwen2.5-Omni?tab=readme-ov-file#vllm-serve-usage)
 
 ```bash
 curl http://localhost:8000/v1/chat/completions \
@@ -2383,6 +2404,25 @@ An example responce is listed below:
 
 ```json
 {"id":"chatcmpl-xxx","object":"chat.completion","model":"Qwen2.5-Omni-7B","choices":[{"index":0,"message":{"role":"assistant","reasoning_content":null,"content":"The text in the image is \"TONGYI Qwen\". The sound in the audio is a cough.","tool_calls":[]},"logprobs":null,"finish_reason":"stop","stop_reason":null}],"usage":{"prompt_tokens":156,"total_tokens":180,"completion_tokens":24,"prompt_tokens_details":null},"prompt_logprobs":null,"kv_transfer_params":null}
 ```
+
+For video input, you can send a request like this:
+
+```bash
+curl -sS http://localhost:8000/v1/chat/completions -H "Content-Type: application/json" -d '{
+    "model": "Qwen3-Omni-30B-A3B-Instruct",
+    "temperature": 0,
+    "max_tokens": 1024,
+    "messages": [{
+      "role": "user",
+      "content": [
+        { "type": "text", "text": "Please describe the video as comprehensively as possible." 
}, + { "type": "video_url", "video_url": { "url": "https://raw.githubusercontent.com/EvolvingLMMs-Lab/sglang/dev/onevision_local/assets/jobs.mp4" } } + ] + }] + }' +``` + + --- ### 2.6 Data Parallelism (DP) diff --git a/vllm/docker/Dockerfile b/vllm/docker/Dockerfile index 2ba001c..2018d05 100644 --- a/vllm/docker/Dockerfile +++ b/vllm/docker/Dockerfile @@ -16,7 +16,6 @@ RUN wget -O- https://apt.repos.intel.com/intel-gpg-keys/GPG-PUB-KEY-INTEL-SW-PRO RUN apt-get update -y && \ # apt-get install -y software-properties-common && \ # add-apt-repository ppa:deadsnakes/ppa && \ - apt-get update -y && \ apt-get install -y python3.12 python3.12-dev python3-pip && \ update-alternatives --install /usr/bin/python3 python3 /usr/bin/python3.12 1 && \ update-alternatives --install /usr/bin/python python /usr/bin/python3.12 1 && \ @@ -34,7 +33,6 @@ RUN apt-get update -y && \ vim \ linux-libc-dev && \ # Install Intel GPU runtime packages - apt-get update -y && \ apt-get install -y intel-oneapi-dpcpp-ct=2025.1.0-452 && \ apt-get clean && rm -rf /var/lib/apt/lists/* @@ -51,7 +49,7 @@ ENV VLLM_WORKER_MULTIPROC_METHOD=spawn RUN python3 -m pip config set global.break-system-packages true # Clone + patch vllm -RUN git clone -b v0.10.0 https://github.com/vllm-project/vllm.git && \ +RUN git clone -b v0.10.2 https://github.com/vllm-project/vllm.git && \ cd vllm && \ git apply /tmp/vllm_for_multi_arc.patch && \ pip install --no-cache-dir -r requirements/xpu.txt && \ @@ -59,12 +57,12 @@ RUN git clone -b v0.10.0 https://github.com/vllm-project/vllm.git && \ python3 setup.py install # Clone + patch miner-U -RUN git clone https://github.com/opendatalab/MinerU.git && \ +RUN git clone -b release-2.6.2 https://github.com/opendatalab/MinerU.git && \ cd MinerU && \ - git checkout de41fa58590263e43b783fe224b6d07cae290a33 && \ - git apply /tmp/miner-u.patch && \ - pip install -e .[core] && \ - sed -i 's/select_device(self.args.device, verbose=verbose)/torch.device(self.args.device)/' /usr/local/lib/python3.12/dist-packages/ultralytics/engine/predictor.py + pip install -e .[core] --no-deps && \ + pip install mineru_vl_utils==0.1.14 gradio gradio-client gradio-pdf && \ + sed -i 's/kwargs.get("max_concurrency", 100)/kwargs.get("max_concurrency", 200)/' /llm/MinerU/mineru/backend/vlm/vlm_analyze.py && \ + sed -i 's/kwargs.get("http_timeout", 600)/kwargs.get("http_timeout", 1200)/' /llm/MinerU/mineru/backend/vlm/vlm_analyze.py # ======= Add oneCCL build ======= @@ -120,7 +118,9 @@ RUN pip install accelerate hf_transfer 'modelscope!=1.15.0' # Pin transformers version to avoid conflict in vLLM -RUN pip install "transformers<4.54.0" +RUN pip install "transformers==4.57.0" && \ + pip install librosa soundfile && \ + pip install mineru[core]==2.5.4 # Set additional environment for production usage diff --git a/vllm/patches/vllm_for_multi_arc.patch b/vllm/patches/vllm_for_multi_arc.patch index e961e2a..c35c5b4 100644 --- a/vllm/patches/vllm_for_multi_arc.patch +++ b/vllm/patches/vllm_for_multi_arc.patch @@ -1,5 +1,5 @@ diff --git a/.buildkite/lm-eval-harness/run-lm-eval-gsm-vllm-baseline.sh b/.buildkite/lm-eval-harness/run-lm-eval-gsm-vllm-baseline.sh -index b98d42aa7..b2a1ebef2 100644 +index 792f355c4..af2c24c4c 100644 --- a/.buildkite/lm-eval-harness/run-lm-eval-gsm-vllm-baseline.sh +++ b/.buildkite/lm-eval-harness/run-lm-eval-gsm-vllm-baseline.sh @@ -46,6 +46,6 @@ while getopts "m:b:l:f:t:" OPT; do @@ -10,6 +10,107 @@ index b98d42aa7..b2a1ebef2 100644 + --model_args 
"pretrained=$MODEL,tensor_parallel_size=$TP_SIZE,add_bos_token=true,distributed_executor_backend=mp,trust_remote_code=true,max_model_len=4096,enforce_eager=true,max_num_batched_tokens=4096" \ --tasks gsm8k --num_fewshot "$FEWSHOT" --limit "$LIMIT" \ --batch_size "$BATCH_SIZE" +diff --git a/.buildkite/release-pipeline.yaml b/.buildkite/release-pipeline.yaml +index 8c6ef7817..a1de41652 100644 +--- a/.buildkite/release-pipeline.yaml ++++ b/.buildkite/release-pipeline.yaml +@@ -1,22 +1,24 @@ + steps: + # aarch64 + CUDA builds. PyTorch 2.8 aarch64 + CUDA wheel is only available on CUDA 12.9 + - label: "Build arm64 wheel - CUDA 12.9" +- depends_on: ~ + id: build-wheel-arm64-cuda-12-9 + agents: + queue: arm64_cpu_queue_postmerge + commands: + # #NOTE: torch_cuda_arch_list is derived from upstream PyTorch build files here: + # https://github.com/pytorch/pytorch/blob/main/.ci/aarch64_linux/aarch64_ci_build.sh#L7 +- - "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --build-arg CUDA_VERSION=12.9.1 --build-arg VLLM_MAIN_CUDA_VERSION=12.9 --build-arg torch_cuda_arch_list='8.7 9.0 10.0+PTX 12.0' --tag vllm-ci:build-image --target build --progress plain -f docker/Dockerfile ." ++ - "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --build-arg CUDA_VERSION=12.9.1 --build-arg torch_cuda_arch_list='8.7 9.0 10.0+PTX 12.0' --tag vllm-ci:build-image --target build --progress plain -f docker/Dockerfile ." + - "mkdir artifacts" + - "docker run --rm -v $(pwd)/artifacts:/artifacts_host vllm-ci:build-image bash -c 'cp -r dist /artifacts_host && chmod -R a+rw /artifacts_host'" + - "bash .buildkite/scripts/upload-wheels.sh" + env: + DOCKER_BUILDKIT: "1" + ++ - block: "Build CUDA 12.8 wheel" ++ key: block-build-cu128-wheel ++ + - label: "Build wheel - CUDA 12.8" +- depends_on: ~ ++ depends_on: block-build-cu128-wheel + id: build-wheel-cuda-12-8 + agents: + queue: cpu_queue_postmerge +@@ -28,8 +30,12 @@ steps: + env: + DOCKER_BUILDKIT: "1" + +- - label: "Build wheel - CUDA 12.6" ++ - block: "Build CUDA 12.6 wheel" ++ key: block-build-cu126-wheel + depends_on: ~ ++ ++ - label: "Build wheel - CUDA 12.6" ++ depends_on: block-build-cu126-wheel + id: build-wheel-cuda-12-6 + agents: + queue: cpu_queue_postmerge +@@ -96,6 +102,8 @@ steps: + depends_on: + - create-multi-arch-manifest + - build-wheel-cuda-12-8 ++ - build-wheel-cuda-12-6 ++ - build-wheel-cuda-12-9 + id: annotate-release-workflow + agents: + queue: cpu_queue_postmerge +diff --git a/.buildkite/scripts/annotate-release.sh b/.buildkite/scripts/annotate-release.sh +index fde48603a..94e0ac239 100755 +--- a/.buildkite/scripts/annotate-release.sh ++++ b/.buildkite/scripts/annotate-release.sh +@@ -14,33 +14,18 @@ buildkite-agent annotate --style 'info' --context 'release-workflow' << EOF + To download the wheel: + \`\`\` + aws s3 cp s3://vllm-wheels/${RELEASE_VERSION}/vllm-${RELEASE_VERSION}-cp38-abi3-manylinux1_x86_64.whl . +-aws s3 cp s3://vllm-wheels/${RELEASE_VERSION}/vllm-${RELEASE_VERSION}-cp38-abi3-manylinux2014_aarch64.whl . +- + aws s3 cp s3://vllm-wheels/${RELEASE_VERSION}+cu126/vllm-${RELEASE_VERSION}+cu126-cp38-abi3-manylinux1_x86_64.whl . +-aws s3 cp s3://vllm-wheels/${RELEASE_VERSION}+cu129/vllm-${RELEASE_VERSION}+cu129-cp38-abi3-manylinux1_x86_64.whl . ++aws s3 cp s3://vllm-wheels/${RELEASE_VERSION}+cu118/vllm-${RELEASE_VERSION}+cu118-cp38-abi3-manylinux1_x86_64.whl . 
+ \`\`\` + + To download and upload the image: + + \`\`\` +-docker pull public.ecr.aws/q9t5s3a7/vllm-release-repo:${BUILDKITE_COMMIT}-x86_64 +-docker pull public.ecr.aws/q9t5s3a7/vllm-release-repo:${BUILDKITE_COMMIT}-aarch64 +- +-docker tag public.ecr.aws/q9t5s3a7/vllm-release-repo:${BUILDKITE_COMMIT}-x86_64 vllm/vllm-openai:x86_64 +-docker tag vllm/vllm-openai:x86_64 vllm/vllm-openai:latest-x86_64 +-docker tag vllm/vllm-openai:x86_64 vllm/vllm-openai:v${RELEASE_VERSION}-x86_64 +-docker push vllm/vllm-openai:latest-x86_64 +-docker push vllm/vllm-openai:v${RELEASE_VERSION}-x86_64 +- +-docker tag public.ecr.aws/q9t5s3a7/vllm-release-repo:${BUILDKITE_COMMIT}-aarch64 vllm/vllm-openai:aarch64 +-docker tag vllm/vllm-openai:aarch64 vllm/vllm-openai:latest-aarch64 +-docker tag vllm/vllm-openai:aarch64 vllm/vllm-openai:v${RELEASE_VERSION}-aarch64 +-docker push vllm/vllm-openai:latest-aarch64 +-docker push vllm/vllm-openai:v${RELEASE_VERSION}-aarch64 +- +-docker manifest create vllm/vllm-openai:latest vllm/vllm-openai:latest-x86_64 vllm/vllm-openai:latest-aarch64 --amend +-docker manifest create vllm/vllm-openai:v${RELEASE_VERSION} vllm/vllm-openai:v${RELEASE_VERSION}-x86_64 vllm/vllm-openai:v${RELEASE_VERSION}-aarch64 --amend +-docker manifest push vllm/vllm-openai:latest +-docker manifest push vllm/vllm-openai:v${RELEASE_VERSION} ++docker pull public.ecr.aws/q9t5s3a7/vllm-release-repo:${BUILDKITE_COMMIT} ++docker tag public.ecr.aws/q9t5s3a7/vllm-release-repo:${BUILDKITE_COMMIT} vllm/vllm-openai ++docker tag vllm/vllm-openai vllm/vllm-openai:latest ++docker tag vllm/vllm-openai vllm/vllm-openai:v${RELEASE_VERSION} ++docker push vllm/vllm-openai:latest ++docker push vllm/vllm-openai:v${RELEASE_VERSION} + \`\`\` + EOF +\ No newline at end of file diff --git a/.github/workflows/ci.yaml b/.github/workflows/ci.yaml new file mode 100644 index 000000000..aef250abe @@ -388,78 +489,8 @@ index 000000000..eaa2f332a + else + echo "✅ All benchmarks passed" + fi -diff --git a/.github/workflows/cleanup_pr_body.yml b/.github/workflows/cleanup_pr_body.yml -deleted file mode 100644 -index d5c6b8d43..000000000 ---- a/.github/workflows/cleanup_pr_body.yml -+++ /dev/null -@@ -1,31 +0,0 @@ --name: Cleanup PR Body -- --on: -- pull_request_target: -- types: [opened, reopened, edited] -- --permissions: -- pull-requests: write -- --jobs: -- update-description: -- runs-on: ubuntu-latest -- -- steps: -- - name: Checkout repository -- uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2 -- -- - name: Set up Python -- uses: actions/setup-python@42375524e23c412d93fb67b49958b491fce71c38 # v5.4.0 -- with: -- python-version: '3.12' -- -- - name: Install Python dependencies -- run: | -- python3 -m pip install --upgrade pip -- python3 -m pip install regex -- -- - name: Update PR description -- env: -- GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} -- run: bash .github/scripts/cleanup_pr_body.sh "${{ github.event.number }}" -diff --git a/.github/workflows/reminder_comment.yml b/.github/workflows/reminder_comment.yml -deleted file mode 100644 -index 16ae1aadb..000000000 ---- a/.github/workflows/reminder_comment.yml -+++ /dev/null -@@ -1,27 +0,0 @@ --name: PR Reminder Comment Bot --permissions: -- pull-requests: write --on: -- pull_request_target: -- types: [opened] --jobs: -- pr_reminder: -- runs-on: ubuntu-latest -- steps: -- - name: Remind to run full CI on PR -- uses: actions/github-script@60a0d83039c74a4aee543508d2ffcb1c3799cdea # v7.0.1 -- with: -- script: | -- github.rest.issues.createComment({ -- owner: 
context.repo.owner, -- repo: context.repo.repo, -- issue_number: context.issue.number, -- body: '👋 Hi! Thank you for contributing to the vLLM project.\n\n' + -- '💬 Join our developer Slack at https://slack.vllm.ai to discuss your PR in #pr-reviews, coordinate on features in #feat- channels, or join special interest groups in #sig- channels.\n\n' + -- 'Just a reminder: PRs would not trigger full CI run by default. Instead, it would only run `fastcheck` CI which starts running only a small and essential subset of CI tests to quickly catch errors. You can run other CI tests on top of those by going to your `fastcheck` build on Buildkite UI (linked in the PR checks section) and unblock them. If you do not have permission to unblock, ping `simon-mo` or `khluu` to add you in our Buildkite org.\n\n' + -- 'Once the PR is approved and ready to go, your PR reviewer(s) can run CI to test the changes comprehensively before merging.\n\n' + -- 'To run CI, PR reviewers can either: Add `ready` label to the PR or enable auto-merge.\n\n' + -- '🚀' -- }) -- env: -- GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} diff --git a/CMakeLists.txt b/CMakeLists.txt -index 98ed682fe..5dd6e907c 100644 +index 3f1f9a781..fef10e2ca 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -95,6 +95,10 @@ if (NOT VLLM_TARGET_DEVICE STREQUAL "cuda" AND @@ -474,7 +505,7 @@ index 98ed682fe..5dd6e907c 100644 return() endif() diff --git a/benchmarks/backend_request_func.py b/benchmarks/backend_request_func.py -index c7229dbb8..72531f3fc 100644 +index ba7c733be..61a9eeb91 100644 --- a/benchmarks/backend_request_func.py +++ b/benchmarks/backend_request_func.py @@ -18,7 +18,7 @@ from transformers import AutoTokenizer, PreTrainedTokenizer, PreTrainedTokenizer @@ -486,71 +517,8 @@ index c7229dbb8..72531f3fc 100644 @dataclass -diff --git a/benchmarks/benchmark_serving.py b/benchmarks/benchmark_serving.py -index c597fb106..5bad6645b 100644 ---- a/benchmarks/benchmark_serving.py -+++ b/benchmarks/benchmark_serving.py -@@ -256,10 +256,11 @@ async def benchmark( - raise ValueError(f"Unknown backend: {backend}") - - print("Starting initial single prompt test run...") -+ # set test_output_len=10 to avoid long prompt test run - test_prompt, test_prompt_len, test_output_len, test_mm_content = ( - input_requests[0].prompt, - input_requests[0].prompt_len, -- input_requests[0].expected_output_len, -+ 10, - input_requests[0].multi_modal_data, - ) - -diff --git a/benchmarks/benchmark_throughput.py b/benchmarks/benchmark_throughput.py -index 14461121f..e9b9f0b77 100644 ---- a/benchmarks/benchmark_throughput.py -+++ b/benchmarks/benchmark_throughput.py -@@ -44,6 +44,7 @@ def run_vllm( - n: int, - engine_args: EngineArgs, - disable_detokenize: bool = False, -+ do_profile: bool = False, - ) -> tuple[float, Optional[list[RequestOutput]]]: - from vllm import LLM, SamplingParams - -@@ -89,10 +90,14 @@ def run_vllm( - outputs = None - if not use_beam_search: - start = time.perf_counter() -+ if do_profile: -+ llm.start_profile() - outputs = llm.generate( - prompts, sampling_params, lora_request=lora_requests, use_tqdm=True - ) - end = time.perf_counter() -+ if do_profile: -+ llm.stop_profile() - else: - assert lora_requests is None, "BeamSearch API does not support LoRA" - prompts = [request.prompt for request in requests] -@@ -410,6 +415,7 @@ def main(args: argparse.Namespace): - args.n, - EngineArgs.from_cli_args(args), - args.disable_detokenize, -+ args.profile - ) - elif args.backend == "hf": - assert args.tensor_parallel_size == 1 -@@ -647,6 +653,10 @@ def 
create_argument_parser(): - parser.add_argument( - "--num-prompts", type=int, default=1000, help="Number of prompts to process." - ) -+ parser.add_argument("--profile", -+ action='store_true', -+ default=False, -+ help="whether run with profiler.") - parser.add_argument( - "--hf-max-batch-size", - type=int, diff --git a/cmake/utils.cmake b/cmake/utils.cmake -index 621179a70..9e1f4e9c7 100644 +index 9c0ed1d09..a21fb37f1 100644 --- a/cmake/utils.cmake +++ b/cmake/utils.cmake @@ -445,7 +445,7 @@ function (define_gpu_extension_target GPU_MOD_NAME) @@ -562,7 +530,7 @@ index 621179a70..9e1f4e9c7 100644 # Add hipify preprocessing step when building with HIP/ROCm. if (GPU_LANGUAGE STREQUAL "HIP") -@@ -487,6 +487,11 @@ function (define_gpu_extension_target GPU_MOD_NAME) +@@ -491,6 +491,11 @@ function (define_gpu_extension_target GPU_MOD_NAME) target_link_libraries(${GPU_MOD_NAME} PRIVATE torch ${GPU_LIBRARIES}) @@ -7954,76 +7922,41 @@ index 000000000..23f5b805c + +#endif \ No newline at end of file +diff --git a/docker/Dockerfile b/docker/Dockerfile +index d4761e84f..307e9658f 100644 +--- a/docker/Dockerfile ++++ b/docker/Dockerfile +@@ -196,7 +196,6 @@ ARG SCCACHE_S3_NO_CREDENTIALS=0 + + # Flag to control whether to use pre-built vLLM wheels + ARG VLLM_USE_PRECOMPILED="" +-ARG VLLM_MAIN_CUDA_VERSION="" + + # if USE_SCCACHE is set, use sccache to speed up compilation + RUN --mount=type=cache,target=/root/.cache/uv \ +@@ -214,7 +213,6 @@ RUN --mount=type=cache,target=/root/.cache/uv \ + && export SCCACHE_IDLE_TIMEOUT=0 \ + && export CMAKE_BUILD_TYPE=Release \ + && export VLLM_USE_PRECOMPILED="${VLLM_USE_PRECOMPILED}" \ +- && export VLLM_MAIN_CUDA_VERSION="${VLLM_MAIN_CUDA_VERSION}" \ + && export VLLM_DOCKER_BUILD_CONTEXT=1 \ + && sccache --show-stats \ + && python3 setup.py bdist_wheel --dist-dir=dist --py-limited-api=cp38 \ diff --git a/docker/Dockerfile.xpu b/docker/Dockerfile.xpu -index 7d5a589eb..25a9fd7cd 100644 +index ef4223525..ffa7c6ea7 100644 --- a/docker/Dockerfile.xpu +++ b/docker/Dockerfile.xpu -@@ -1,9 +1,10 @@ --# oneapi 2025.0.2 docker base image use rolling 2448 package. https://dgpu-docs.intel.com/releases/packages.html?release=Rolling+2448.13&os=Ubuntu+22.04, and we don't need install driver manually. 
--FROM intel/deep-learning-essentials:2025.0.2-0-devel-ubuntu22.04 AS vllm-base -+FROM intel/deep-learning-essentials:2025.1.3-0-devel-ubuntu24.04 AS vllm-base - --RUN rm /etc/apt/sources.list.d/intel-graphics.list -+RUN wget -O- https://apt.repos.intel.com/intel-gpg-keys/GPG-PUB-KEY-INTEL-SW-PRODUCTS.PUB | gpg --dearmor | tee /usr/share/keyrings/oneapi-archive-keyring.gpg > /dev/null && \ -+ echo "deb [signed-by=/usr/share/keyrings/oneapi-archive-keyring.gpg] https://apt.repos.intel.com/oneapi all main" | tee /etc/apt/sources.list.d/oneAPI.list && \ -+ add-apt-repository -y ppa:kobuk-team/intel-graphics - --RUN apt-get update -y && \ -+RUN apt clean && apt-get update -y && \ - apt-get install -y --no-install-recommends --fix-missing \ - curl \ - ffmpeg \ -@@ -14,15 +15,29 @@ RUN apt-get update -y && \ - libgl1 \ - lsb-release \ - numactl \ -- python3 \ -- python3-dev \ -- python3-pip \ -- wget -+ wget \ -+ vim \ -+ python3.12 \ -+ python3.12-dev \ -+ python3-pip -+ -+RUN update-alternatives --install /usr/bin/python3 python3 /usr/bin/python3.12 1 -+RUN update-alternatives --install /usr/bin/python python /usr/bin/python3.12 1 -+ -+RUN apt install -y libze1=1.23.1-1~24.04~ppa1 libze-dev=1.23.1-1~24.04~ppa1 libze-intel-gpu1=25.27.34303.9-1~24.04~ppa1 intel-opencl-icd=25.27.34303.9-1~24.04~ppa1 libze-intel-gpu-raytracing=1.1.0-114~u24.04 -+ -+RUN wget https://github.com/uxlfoundation/oneCCL/releases/download/2021.15.4/intel-oneccl-2021.15.4.11_offline.sh -+RUN bash intel-oneccl-2021.15.4.11_offline.sh -a --silent --eula accept && echo "source /opt/intel/oneapi/setvars.sh --force" >> /root/.bashrc -+SHELL ["bash", "-c"] -+CMD ["bash", "-c", "source /root/.bashrc && exec bash"] - - WORKDIR /workspace/vllm - COPY requirements/xpu.txt /workspace/vllm/requirements/xpu.txt - COPY requirements/common.txt /workspace/vllm/requirements/common.txt - -+# suppress the python externally managed environment error -+RUN python3 -m pip config set global.break-system-packages true -+ - RUN --mount=type=cache,target=/root/.cache/pip \ - pip install --no-cache-dir \ - -r requirements/xpu.txt -@@ -47,10 +62,11 @@ FROM vllm-base AS vllm-openai +@@ -62,7 +62,7 @@ FROM vllm-base AS vllm-openai # install additional dependencies for openai api server RUN --mount=type=cache,target=/root/.cache/pip \ - pip install accelerate hf_transfer pytest pytest_asyncio lm_eval[api] modelscope + pip install accelerate hf_transfer pytest pytest_asyncio lm_eval[api] 'modelscope!=1.15.0' -+ -+RUN --mount=type=cache,target=/root/.cache/pip \ -+ pip uninstall oneccl oneccl-devel -y --ENV VLLM_USAGE_SOURCE production-docker-image \ -- TRITON_XPU_PROFILE 1 - # install development dependencies (for testing) - RUN python3 -m pip install -e tests/vllm_test_utils - ENTRYPOINT ["python3", "-m", "vllm.entrypoints.openai.api_server"] + RUN --mount=type=cache,target=/root/.cache/pip \ + pip uninstall oneccl oneccl-devel -y diff --git a/docs/features/quantization/fp8.md b/docs/features/quantization/fp8.md -index 0661933ac..469d88a05 100644 +index 834c03cbe..439e1e0d7 100644 --- a/docs/features/quantization/fp8.md +++ b/docs/features/quantization/fp8.md @@ -134,4 +134,4 @@ print(result[0].outputs[0].text) @@ -8033,210 +7966,173 @@ index 0661933ac..469d88a05 100644 - Currently, we load the model at original precision before quantizing down to 8-bits, so you need enough memory to load the whole model. + Currently, by default we load the model at original precision before quantizing down to 8-bits, so you need enough memory to load the whole model. 
To avoid this, adding `VLLM_OFFLOAD_WEIGHTS_BEFORE_QUANT=1` can allow offloading weights to cpu before quantization and quantized weights will be kept in device. diff --git a/docs/models/supported_models.md b/docs/models/supported_models.md -index c8b6c6c86..404045306 100644 +index db3dd2c25..7d3577b14 100644 --- a/docs/models/supported_models.md +++ b/docs/models/supported_models.md -@@ -592,7 +592,8 @@ Specified using `--task generate`. - | `GraniteSpeechForConditionalGeneration` | Granite Speech | T + A | `ibm-granite/granite-speech-3.3-8b` | ✅︎ | ✅︎ | ✅︎ | - | `H2OVLChatModel` | H2OVL | T + IE+ | `h2oai/h2ovl-mississippi-800m`, `h2oai/h2ovl-mississippi-2b`, etc. | | ✅︎ | ✅︎ | - | `Idefics3ForConditionalGeneration` | Idefics3 | T + I | `HuggingFaceM4/Idefics3-8B-Llama3`, etc. | ✅︎ | | ✅︎ | --| `InternVLChatModel` | InternVL 3.0, InternVideo 2.5, InternVL 2.5, Mono-InternVL, InternVL 2.0 | T + IE+ + (VE+) | `OpenGVLab/InternVL3-9B`, `OpenGVLab/InternVideo2_5_Chat_8B`, `OpenGVLab/InternVL2_5-4B`, `OpenGVLab/Mono-InternVL-2B`, `OpenGVLab/InternVL2-4B`, etc. | ✅︎ | ✅︎ | ✅︎ | -+| `InternS1ForConditionalGeneration` | Intern-S1 | T + IE+ + VE+ | `internlm/Intern-S1`, etc. | ✅︎ | ✅︎ | ✅︎ | -+| `InternVLChatModel` | InternVL 3.5, InternVL 3.0, InternVideo 2.5, InternVL 2.5, Mono-InternVL, InternVL 2.0 | T + IE+ + (VE+) | `OpenGVLab/InternVL3_5-14B`, `OpenGVLab/InternVL3-9B`, `OpenGVLab/InternVideo2_5_Chat_8B`, `OpenGVLab/InternVL2_5-4B`, `OpenGVLab/Mono-InternVL-2B`, `OpenGVLab/InternVL2-4B`, etc. | ✅︎ | ✅︎ | ✅︎ | - | `KeyeForConditionalGeneration` | Keye-VL-8B-Preview | T + IE+ + VE+ | `Kwai-Keye/Keye-VL-8B-Preview` | | | ✅︎ | - | `KimiVLForConditionalGeneration` | Kimi-VL-A3B-Instruct, Kimi-VL-A3B-Thinking | T + I+ | `moonshotai/Kimi-VL-A3B-Instruct`, `moonshotai/Kimi-VL-A3B-Thinking` | | | ✅︎ | - | `Llama4ForConditionalGeneration` | Llama 4 | T + I+ | `meta-llama/Llama-4-Scout-17B-16E-Instruct`, `meta-llama/Llama-4-Maverick-17B-128E-Instruct-FP8`, `meta-llama/Llama-4-Maverick-17B-128E-Instruct`, etc. | | ✅︎ | ✅︎ | -@@ -602,7 +603,7 @@ Specified using `--task generate`. - | `LlavaNextVideoForConditionalGeneration` | LLaVA-NeXT-Video | T + V | `llava-hf/LLaVA-NeXT-Video-7B-hf`, etc. | | ✅︎ | ✅︎ | - | `LlavaOnevisionForConditionalGeneration` | LLaVA-Onevision | T + I+ + V+ | `llava-hf/llava-onevision-qwen2-7b-ov-hf`, `llava-hf/llava-onevision-qwen2-0.5b-ov-hf`, etc. | | ✅︎ | ✅︎ | - | `MiniCPMO` | MiniCPM-O | T + IE+ + VE+ + AE+ | `openbmb/MiniCPM-o-2_6`, etc. | ✅︎ | ✅︎ | ✅︎ | --| `MiniCPMV` | MiniCPM-V | T + IE+ + VE+ | `openbmb/MiniCPM-V-2` (see note), `openbmb/MiniCPM-Llama3-V-2_5`, `openbmb/MiniCPM-V-2_6`, etc. | ✅︎ | | ✅︎ | -+| `MiniCPMV` | MiniCPM-V | T + IE+ + VE+ | `openbmb/MiniCPM-V-2` (see note), `openbmb/MiniCPM-Llama3-V-2_5`, `openbmb/MiniCPM-V-2_6`, `openbmb/MiniCPM-V-4`, `openbmb/MiniCPM-V-4_5`, etc. | ✅︎ | | ✅︎ | - | `MiniMaxVL01ForConditionalGeneration` | MiniMax-VL | T + IE+ | `MiniMaxAI/MiniMax-VL-01`, etc. | | ✅︎ | ✅︎ | - | `Mistral3ForConditionalGeneration` | Mistral3 (HF Transformers) | T + I+ | `mistralai/Mistral-Small-3.1-24B-Instruct-2503`, etc. | ✅︎ | ✅︎ | ✅︎ | - | `MllamaForConditionalGeneration` | Llama 3.2 | T + I+ | `meta-llama/Llama-3.2-90B-Vision-Instruct`, `meta-llama/Llama-3.2-11B-Vision`, etc. | | | | -@@ -646,6 +647,15 @@ Specified using `--task generate`. +@@ -340,6 +340,7 @@ th { + | `DeepseekV2ForCausalLM` | DeepSeek-V2 | `deepseek-ai/DeepSeek-V2`, `deepseek-ai/DeepSeek-V2-Chat`, etc. 
| ✅︎ | ✅︎ | ✅︎ | + | `DeepseekV3ForCausalLM` | DeepSeek-V3 | `deepseek-ai/DeepSeek-V3`, `deepseek-ai/DeepSeek-R1`, `deepseek-ai/DeepSeek-V3.1`, etc. | ✅︎ | ✅︎ | ✅︎ | + | `Dots1ForCausalLM` | dots.llm1 | `rednote-hilab/dots.llm1.base`, `rednote-hilab/dots.llm1.inst`, etc. | | ✅︎ | ✅︎ | ++| `DotsOCRForCausalLM` | dots_ocr | `rednote-hilab/dots.ocr` | | ✅︎ | ✅︎ | + | `Ernie4_5ForCausalLM` | Ernie4.5 | `baidu/ERNIE-4.5-0.3B-PT`, etc. | ✅︎ | ✅︎ | ✅︎ | + | `Ernie4_5_MoeForCausalLM` | Ernie4.5MoE | `baidu/ERNIE-4.5-21B-A3B-PT`, `baidu/ERNIE-4.5-300B-A47B-PT`, etc. |✅︎| ✅︎ | ✅︎ | + | `ExaoneForCausalLM` | EXAONE-3 | `LGAI-EXAONE/EXAONE-3.0-7.8B-Instruct`, etc. | ✅︎ | ✅︎ | ✅︎ | +@@ -667,6 +668,9 @@ These models primarily accept the [`LLM.generate`](./generative_models.md#llmgen + | `Qwen2VLForConditionalGeneration` | QVQ, Qwen2-VL | T + IE+ + VE+ | `Qwen/QVQ-72B-Preview`, `Qwen/Qwen2-VL-7B-Instruct`, `Qwen/Qwen2-VL-72B-Instruct`, etc. | ✅︎ | ✅︎ | ✅︎ | + | `Qwen2_5_VLForConditionalGeneration` | Qwen2.5-VL | T + IE+ + VE+ | `Qwen/Qwen2.5-VL-3B-Instruct`, `Qwen/Qwen2.5-VL-72B-Instruct`, etc. | ✅︎ | ✅︎ | ✅︎ | + | `Qwen2_5OmniThinkerForConditionalGeneration` | Qwen2.5-Omni | T + IE+ + VE+ + A+ | `Qwen/Qwen2.5-Omni-3B`, `Qwen/Qwen2.5-Omni-7B` | ✅︎ | ✅︎ | ✅︎ | ++| `Qwen3VLForConditionalGeneration` | Qwen3-VL | T + IE+ + VE+ | `Qwen/Qwen3-VL-4B-Instruct`, etc. | ✅︎ | ✅︎ | ✅︎ | ++| `Qwen3VLMoeForConditionalGeneration` | Qwen3-VL-MOE | T + IE+ + VE+ | `Qwen/Qwen3-VL-30B-A3B-Instruct`, etc. | ✅︎ | ✅︎ | ✅︎ | ++| `Qwen3OmniMoeThinkerForConditionalGeneration` | Qwen3-Omni | T + IE+ + VE+ + A+ | `Qwen/Qwen3-Omni-30B-A3B-Instruct`, `Qwen/Qwen3-Omni-30B-A3B-Thinking` | ✅︎ | ✅︎ | ✅︎ | + | `RForConditionalGeneration` | R-VL-4B | T + IE+ | `YannQi/R-4B` | | ✅︎ | ✅︎ | + | `SkyworkR1VChatModel` | Skywork-R1V-38B | T + I | `Skywork/Skywork-R1V-38B` | | ✅︎ | ✅︎ | + | `SmolVLMForConditionalGeneration` | SmolVLM2 | T + I | `SmolVLM2-2.2B-Instruct` | ✅︎ | | ✅︎ | +@@ -757,8 +761,7 @@ Some models are supported only via the [Transformers backend](#transformers). Th + Our PaliGemma implementations have the same problem as Gemma 3 (see above) for both V0 and V1. - This limitation exists because the model's mixed attention pattern (bidirectional for images, causal otherwise) is not yet supported by vLLM's attention backends. - -+!!! note -+ `Gemma3nForConditionalGeneration` is only supported on V1 due to shared KV caching and it depends on `timm>=1.0.17` to make use of its -+ MobileNet-v5 vision backbone. -+ -+ Performance is not yet fully optimized mainly due to: -+ -+ - Both audio and vision MM encoders use `transformers.AutoModel` implementation. -+ - There's no PLE caching or out-of-memory swapping support, as described in [Google's blog](https://developers.googleblog.com/en/introducing-gemma-3n/). These features might be too model-specific for vLLM, and swapping in particular may be better suited for constrained setups. -+ !!! note - Only `InternVLChatModel` with Qwen2.5 text backbone (`OpenGVLab/InternVL3-2B`, `OpenGVLab/InternVL2.5-1B` etc) has video inputs support currently. +- For Qwen2.5-Omni, reading audio from video pre-processing (`--mm-processor-kwargs '{"use_audio_in_video": true}'`) +- is currently supported on V0 (but not V1), because overlapping modalities is not yet supported in V1. ++ For Qwen2.5-Omni and Qwen3-Omni, reading audio from video pre-processing (`--mm-processor-kwargs '{"use_audio_in_video": true}'`) is currently work in progress and not yet supported. 
-diff --git a/examples/offline_inference/basic/reward.py b/examples/offline_inference/basic/reward.py + #### Transcription + +diff --git a/examples/bmg/reasoning.py b/examples/bmg/reasoning.py new file mode 100644 -index 000000000..aec3481d2 +index 000000000..04f91786e --- /dev/null -+++ b/examples/offline_inference/basic/reward.py -@@ -0,0 +1,55 @@ -+# SPDX-License-Identifier: Apache-2.0 -+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project ++++ b/examples/bmg/reasoning.py +@@ -0,0 +1,27 @@ ++from openai import OpenAI + -+from argparse import Namespace ++# Modify OpenAI's API key and API base to use vLLM's API server. ++openai_api_key = "EMPTY" ++openai_api_base = "http://0.0.0.0:8000/v1" + -+from vllm import LLM, EngineArgs -+from vllm.utils import FlexibleArgumentParser ++client = OpenAI( ++ api_key=openai_api_key, ++ base_url=openai_api_base, ++) + ++models = client.models.list() ++model = models.data[0].id + -+def parse_args(): -+ parser = FlexibleArgumentParser() -+ parser = EngineArgs.add_cli_args(parser) -+ # Set example specific arguments -+ parser.set_defaults( -+ model="internlm/internlm2-1_8b-reward", -+ #runner="pooling", -+ task="reward", -+ enforce_eager=True, -+ max_model_len=1024, -+ trust_remote_code=True, -+ ) -+ return parser.parse_args() ++# Round 1 ++messages = [{"role": "user", "content": "9.11 and 9.8, which is greater?"}] ++# For granite, add: `extra_body={"chat_template_kwargs": {"thinking": True}}` ++# For Qwen3 series, if you want to disable thinking in reasoning mode, add: ++# extra_body={"chat_template_kwargs": {"enable_thinking": False}} ++response = client.chat.completions.create(model=model, messages=messages) + ++reasoning_content = response.choices[0].message.reasoning_content ++content = response.choices[0].message.content + -+def main(args: Namespace): -+ # Sample prompts. -+ prompts = [ -+ "Hello, my name is", -+ "The president of the United States is", -+ "The capital of France is", -+ "The future of AI is", -+ ] ++print("reasoning_content:", reasoning_content) ++print("content:", content) + -+ # Create an LLM. -+ # You should pass runner="pooling" for reward models -+ llm = LLM(**vars(args)) +diff --git a/examples/bmg/tooling.py b/examples/bmg/tooling.py +new file mode 100644 +index 000000000..bf8375831 +--- /dev/null ++++ b/examples/bmg/tooling.py +@@ -0,0 +1,37 @@ ++import json + -+ # Generate rewards. The output is a list of PoolingRequestOutput. -+ outputs = llm.reward(prompts) ++client = OpenAI(base_url="http://0.0.0.0:8000/v1", api_key="dummy") ++ ++def get_weather(location: str, unit: str): ++ return f"Getting the weather for {location} in {unit}..." ++tool_functions = {"get_weather": get_weather} ++ ++tools = [{ ++ "type": "function", ++ "function": { ++ "name": "get_weather", ++ "description": "Get the current weather in a given location", ++ "parameters": { ++ "type": "object", ++ "properties": { ++ "location": {"type": "string", "description": "City and state, e.g., 'San Francisco, CA'"}, ++ "unit": {"type": "string", "enum": ["celsius", "fahrenheit"]} ++ }, ++ "required": ["location", "unit"] ++ } ++ } ++}] ++ ++response = client.chat.completions.create( ++ model=client.models.list().data[0].id, ++ messages=[{"role": "user", "content": "What's the weather like in San Francisco?"}], ++ tools=tools, ++ temperature=0, ++ tool_choice="auto" ++) + -+ # Print the outputs. 
-+ print("\nGenerated Outputs:\n" + "-" * 60) -+ for prompt, output in zip(prompts, outputs): -+ rewards = output.outputs.data -+ rewards_trimmed = ( -+ (str(rewards[:16])[:-1] + ", ...]") if len(rewards) > 16 else rewards -+ ) -+ print(f"Prompt: {prompt!r} \nReward: {rewards_trimmed} (size={len(rewards)})") -+ print("-" * 60) -+ -+ -+if __name__ == "__main__": -+ args = parse_args() -+ main(args) -+ -diff --git a/examples/offline_inference/multilora_inference.py b/examples/offline_inference/multilora_inference.py -index f0c00bcaa..c8fa36295 100644 ---- a/examples/offline_inference/multilora_inference.py -+++ b/examples/offline_inference/multilora_inference.py -@@ -30,7 +30,7 @@ def create_test_prompts( - ( - "A robot may not injure a human being", - SamplingParams( -- temperature=0.0, logprobs=1, prompt_logprobs=1, max_tokens=128 -+ temperature=0.0, logprobs=1, max_tokens=128 - ), - None, - ), -@@ -46,7 +46,7 @@ def create_test_prompts( - SamplingParams( - temperature=0.0, - logprobs=1, -- prompt_logprobs=1, -+ #prompt_logprobs=1, - max_tokens=128, - stop_token_ids=[32003], - ), -@@ -57,7 +57,7 @@ def create_test_prompts( - SamplingParams( - temperature=0.0, - logprobs=1, -- prompt_logprobs=1, -+ #prompt_logprobs=1, - max_tokens=128, - stop_token_ids=[32003], - ), -@@ -99,14 +99,14 @@ def initialize_engine() -> LLMEngine: - # numbers will cause higher memory usage. If you know that all LoRAs will - # use the same rank, it is recommended to set this as low as possible. - # max_cpu_loras: controls the size of the CPU LoRA cache. -- engine_args = EngineArgs( -- model="meta-llama/Llama-2-7b-hf", -- enable_lora=True, -- max_loras=1, -- max_lora_rank=8, -- max_cpu_loras=2, -- max_num_seqs=256, -- ) -+ engine_args = EngineArgs(model="meta-llama/Llama-2-7b-hf", -+ enable_lora=True, -+ max_loras=1, -+ max_lora_rank=8, -+ max_cpu_loras=2, -+ max_num_seqs=256, -+ enforce_eager=True, -+ block_size=64) - return LLMEngine.from_engine_args(engine_args) - - -diff --git a/examples/offline_inference/prithvi_geospatial_mae.py b/examples/offline_inference/prithvi_geospatial_mae.py -index 4fdc7a3cf..b6007b9f4 100644 ---- a/examples/offline_inference/prithvi_geospatial_mae.py -+++ b/examples/offline_inference/prithvi_geospatial_mae.py -@@ -3,12 +3,12 @@ - import argparse - import datetime - import os --import re - from typing import Union - - import albumentations - import numpy as np - import rasterio -+import regex as re - import torch - from einops import rearrange - from terratorch.datamodules import Sen1Floods11NonGeoDataModule ++tool_call = response.choices[0].message.tool_calls[0].function ++print(f"Function called: {tool_call.name}") ++print(f"Arguments: {tool_call.arguments}") ++print(f"Result: {tool_functions[tool_call.name](**json.loads(tool_call.arguments))}") 30,22 Bot ++ +diff --git a/examples/offline_inference/data_parallel.py b/examples/offline_inference/data_parallel.py +index 36d805a32..2a4233b6a 100644 +--- a/examples/offline_inference/data_parallel.py ++++ b/examples/offline_inference/data_parallel.py +@@ -96,6 +96,13 @@ def parse_args(): + "--quantization", + type=str, + ) ++ parser.add_argument( ++ "--disable-expert-parallel", ++ dest="enable_expert_parallel", ++ action="store_false", ++ help="Disable expert parallel (default: enabled).", ++ ) ++ parser.set_defaults(enable_expert_parallel=True) + return parser.parse_args() + + +@@ -108,6 +115,7 @@ def main( + dp_master_port, + GPUs_per_dp_rank, + enforce_eager, ++ enable_expert_parallel, + trust_remote_code, + max_num_seqs, + 
max_model_len, +@@ -162,7 +170,7 @@ def main( + model=model, + tensor_parallel_size=GPUs_per_dp_rank, + enforce_eager=enforce_eager, +- enable_expert_parallel=True, ++ enable_expert_parallel=enable_expert_parallel, + trust_remote_code=trust_remote_code, + max_num_seqs=max_num_seqs, + max_model_len=max_model_len, +@@ -222,6 +230,7 @@ if __name__ == "__main__": + dp_master_port, + tp_size, + args.enforce_eager, ++ args.enable_expert_parallel, + args.trust_remote_code, + args.max_num_seqs, + args.max_model_len, diff --git a/examples/offline_inference/vision_language.py b/examples/offline_inference/vision_language.py -index e4811c023..fe4393bcf 100644 +index b104113b8..58fb423e8 100644 --- a/examples/offline_inference/vision_language.py +++ b/examples/offline_inference/vision_language.py -@@ -389,6 +389,39 @@ def run_tarsier(questions: list[str], modality: str) -> ModelRequestData: +@@ -126,6 +126,23 @@ def run_chameleon(questions: list[str], modality: str) -> ModelRequestData: ) -+# Intern-S1 -+def run_interns1(questions: list[str], modality: str) -> ModelRequestData: -+ model_name = "internlm/Intern-S1" ++# Dots-OCR ++def run_dots_ocr(questions: list[str], modality: str) -> ModelRequestData: ++ assert modality == "image" + ++ prompts = [f"<|img|><|imgpad|><|endofimg|>{question}" for question in questions] + engine_args = EngineArgs( -+ model=model_name, -+ trust_remote_code=True, -+ max_model_len=8192, -+ max_num_seqs=2, ++ model="rednote-hilab/dots.ocr", + limit_mm_per_prompt={modality: 1}, -+ enforce_eager=True, -+ ) -+ -+ if modality == "image": -+ placeholder = "" -+ elif modality == "video": -+ placeholder = "