diff --git a/vllm/README.md b/vllm/README.md
index 8c80c85..93e4a6c 100644
--- a/vllm/README.md
+++ b/vllm/README.md
@@ -21,6 +21,7 @@ llm-scaler-vllm is an extended and optimized version of vLLM, specifically adapt
2.5 [Omni Model Support](#25-omni-model-support)
2.6 [Data Parallelism (DP)](#26-data-parallelism-dp)
2.7 [Finding maximum Context Length](#27-finding-maximum-context-length)
+ 2.8 [Multi-Modal Webui](#28-multi-modal-webui)
3. [Supported Models](#3-supported-models)
4. [Troubleshooting](#4-troubleshooting)
5. [Performance tuning](#5-performance-tuning)
@@ -2314,6 +2315,33 @@ In this case, you should adjust the launch command with:
--max-model-len 114432
```
+### 2.8 Multi-Modal Webui
+The project provides two optimized interfaces for interacting with Qwen2.5-VL models:
+
+
+#### 📌 Core Components
+- **Inference Engine**: vLLM (Intel-optimized)
+- **Interfaces**:
+ - Gradio (for rapid prototyping)
+ - ComfyUI (for complex workflows)
+
+#### 🚀 Deployment Options
+
+##### Option 1: Gradio Deployment (Recommended for Most Users)
+- See `/llm-scaler/vllm/webui/multi-modal-gradio/README.md` for implementation details.
+
+##### Option 2: ComfyUI Deployment (Advanced Workflows)
+- See `/llm-scaler/vllm/webui/multi-modal-comfyui/README.md` for implementation details.
+
+
+#### 🔧 Configuration Guide
+
+| Parameter | Purpose | Recommended Value |
+|-----------|---------|-------------------|
+| `--quantization fp8` | FP8 quantization on Intel XPU | Required |
+| `-tp=2` | Tensor parallelism | Match GPU count |
+| `--max-model-len` | Context window | 32768 (max) |
+
---
## 3. Supported Models
diff --git a/vllm/webui/multi-modal-comfyui/README.md b/vllm/webui/multi-modal-comfyui/README.md
new file mode 100644
index 0000000..63203a3
--- /dev/null
+++ b/vllm/webui/multi-modal-comfyui/README.md
@@ -0,0 +1,284 @@
+# Qwen2.5-VL-3B-Instruct Deployment Guide (ComfyUI + Intel GPU + Linux)
+
+This document provides instructions for deploying the `Qwen2.5-VL-3B-Instruct` multimodal LLM on Linux systems with `Intel GPU` acceleration via a `ComfyUI` workflow.
+
+## 🛠️ Installation Procedure
+### 1. Environment Setup
+```bash
+# Install system dependencies
+sudo apt update && sudo apt install -y \
+ git python3-pip python3-venv \
+ ocl-icd-opencl-dev
+
+# Configure Intel GPU drivers (if not present)
+sudo apt install -y \
+ intel-opencl-icd \
+ intel-level-zero-gpu \
+ level-zero
+```
+
+### 2. Conda Environment Configuration
+```bash
+conda create -n comfyqwen python=3.11
+conda activate comfyqwen
+```
+
+### 3. ComfyUI Installation
+```bash
+git clone https://github.com/comfyanonymous/ComfyUI.git
+cd ./ComfyUI
+
+# Install Intel-optimized PyTorch
+pip install torch torchvision torchaudio \
+ --index-url https://download.pytorch.org/whl/xpu
+
+# For nightly builds with potential performance improvements:
+# pip install --pre torch torchvision torchaudio \
+# --index-url https://download.pytorch.org/whl/nightly/xpu
+
+pip install -r requirements.txt
+```
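+
+After installation, you can quickly confirm that PyTorch sees the Intel GPU through the XPU backend (a minimal check, assuming the XPU-enabled PyTorch build installed above):
+
+```python
+import torch
+
+# True when the XPU-enabled PyTorch build detects an Intel GPU
+print("XPU available:", torch.xpu.is_available())
+if torch.xpu.is_available():
+    # Name of the first detected Intel GPU device
+    print("Device:", torch.xpu.get_device_name(0))
+```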
+
+### 4. Qwen2.5-VL Custom Node Deployment
+```bash
+# Download the custom node definition files
+git clone https://github.com/IuvenisSapiens/ComfyUI_Qwen2_5-VL-Instruct
+
+# Move the ComfyUI_Qwen2_5-VL-Instruct folder into ComfyUI's custom_nodes/ directory
+mv ComfyUI_Qwen2_5-VL-Instruct ComfyUI/custom_nodes/
+
+# Place the downloaded Qwen2.5-VL-3B-Instruct model folder into ComfyUI/models/prompt_generator/
+# (create the prompt_generator subdirectory under models/ first if it does not exist)
+mkdir -p ComfyUI/models/prompt_generator
+```
+The example workflow below can be saved as `ComfyUI_Qwen2_5-VL-Instruct_workflow.json` and loaded from the ComfyUI web interface:
+
+```json
+{
+ "id": "9f2dfc63-3d19-433d-a7c0-49d83464f553",
+ "revision": 0,
+ "last_node_id": 59,
+ "last_link_id": 72,
+ "nodes": [
+ {
+ "id": 56,
+ "type": "Qwen2_VQA",
+ "pos": [
+ 199.93017578125,
+ 46.947696685791016
+ ],
+ "size": [
+ 322.1059265136719,
+ 348
+ ],
+ "flags": {},
+ "order": 2,
+ "mode": 0,
+ "inputs": [
+ {
+ "name": "source_path",
+ "shape": 7,
+ "type": "PATH",
+ "link": 70
+ },
+ {
+ "name": "image",
+ "shape": 7,
+ "type": "IMAGE",
+ "link": null
+ }
+ ],
+ "outputs": [
+ {
+ "name": "STRING",
+ "type": "STRING",
+ "slot_index": 0,
+ "links": [
+ 72
+ ]
+ }
+ ],
+ "properties": {
+ "Node name for S&R": "Qwen2_VQA",
+ "widget_ue_connectable": {}
+ },
+ "widgets_values": [
+ "Describe the video in detail",
+ "Qwen2.5-VL-3B-Instruct",
+ "none",
+ false,
+ 0.7,
+ 2048,
+ 200704,
+ 1003520,
+ 1444,
+ "randomize",
+ "eager"
+ ]
+ },
+ {
+ "id": 59,
+ "type": "PreviewAny",
+ "pos": [
+ 702.7207641601562,
+ 61.4115104675293
+ ],
+ "size": [
+ 140,
+ 76
+ ],
+ "flags": {},
+ "order": 3,
+ "mode": 0,
+ "inputs": [
+ {
+ "name": "source",
+ "type": "*",
+ "link": 72
+ }
+ ],
+ "outputs": [],
+ "properties": {
+ "Node name for S&R": "PreviewAny"
+ },
+ "widgets_values": []
+ },
+ {
+ "id": 58,
+ "type": "VideoLoader",
+ "pos": [
+ -513.0911254882812,
+ 130.9906768798828
+ ],
+ "size": [
+ 430.6719665527344,
+ 452.4115295410156
+ ],
+ "flags": {},
+ "order": 0,
+ "mode": 0,
+ "inputs": [],
+ "outputs": [
+ {
+ "name": "VIDEO",
+ "type": "VIDEO",
+ "links": null
+ },
+ {
+ "name": "PATH",
+ "type": "PATH",
+ "links": [
+ 71
+ ]
+ }
+ ],
+ "properties": {
+ "Node name for S&R": "VideoLoader",
+ "widget_ue_connectable": {}
+ },
+ "widgets_values": [
+ "19_raw.mp4",
+ "image"
+ ]
+ },
+ {
+ "id": 57,
+ "type": "MultiplePathsInput",
+ "pos": [
+ -49.730098724365234,
+ 137.55857849121094
+ ],
+ "size": [
+ 210,
+ 82
+ ],
+ "flags": {},
+ "order": 1,
+ "mode": 0,
+ "inputs": [
+ {
+ "name": "path_1",
+ "type": "PATH",
+ "link": 71
+ }
+ ],
+ "outputs": [
+ {
+ "name": "paths",
+ "type": "PATH",
+ "slot_index": 0,
+ "links": [
+ 70
+ ]
+ }
+ ],
+ "properties": {
+ "Node name for S&R": "MultiplePathsInput",
+ "widget_ue_connectable": {}
+ },
+ "widgets_values": [
+ 1
+ ]
+ }
+ ],
+ "links": [
+ [
+ 70,
+ 57,
+ 0,
+ 56,
+ 0,
+ "PATH"
+ ],
+ [
+ 71,
+ 58,
+ 1,
+ 57,
+ 0,
+ "PATH"
+ ],
+ [
+ 72,
+ 56,
+ 0,
+ 59,
+ 0,
+ "*"
+ ]
+ ],
+ "groups": [],
+ "config": {},
+ "extra": {
+ "ds": {
+ "scale": 0.9646149645000006,
+ "offset": [
+ 788.9511067206646,
+ 382.6344411516708
+ ]
+ },
+ "frontendVersion": "1.24.4",
+ "ue_links": [],
+ "links_added_by_ue": [],
+ "VHS_latentpreview": false,
+ "VHS_latentpreviewrate": 0,
+ "VHS_MetadataImage": true,
+ "VHS_KeepIntermediate": true
+ },
+ "version": 0.4
+}
+```
+
+## 🚀 Launching ComfyUI
+```bash
+python main.py
+```
+Access the web interface at: `http://localhost:8188`
+
+## Post-Installation Configuration
+1. Replace the final component node with `Preview Any` in your workflow
+2. Reference model path: `./models/prompt_generator/Qwen2.5-VL-3B-Instruct/`
+
+
+
+## References
+- [ComfyUI GitHub](https://github.com/comfyanonymous/ComfyUI)
+- [Intel PyTorch XPU](https://intel.github.io/intel-extension-for-pytorch/)
+- [Qwen2.5 Model Card](https://huggingface.co/Qwen/Qwen2.5-VL-3B-Instruct)
+
diff --git a/vllm/webui/multi-modal-comfyui/pic/image.png b/vllm/webui/multi-modal-comfyui/pic/image.png
new file mode 100644
index 0000000..e9817a6
Binary files /dev/null and b/vllm/webui/multi-modal-comfyui/pic/image.png differ
diff --git a/vllm/webui/multi-modal-gradio/README.md b/vllm/webui/multi-modal-gradio/README.md
new file mode 100644
index 0000000..6e3f8c0
--- /dev/null
+++ b/vllm/webui/multi-modal-gradio/README.md
@@ -0,0 +1,89 @@
+# Qwen2.5-VL-7B-Instruct Multimodal Deployment Guide (Intel GPU/Docker/Gradio)
+
+## 📌 Core Components
+
+- **Model**: Qwen2.5-VL-7B-Instruct (vision-language multimodal)
+- **Inference Engine**: vLLM with Intel XPU optimizations
+- **Interface**: Gradio WebUI
+- **Deployment**: Docker containerized
+
+
+## 🚀 Quick Deployment
+
+### 1. Launch Docker Container
+```bash
+sudo docker run -td \
+ --privileged \
+ --net=host \
+ --device=/dev/dri \
+ --name=yourcontainername \
+ -v /home/intel/LLM:/llm/models/ \
+ -e no_proxy=localhost,127.0.0.1 \
+ -e http_proxy=$http_proxy \
+ -e https_proxy=$https_proxy \
+ --shm-size="32g" \
+ --entrypoint /bin/bash \
+ intel/llm-scaler-vllm:latest
+```
+
+### 2. Start vLLM Service Inside Container
+```bash
+docker exec -it yourcontainername bash
+```
+```bash
+TORCH_LLM_ALLREDUCE=1 \
+VLLM_USE_V1=1 \
+CCL_ZE_IPC_EXCHANGE=pidfd \
+VLLM_ALLOW_LONG_MAX_MODEL_LEN=1 \
+VLLM_WORKER_MULTIPROC_METHOD=spawn \
+python3 -m vllm.entrypoints.openai.api_server \
+ --model /llm/models/Qwen2.5-VL-7B-Instruct \
+ --dtype=float16 \
+ --device=xpu \
+ --enforce-eager \
+ --port 8000 \
+ --host 0.0.0.0 \
+ --trust-remote-code \
+ --gpu-memory-util=0.9 \
+ --no-enable-prefix-caching \
+ --max-num-batched-tokens=8192 \
+ --disable-log-requests \
+ --max-model-len=32768 \
+ --block-size 64 \
+ --quantization fp8 \
+ -tp=2
+```
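+
+Once the server is up, the OpenAI-compatible endpoint can be sanity-checked before starting the WebUI. A minimal sketch using the `openai` Python client (the image path is a placeholder; the model name must match the `--model` path above):
+
+```python
+import base64
+from openai import OpenAI
+
+client = OpenAI(api_key="EMPTY", base_url="http://localhost:8000/v1")
+
+# Encode a local test image as a base64 data URL (placeholder path)
+with open("test.jpg", "rb") as f:
+    data_url = "data:image/jpeg;base64," + base64.b64encode(f.read()).decode()
+
+resp = client.chat.completions.create(
+    model="/llm/models/Qwen2.5-VL-7B-Instruct",
+    messages=[{
+        "role": "user",
+        "content": [
+            {"type": "text", "text": "Describe this image."},
+            {"type": "image_url", "image_url": {"url": data_url}},
+        ],
+    }],
+    max_tokens=128,
+)
+print(resp.choices[0].message.content)
+```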
+
+### 3. Launch Gradio Interface on Host
+```bash
+conda create -n qwen_gradio python=3.11
+conda activate qwen_gradio
+pip install gradio
+```
+```bash
+python /llm-scaler/vllm/webui/multi-modal-gradio/main.py --model /llm/models/Qwen2.5-VL-7B-Instruct
+# The model needs to be downloaded to this directory in advance
+```
+
+## 🌐 Access Interface
+Open in browser: `http://localhost:8003` (or `http://<host-ip>:8003` when launched with `--host 0.0.0.0`)
+
+
+
+## 🔧 Key Parameters
+
+### vLLM Server Flags
+| Parameter | Purpose | Recommended Value |
+|-----------|---------|-------------------|
+| `--quantization fp8` | FP8 acceleration | Required |
+| `-tp=2` | Tensor parallelism | Adjust based on GPU count |
+| `--max-model-len` | Context window | 32768 |
+
+
+## 📜 License
+- Model: [Qwen License](https://huggingface.co/Qwen/Qwen2.5-VL-7B-Instruct)
+- Code: `Apache-2.0`
diff --git a/vllm/webui/multi-modal-gradio/main.py b/vllm/webui/multi-modal-gradio/main.py
new file mode 100644
index 0000000..e16f225
--- /dev/null
+++ b/vllm/webui/multi-modal-gradio/main.py
@@ -0,0 +1,238 @@
+# SPDX-License-Identifier: Apache-2.0
+import argparse
+import gradio as gr
+from openai import OpenAI, APIError
+from typing import List, Dict, Any, Optional, Tuple
+import os
+import base64
+from pathlib import Path
+import cv2
+import tempfile
+import shutil
+from uuid import uuid4
+
+VIDEO_TEMP_DIR = Path("gradio_temp_videos")
+if VIDEO_TEMP_DIR.exists():
+ shutil.rmtree(VIDEO_TEMP_DIR)
+VIDEO_TEMP_DIR.mkdir()
+
+parser = argparse.ArgumentParser(description='Multimodal Chatbot with Video Support')
+parser.add_argument('--model-url', type=str, default='http://localhost:8000/v1', help='Model URL')
+parser.add_argument('-m', '--model', type=str, required=True, help='Model name')
+parser.add_argument('--temp', type=float, default=0.8, help='Temperature for generation')
+parser.add_argument('--stop-token-ids', type=str, default='', help='Comma-separated stop token IDs')
+parser.add_argument("--host", type=str, default="127.0.0.1")
+parser.add_argument("--port", type=int, default=8003)
+args = parser.parse_args()
+
+
+client = OpenAI(api_key="EMPTY", base_url=args.model_url)
+
+
+def is_image_file(filename: str) -> bool:
+ image_exts = ['.jpg', '.jpeg', '.png', '.webp', '.bmp']
+ return any(filename.lower().endswith(ext) for ext in image_exts)
+
+def is_video_file(filename: str) -> bool:
+ video_exts = ['.mp4', '.avi', '.mkv', '.mov', '.webm']
+ return any(filename.lower().endswith(ext) for ext in video_exts)
+
+def encode_file_to_base64(filepath: str) -> str:
+ with open(filepath, "rb") as file:
+ return base64.b64encode(file.read()).decode('utf-8')
+
+def extract_frames_from_video(video_path: str, num_frames: int = 10) -> List[str]:
+ try:
+ video = cv2.VideoCapture(video_path)
+ total_frames = int(video.get(cv2.CAP_PROP_FRAME_COUNT))
+ if total_frames <= 0: return []
+
+        # Sample num_frames evenly spaced frame indices across the video
+        frame_indices = [int(total_frames / (num_frames + 1) * (j + 1)) for j in range(num_frames)]
+        temp_files = []
+
+        for frame_index in frame_indices:
+ video.set(cv2.CAP_PROP_POS_FRAMES, frame_index)
+ success, frame = video.read()
+ if success:
+ with tempfile.NamedTemporaryFile(suffix=".jpg", delete=False) as temp_f:
+ cv2.imwrite(temp_f.name, frame)
+ temp_files.append(temp_f.name)
+ video.release()
+ return temp_files
+ except Exception as e:
+ print(f"视频抽帧时发生错误: {e}")
+ return []
+
+def predict(messages: List[Dict[str, Any]]):
+ """
+    Call the model API and stream the response back.
+    Includes error handling for API failures.
+ """
+ try:
+ response = client.chat.completions.create(
+ model=args.model,
+ messages=messages,
+ temperature=args.temp,
+ stream=True,
+ extra_body={
+ "repetition_penalty": 1.0,
+ "stop_token_ids": [int(id) for id in args.stop_token_ids.split(",") if id]
+ }
+ )
+ for chunk in response:
+ if chunk.choices[0].delta.content is not None:
+ yield chunk.choices[0].delta.content, False
+ except APIError as e:
+ error_message = f"抱歉,调用模型时出错: {e.message}"
+ if "longer than the maximum model length" in e.message:
+ error_message = "❌ **输入内容过长** ❌\n\n抱歉,您上传的文本、图片或视频帧的总长度超过了模型的处理上限。请尝试:\n\n- 缩短文字描述\n- 上传尺寸更小的图片\n- 截取更短时间的视频片段"
+
+ yield error_message, True
+
+
+with gr.Blocks(theme=gr.themes.Soft()) as demo:
+ gr.Markdown("# 🎥 Qwen2.5-VL-7B-Instruct Model Serving")
+
+
+ chatbot = gr.Chatbot(height=1200, label="Qwen2.5-VL-7B-Instruct", avatar_images=("👨", "🤖"), render_markdown=True)
+
+ upload_visible = gr.State(False)
+
+
+ def toggle_upload(visible):
+ new_visible = not visible
+ return new_visible, gr.Row(visible=new_visible)
+
+ with gr.Group():
+ with gr.Row(equal_height=True):
+ msg = gr.Textbox(
+ placeholder="输入消息...",
+ show_label=False,
+ container=False,
+ lines=2,
+ max_lines=8,
+ autofocus=True,
+ scale=95
+ )
+ attach_btn = gr.Button("📎", scale=5)
+
+ upload_row = gr.Row(visible=False)
+ with upload_row:
+ file_upload = gr.Files(
+ file_types=["image", "video"],
+ show_label=False,
+ container=False
+ )
+ attach_btn.click(
+ toggle_upload,
+ inputs=upload_visible,
+ outputs=[upload_visible, upload_row],
+ show_progress=False
+ )
+
+
+
+ with gr.Row():
+ submit_btn = gr.Button("🚀 提交", variant="primary")
+ clear_btn = gr.Button("🧹 清空")
+
+ api_history_state = gr.State([])
+
+ def user_and_bot_response(
+ gradio_history: List[Tuple[str, str]],
+ api_history: List[Dict[str, Any]],
+ user_message: str,
+ files: Optional[List[Any]]
+ ):
+ api_user_content = []
+ ui_display_string = ""
+
+ if user_message.strip():
+ api_user_content.append({"type": "text", "text": "用中文回答"+user_message.strip()})
+ ui_display_string += user_message.strip() + "\n\n"
+
+ if files:
+ for file in files:
+ filename = file.name
+
+ if is_image_file(filename):
+ base64_data = encode_file_to_base64(filename)
+ mime_type = f"image/{Path(filename).suffix[1:].lower()}"
+ data_url = f"data:{mime_type};base64,{base64_data}"
+ ui_display_string += f"\n"
+ api_user_content.append({"type": "image_url", "image_url": {"url": data_url}})
+
+ elif is_video_file(filename):
+ unique_filename = f"{uuid4()}{Path(filename).suffix}"
+ new_video_path = VIDEO_TEMP_DIR / unique_filename
+
+ shutil.copyfile(filename, new_video_path)
+ print("Successfully uploaded")
+
+ with open(new_video_path, "rb") as f:
+ base64_data = base64.b64encode(f.read()).decode()
+ ui_display_string += f""""""
+
+ print(ui_display_string)
+
+ frame_paths = extract_frames_from_video(str(new_video_path), num_frames=10)
+ if frame_paths:
+ for frame_path in frame_paths:
+ base64_data = encode_file_to_base64(frame_path)
+ api_user_content.append({"type": "image_url", "image_url": {"url": f"data:image/jpeg;base64,{base64_data}"}})
+ os.unlink(frame_path)
+
+ if not api_user_content:
+ yield gradio_history, api_history
+ return
+ api_history.append({"role": "user", "content": api_user_content})
+ gradio_history.append((ui_display_string, None))
+ yield gradio_history, api_history
+ response_stream = predict(api_history)
+ full_response = ""
+ is_error = False
+ for partial_response, error_flag in response_stream:
+ full_response += partial_response
+ is_error = error_flag
+ gradio_history[-1] = (ui_display_string, full_response)
+ yield gradio_history, api_history
+ if is_error:
+ break
+ if is_error:
+ api_history.pop()
+ else:
+ api_history.append({"role": "assistant", "content": full_response})
+ yield gradio_history, api_history
+
+ def clear_history():
+ return [], []
+
+ submit_btn.click(
+ user_and_bot_response,
+ inputs=[chatbot, api_history_state, msg, file_upload],
+ outputs=[chatbot, api_history_state],
+ queue=True
+ ).then(
+ lambda: (gr.Textbox(value=""), gr.Files(value=None)),
+ None,
+ [msg, file_upload],
+ queue=False
+ )
+
+ clear_btn.click(
+ clear_history,
+ None,
+ [chatbot, api_history_state],
+ queue=True
+ )
+
+if __name__ == "__main__":
+ demo.queue().launch(
+ server_name=args.host,
+ server_port=args.port,
+ share=True,
+ allowed_paths=[str(VIDEO_TEMP_DIR)]
+ )
\ No newline at end of file
diff --git a/vllm/webui/multi-modal-gradio/pic/image.png b/vllm/webui/multi-modal-gradio/pic/image.png
new file mode 100644
index 0000000..25d2319
Binary files /dev/null and b/vllm/webui/multi-modal-gradio/pic/image.png differ