diff --git a/vllm/README.md b/vllm/README.md
index 8c80c85..93e4a6c 100644
--- a/vllm/README.md
+++ b/vllm/README.md
@@ -21,6 +21,7 @@ llm-scaler-vllm is an extended and optimized version of vLLM, specifically adapt
    2.5 [Omni Model Support](#25-omni-model-support)
    2.6 [Data Parallelism (DP)](#26-data-parallelism-dp)
    2.7 [Finding maximum Context Length](#27-finding-maximum-context-length)
+   2.8 [Multi-Modal WebUI](#28-multi-modal-webui)
 3. [Supported Models](#3-supported-models)
 4. [Troubleshooting](#4-troubleshooting)
 5. [Performance tuning](#5-performance-tuning)
@@ -2314,6 +2315,33 @@ In this case, you should adjust the launch command with:
 --max-model-len 114432
 ```
 
+### 2.8 Multi-Modal WebUI
+
+The project provides two interfaces for interacting with Qwen2.5-VL models:
+
+#### 📌 Core Components
+- **Inference Engine**: vLLM (Intel-optimized)
+- **Interfaces**:
+  - Gradio (for rapid prototyping)
+  - ComfyUI (for complex workflows)
+
+#### 🚀 Deployment Options
+
+#### Option 1: Gradio Deployment (Recommended for Most Users)
+- See `/llm-scaler/vllm/webui/multi-modal-gradio/README.md` for deployment details.
+
+#### Option 2: ComfyUI Deployment (Advanced Workflows)
+- See `/llm-scaler/vllm/webui/multi-modal-comfyui/README.md` for deployment details.
+
+#### 🔧 Configuration Guide
+
+| Parameter | Effect | Recommended Value |
+|-----------|--------|-------------------|
+| `--quantization fp8` | FP8 quantization for XPU acceleration | Required |
+| `-tp=2` | Tensor parallelism | Match GPU count |
+| `--max-model-len` | Context window | 32768 (max) |
+
 ---
 
 ## 3. Supported Models
diff --git a/vllm/webui/multi-modal-comfyui/README.md b/vllm/webui/multi-modal-comfyui/README.md
new file mode 100644
index 0000000..63203a3
--- /dev/null
+++ b/vllm/webui/multi-modal-comfyui/README.md
@@ -0,0 +1,284 @@
+# Qwen2.5-VL-3B-Instruct Deployment Guide (ComfyUI + Intel GPU + Linux)
+
+This document provides instructions for deploying the `Qwen2.5-VL-3B-Instruct` multimodal LLM on Linux systems with `Intel GPU` acceleration via a `ComfyUI` workflow.
+
+## 🛠️ Installation Procedure
+
+### 1. Environment Setup
+```bash
+# Install system dependencies
+sudo apt update && sudo apt install -y \
+    git python3-pip python3-venv \
+    ocl-icd-opencl-dev
+
+# Configure Intel GPU drivers (if not present)
+sudo apt install -y \
+    intel-opencl-icd \
+    intel-level-zero-gpu \
+    level-zero
+```
+
+### 2. Conda Environment Configuration
+```bash
+conda create -n comfyqwen python=3.11
+conda activate comfyqwen
+```
+
+### 3. ComfyUI Installation
+```bash
+git clone https://github.com/comfyanonymous/ComfyUI.git
+cd ./ComfyUI
+
+# Install Intel-optimized PyTorch
+pip install torch torchvision torchaudio \
+    --index-url https://download.pytorch.org/whl/xpu
+
+# For nightly builds with potential performance improvements:
+# pip install --pre torch torchvision torchaudio \
+#     --index-url https://download.pytorch.org/whl/nightly/xpu
+
+pip install -r requirements.txt
+```
+
+### 4. Qwen2.5-VL Custom Node Deployment
+```bash
+# Download the custom node definition
+git clone https://github.com/IuvenisSapiens/ComfyUI_Qwen2_5-VL-Instruct
+
+# Move the ComfyUI_Qwen2_5-VL-Instruct folder into ComfyUI's custom_nodes/ directory
+mv ComfyUI_Qwen2_5-VL-Instruct custom_nodes/
+
+# Place the downloaded Qwen2.5-VL-3B-Instruct model folder into models/prompt_generator/
+# (create the prompt_generator subdirectory first if it does not exist)
+mkdir -p models/prompt_generator
+```
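+
+Before moving on, you can optionally confirm that the XPU build of PyTorch can see the GPU (a minimal check, assuming the `comfyqwen` environment from step 2 is active):
+
+```bash
+python -c "import torch; print(torch.__version__, torch.xpu.is_available())"
+```
+
+If you do not yet have the model weights locally, one possible way to fetch them for step 4 is the `huggingface_hub` CLI (run from inside the `ComfyUI` directory; any equivalent download method works):
+
+```bash
+pip install -U "huggingface_hub[cli]"
+mkdir -p models/prompt_generator
+huggingface-cli download Qwen/Qwen2.5-VL-3B-Instruct \
+    --local-dir models/prompt_generator/Qwen2.5-VL-3B-Instruct
+```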
+
+An example workflow, `ComfyUI_Qwen2_5-VL-Instruct_workflow.json`, is reproduced below; it wires a `VideoLoader` node through `MultiplePathsInput` into the `Qwen2_VQA` node and previews the result:
+
+```json
+{
+  "id": "9f2dfc63-3d19-433d-a7c0-49d83464f553",
+  "revision": 0,
+  "last_node_id": 59,
+  "last_link_id": 72,
+  "nodes": [
+    {
+      "id": 56,
+      "type": "Qwen2_VQA",
+      "pos": [199.93017578125, 46.947696685791016],
+      "size": [322.1059265136719, 348],
+      "flags": {},
+      "order": 2,
+      "mode": 0,
+      "inputs": [
+        {"name": "source_path", "shape": 7, "type": "PATH", "link": 70},
+        {"name": "image", "shape": 7, "type": "IMAGE", "link": null}
+      ],
+      "outputs": [
+        {"name": "STRING", "type": "STRING", "slot_index": 0, "links": [72]}
+      ],
+      "properties": {"Node name for S&R": "Qwen2_VQA", "widget_ue_connectable": {}},
+      "widgets_values": [
+        "Describe the video in detail",
+        "Qwen2.5-VL-3B-Instruct",
+        "none",
+        false,
+        0.7,
+        2048,
+        200704,
+        1003520,
+        1444,
+        "randomize",
+        "eager"
+      ]
+    },
+    {
+      "id": 59,
+      "type": "PreviewAny",
+      "pos": [702.7207641601562, 61.4115104675293],
+      "size": [140, 76],
+      "flags": {},
+      "order": 3,
+      "mode": 0,
+      "inputs": [
+        {"name": "source", "type": "*", "link": 72}
+      ],
+      "outputs": [],
+      "properties": {"Node name for S&R": "PreviewAny"},
+      "widgets_values": []
+    },
+    {
+      "id": 58,
+      "type": "VideoLoader",
+      "pos": [-513.0911254882812, 130.9906768798828],
+      "size": [430.6719665527344, 452.4115295410156],
+      "flags": {},
+      "order": 0,
+      "mode": 0,
+      "inputs": [],
+      "outputs": [
+        {"name": "VIDEO", "type": "VIDEO", "links": null},
+        {"name": "PATH", "type": "PATH", "links": [71]}
+      ],
+      "properties": {"Node name for S&R": "VideoLoader", "widget_ue_connectable": {}},
+      "widgets_values": ["19_raw.mp4", "image"]
+    },
+    {
+      "id": 57,
+      "type": "MultiplePathsInput",
+      "pos": [-49.730098724365234, 137.55857849121094],
+      "size": [210, 82],
+      "flags": {},
+      "order": 1,
+      "mode": 0,
+      "inputs": [
+        {"name": "path_1", "type": "PATH", "link": 71}
+      ],
+      "outputs": [
+        {"name": "paths", "type": "PATH", "slot_index": 0, "links": [70]}
+      ],
+      "properties": {"Node name for S&R": "MultiplePathsInput", "widget_ue_connectable": {}},
+      "widgets_values": [1]
+    }
+  ],
+  "links": [
+    [70, 57, 0, 56, 0, "PATH"],
+    [71, 58, 1, 57, 0, "PATH"],
+    [72, 56, 0, 59, 0, "*"]
+  ],
+  "groups": [],
+  "config": {},
+  "extra": {
+    "ds": {
+      "scale": 0.9646149645000006,
+      "offset": [788.9511067206646, 382.6344411516708]
+    },
+    "frontendVersion": "1.24.4",
+    "ue_links": [],
+    "links_added_by_ue": [],
+    "VHS_latentpreview": false,
+    "VHS_latentpreviewrate": 0,
+    "VHS_MetadataImage": true,
+    "VHS_KeepIntermediate": true
+  },
+  "version": 0.4
+}
+```
+
+## 🚀 Launching ComfyUI
+```bash
+python main.py
+```
+Access the web interface at: `http://localhost:8188`
+
+## Post-Installation Configuration
+1. Replace the final node of your workflow with a `Preview Any` node (as in the example workflow above)
+2. Reference model path: `./models/prompt_generator/Qwen2.5-VL-3B-Instruct/`
+
+![Workflow Example](pic/image.png)
+
+## References
+- [ComfyUI GitHub](https://github.com/comfyanonymous/ComfyUI)
+- [Intel PyTorch XPU](https://intel.github.io/intel-extension-for-pytorch/)
+- [Qwen2.5 Model Card](https://huggingface.co/Qwen/Qwen2.5-VL-3B-Instruct)
diff --git a/vllm/webui/multi-modal-comfyui/pic/image.png b/vllm/webui/multi-modal-comfyui/pic/image.png
new file mode 100644
index 0000000..e9817a6
Binary files /dev/null and b/vllm/webui/multi-modal-comfyui/pic/image.png differ
diff --git a/vllm/webui/multi-modal-gradio/README.md b/vllm/webui/multi-modal-gradio/README.md
new file mode 100644
index 0000000..6e3f8c0
--- /dev/null
+++ b/vllm/webui/multi-modal-gradio/README.md
@@ -0,0 +1,89 @@
+# Qwen2.5-VL-7B-Instruct Multimodal Deployment Guide (Intel GPU/Docker/Gradio)
+
+![Intel XPU](https://img.shields.io/badge/Accelerator-Intel%20GPU-green)
+![Docker](https://img.shields.io/badge/Container-Docker-2496ED)
+![Gradio](https://img.shields.io/badge/GUI-Gradio-FF4B4B)
+
+## 📌 Core Components
+
+- **Model**: Qwen2.5-VL-7B-Instruct (vision-language multimodal)
+- **Inference Engine**: vLLM with Intel XPU optimizations
+- **Interface**: Gradio WebUI
+- **Deployment**: Docker container
+
+## 🚀 Quick Deployment
+
+### 1. Launch Docker Container
+```bash
+sudo docker run -td \
+    --privileged \
+    --net=host \
+    --device=/dev/dri \
+    --name=yourcontainername \
+    -v /home/intel/LLM:/llm/models/ \
+    -e no_proxy=localhost,127.0.0.1 \
+    -e http_proxy=$http_proxy \
+    -e https_proxy=$https_proxy \
+    --shm-size="32g" \
+    --entrypoint /bin/bash \
+    intel/llm-scaler-vllm:latest
+```
+
+### 2. Start vLLM Service Inside Container
+```bash
+docker exec -it yourcontainername bash
+```
+```bash
+TORCH_LLM_ALLREDUCE=1 \
+VLLM_USE_V1=1 \
+CCL_ZE_IPC_EXCHANGE=pidfd \
+VLLM_ALLOW_LONG_MAX_MODEL_LEN=1 \
+VLLM_WORKER_MULTIPROC_METHOD=spawn \
+python3 -m vllm.entrypoints.openai.api_server \
+    --model /llm/models/Qwen2.5-VL-7B-Instruct \
+    --dtype=float16 \
+    --device=xpu \
+    --enforce-eager \
+    --port 8000 \
+    --host 0.0.0.0 \
+    --trust-remote-code \
+    --gpu-memory-util=0.9 \
+    --no-enable-prefix-caching \
+    --max-num-batched-tokens=8192 \
+    --disable-log-requests \
+    --max-model-len=32768 \
+    --block-size 64 \
+    --quantization fp8 \
+    -tp=2
+```
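+
+Once the server is up, you can smoke-test the OpenAI-compatible endpoint before wiring up the UI. This is a minimal sketch; the `model` field must match the `--model` path passed to the server above, and the request can be sent from the host because the container uses `--net=host`:
+
+```bash
+curl http://localhost:8000/v1/chat/completions \
+    -H "Content-Type: application/json" \
+    -d '{
+          "model": "/llm/models/Qwen2.5-VL-7B-Instruct",
+          "messages": [{"role": "user", "content": "Describe what you can do in one sentence."}],
+          "max_tokens": 64
+        }'
+```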
+
+### 3. Launch Gradio Interface on Host
+```bash
+conda create -n qwen_gradio python=3.11
+conda activate qwen_gradio
+pip install gradio openai opencv-python
+```
+```bash
+python /llm-scaler/vllm/webui/multi-modal-gradio/main.py --model /llm/models/Qwen2.5-VL-7B-Instruct
+# The model weights must already be present in the mounted models directory,
+# and the --model value must match the path used when starting the vLLM server.
+```
+
+## 🌐 Access Interface
+Open in browser: `http://localhost:8003` (replace `localhost` with the host's IP if you pass `--host 0.0.0.0`)
+
+![Gradio WebUI](pic/image.png)
+
+## 🔧 Key Parameters
+
+### vLLM Server Flags
+| Parameter | Purpose | Recommended Value |
+|-----------|---------|-------------------|
+| `--quantization fp8` | FP8 acceleration | Required |
+| `-tp=2` | Tensor parallelism | Adjust based on GPU count |
+| `--max-model-len` | Context window | 32768 |
+
+## 📜 License
+- Model: [Qwen License](https://huggingface.co/Qwen/Qwen2.5-VL-7B-Instruct)
+- Code: `Apache-2.0`
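+
+For reference, `main.py` (below) exposes a few flags beyond `--model`. For example, when the vLLM server runs on a different machine, an invocation along these lines should work (`<server-ip>` is a placeholder for your server's address):
+
+```bash
+python main.py \
+    --model /llm/models/Qwen2.5-VL-7B-Instruct \
+    --model-url http://<server-ip>:8000/v1 \
+    --host 0.0.0.0 \
+    --port 8003
+```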
diff --git a/vllm/webui/multi-modal-gradio/main.py b/vllm/webui/multi-modal-gradio/main.py
new file mode 100644
index 0000000..e16f225
--- /dev/null
+++ b/vllm/webui/multi-modal-gradio/main.py
@@ -0,0 +1,238 @@
+# SPDX-License-Identifier: Apache-2.0
+import argparse
+import gradio as gr
+from openai import OpenAI, APIError
+from typing import List, Dict, Any, Optional, Tuple
+import os
+import base64
+from pathlib import Path
+import cv2
+import tempfile
+import shutil
+from uuid import uuid4
+
+# Scratch directory for uploaded videos; recreated on every start so Gradio can
+# serve the files (see allowed_paths in launch()).
+VIDEO_TEMP_DIR = Path("gradio_temp_videos")
+if VIDEO_TEMP_DIR.exists():
+    shutil.rmtree(VIDEO_TEMP_DIR)
+VIDEO_TEMP_DIR.mkdir()
+
+parser = argparse.ArgumentParser(description='Multimodal Chatbot with Video Support')
+parser.add_argument('--model-url', type=str, default='http://localhost:8000/v1', help='Model URL')
+parser.add_argument('-m', '--model', type=str, required=True, help='Model name')
+parser.add_argument('--temp', type=float, default=0.8, help='Temperature for generation')
+parser.add_argument('--stop-token-ids', type=str, default='', help='Comma-separated stop token IDs')
+parser.add_argument("--host", type=str, default="127.0.0.1")
+parser.add_argument("--port", type=int, default=8003)
+args = parser.parse_args()
+
+# OpenAI-compatible client pointed at the vLLM server
+client = OpenAI(api_key="EMPTY", base_url=args.model_url)
+
+
+def is_image_file(filename: str) -> bool:
+    image_exts = ['.jpg', '.jpeg', '.png', '.webp', '.bmp']
+    return any(filename.lower().endswith(ext) for ext in image_exts)
+
+
+def is_video_file(filename: str) -> bool:
+    video_exts = ['.mp4', '.avi', '.mkv', '.mov', '.webm']
+    return any(filename.lower().endswith(ext) for ext in video_exts)
+
+
+def encode_file_to_base64(filepath: str) -> str:
+    with open(filepath, "rb") as file:
+        return base64.b64encode(file.read()).decode('utf-8')
+
+
+def extract_frames_from_video(video_path: str, num_frames: int = 10) -> List[str]:
+    """Extract num_frames evenly spaced frames and return their temp-file paths."""
+    try:
+        video = cv2.VideoCapture(video_path)
+        total_frames = int(video.get(cv2.CAP_PROP_FRAME_COUNT))
+        if total_frames <= 0:
+            return []
+
+        # Evenly spaced frame indices across the whole video
+        frame_indices = [int(i) for i in (total_frames / (num_frames + 1) * (j + 1) for j in range(num_frames))]
+        temp_files = []
+
+        for frame_index in frame_indices:
+            video.set(cv2.CAP_PROP_POS_FRAMES, frame_index)
+            success, frame = video.read()
+            if success:
+                with tempfile.NamedTemporaryFile(suffix=".jpg", delete=False) as temp_f:
+                    cv2.imwrite(temp_f.name, frame)
+                    temp_files.append(temp_f.name)
+        video.release()
+        return temp_files
+    except Exception as e:
+        print(f"Error while extracting video frames: {e}")
+        return []
+
+
+def predict(messages: List[Dict[str, Any]]):
+    """
+    Call the model API and stream the response back.
+    API errors are caught and surfaced to the UI instead of raising.
+    """
+    try:
+        response = client.chat.completions.create(
+            model=args.model,
+            messages=messages,
+            temperature=args.temp,
+            stream=True,
+            extra_body={
+                "repetition_penalty": 1.0,
+                "stop_token_ids": [int(id) for id in args.stop_token_ids.split(",") if id]
+            }
+        )
+        for chunk in response:
+            if chunk.choices[0].delta.content is not None:
+                yield chunk.choices[0].delta.content, False
+    except APIError as e:
+        error_message = f"Sorry, the model call failed: {e.message}"
+        if "longer than the maximum model length" in e.message:
+            error_message = (
+                "❌ **Input too long** ❌\n\n"
+                "The combined length of your text, images, or extracted video frames "
+                "exceeds the model's context limit. Please try:\n\n"
+                "- shortening the text\n"
+                "- uploading smaller images\n"
+                "- using a shorter video clip"
+            )
+        yield error_message, True
+
+
+with gr.Blocks(theme=gr.themes.Soft()) as demo:
+    gr.Markdown("# 🎥 Qwen2.5-VL-7B-Instruct Model Serving")
+
+    chatbot = gr.Chatbot(height=1200, label="Qwen2.5-VL-7B-Instruct", avatar_images=("👨", "🤖"), render_markdown=True)
+
+    upload_visible = gr.State(False)
+
+    def toggle_upload(visible):
+        new_visible = not visible
+        return new_visible, gr.Row(visible=new_visible)
+
+    with gr.Group():
+        with gr.Row(equal_height=True):
+            msg = gr.Textbox(
+                placeholder="Type a message...",
+                show_label=False,
+                container=False,
+                lines=2,
+                max_lines=8,
+                autofocus=True,
+                scale=95
+            )
+            attach_btn = gr.Button("📎", scale=5)
+
+    upload_row = gr.Row(visible=False)
+    with upload_row:
+        file_upload = gr.Files(
+            file_types=["image", "video"],
+            show_label=False,
+            container=False
+        )
+    attach_btn.click(
+        toggle_upload,
+        inputs=upload_visible,
+        outputs=[upload_visible, upload_row],
+        show_progress=False
+    )
+
+    with gr.Row():
+        submit_btn = gr.Button("🚀 Submit", variant="primary")
+        clear_btn = gr.Button("🧹 Clear")
+
+    api_history_state = gr.State([])
+
+    def user_and_bot_response(
+        gradio_history: List[Tuple[str, str]],
+        api_history: List[Dict[str, Any]],
+        user_message: str,
+        files: Optional[List[Any]]
+    ):
+        api_user_content = []
+        ui_display_string = ""
+
+        if user_message.strip():
+            # NOTE: "用中文回答" asks the model to answer in Chinese; remove the prefix for English replies.
+            api_user_content.append({"type": "text", "text": "用中文回答" + user_message.strip()})
+            ui_display_string += user_message.strip() + "\n\n"
+
+        if files:
+            for file in files:
+                filename = file.name
+
+                if is_image_file(filename):
+                    base64_data = encode_file_to_base64(filename)
+                    mime_type = f"image/{Path(filename).suffix[1:].lower()}"
+                    data_url = f"data:{mime_type};base64,{base64_data}"
+                    ui_display_string += f"![{os.path.basename(filename)}]({data_url})\n"
+                    api_user_content.append({"type": "image_url", "image_url": {"url": data_url}})
+
+                elif is_video_file(filename):
+                    unique_filename = f"{uuid4()}{Path(filename).suffix}"
+                    new_video_path = VIDEO_TEMP_DIR / unique_filename
+
+                    shutil.copyfile(filename, new_video_path)
+                    print("Successfully uploaded")
+
+                    with open(new_video_path, "rb") as f:
+                        base64_data = base64.b64encode(f.read()).decode()
+                    # NOTE: the inline markup for previewing the uploaded video in the chat
+                    # window appears to have been stripped from this file; nothing is added
+                    # to ui_display_string here and the base64 data above is currently unused.
+                    ui_display_string += f""""""
+
+                    print(ui_display_string)
+
+                    frame_paths = extract_frames_from_video(str(new_video_path), num_frames=10)
+                    if frame_paths:
+                        for frame_path in frame_paths:
+                            base64_data = encode_file_to_base64(frame_path)
+                            api_user_content.append({"type": "image_url", "image_url": {"url": f"data:image/jpeg;base64,{base64_data}"}})
+                            os.unlink(frame_path)
+
+        if not api_user_content:
+            yield gradio_history, api_history
+            return
+        api_history.append({"role": "user", "content": api_user_content})
+        gradio_history.append((ui_display_string, None))
+        yield gradio_history, api_history
+        response_stream = predict(api_history)
+        full_response = ""
+        is_error = False
+        for partial_response, error_flag in response_stream:
+            full_response += partial_response
+            is_error = error_flag
+            gradio_history[-1] = (ui_display_string, full_response)
+            yield gradio_history, api_history
+            if is_error:
+                break
+        if is_error:
+            # Drop the failed user turn so it is not resent on the next request
+            api_history.pop()
+        else:
+            api_history.append({"role": "assistant", "content": full_response})
+        yield gradio_history, api_history
+
+    def clear_history():
+        return [], []
+
+    submit_btn.click(
+        user_and_bot_response,
+        inputs=[chatbot, api_history_state, msg, file_upload],
+        outputs=[chatbot, api_history_state],
+        queue=True
+    ).then(
+        lambda: (gr.Textbox(value=""), gr.Files(value=None)),
+        None,
+        [msg, file_upload],
+        queue=False
+    )
+
+    clear_btn.click(
+        clear_history,
+        None,
+        [chatbot, api_history_state],
+        queue=True
+    )
+
+if __name__ == "__main__":
+    demo.queue().launch(
+        server_name=args.host,
+        server_port=args.port,
+        share=True,
+        allowed_paths=[str(VIDEO_TEMP_DIR)]
+    )
\ No newline at end of file
diff --git a/vllm/webui/multi-modal-gradio/pic/image.png b/vllm/webui/multi-modal-gradio/pic/image.png
new file mode 100644
index 0000000..25d2319
Binary files /dev/null and b/vllm/webui/multi-modal-gradio/pic/image.png differ