From 2f62bc2879bf53f50b58708fb06d295e8ac603c9 Mon Sep 17 00:00:00 2001 From: liu-shaojun Date: Thu, 19 Jun 2025 16:13:48 +0800 Subject: [PATCH] feat: initial commit with Dockerfile, patches, and tools - Add Dockerfile for containerized environment setup - Include initial patches directory for source modifications - Add tools for build, test, or deployment workflows --- vllm/docker/Dockerfile | 107 + .../0001-oneccl-align-global-V0.1.1.patch | 125 + ...api-samples-enable-correctness-check.patch | 118 + vllm/patches/vllm_for_multi_arc.patch | 5782 +++++++++++++++++ vllm/tools/download_tool.sh | 5 + vllm/tools/native_bkc_setup.sh | 149 + 6 files changed, 6286 insertions(+) create mode 100644 vllm/docker/Dockerfile create mode 100644 vllm/patches/0001-oneccl-align-global-V0.1.1.patch create mode 100644 vllm/patches/oneapi-samples-enable-correctness-check.patch create mode 100644 vllm/patches/vllm_for_multi_arc.patch create mode 100644 vllm/tools/download_tool.sh create mode 100644 vllm/tools/native_bkc_setup.sh diff --git a/vllm/docker/Dockerfile b/vllm/docker/Dockerfile new file mode 100644 index 0000000..15c1f08 --- /dev/null +++ b/vllm/docker/Dockerfile @@ -0,0 +1,107 @@ +# Copyright (C) 2025 Intel Corporation +# SPDX-License-Identifier: Apache-2.0 + +# ======== Base Stage ======== +FROM intel/deep-learning-essentials:2025.0.2-0-devel-ubuntu24.04 AS vllm-base + +ARG https_proxy +ARG http_proxy + +# Add Intel oneAPI repo and PPA for GPU support +RUN wget -O- https://apt.repos.intel.com/intel-gpg-keys/GPG-PUB-KEY-INTEL-SW-PRODUCTS.PUB | gpg --dearmor | tee /usr/share/keyrings/oneapi-archive-keyring.gpg > /dev/null && \ + echo "deb [signed-by=/usr/share/keyrings/oneapi-archive-keyring.gpg] https://apt.repos.intel.com/oneapi all main" | tee /etc/apt/sources.list.d/oneAPI.list && \ + add-apt-repository -y ppa:kobuk-team/intel-graphics-testing + +# Install dependencies and Python 3.10 +RUN apt-get update -y && \ + apt-get install -y software-properties-common && \ + add-apt-repository ppa:deadsnakes/ppa && \ + apt-get update -y && \ + apt-get install -y python3.10 python3.10-distutils && \ + curl -sS https://bootstrap.pypa.io/get-pip.py | python3.10 && \ + apt-get install -y --no-install-recommends --fix-missing \ + curl \ + ffmpeg \ + git \ + libsndfile1 \ + libsm6 \ + libxext6 \ + libgl1 \ + lsb-release \ + numactl \ + wget \ + vim \ + linux-libc-dev && \ + update-alternatives --install /usr/bin/python3 python3 /usr/bin/python3.10 1 && \ + # Install Intel GPU runtime packages + apt-get update -y && \ + apt-get install -y libze1 libze-dev libze-intel-gpu1 intel-opencl-icd libze-intel-gpu-raytracing && \ + apt-get clean && rm -rf /var/lib/apt/lists/* + +WORKDIR /llm +COPY ./patches/vllm_for_multi_arc.patch /tmp/ +COPY ./patches/0001-oneccl-align-global-V0.1.1.patch /tmp/ + +# Set environment variables early +ENV LD_LIBRARY_PATH="$LD_LIBRARY_PATH:/usr/local/lib/" +ENV VLLM_TARGET_DEVICE=xpu +ENV VLLM_WORKER_MULTIPROC_METHOD=spawn + +# Clone + patch vllm +RUN git clone -b v0.8.3 https://github.com/vllm-project/vllm.git && \ + cd vllm && \ + git apply /tmp/vllm_for_multi_arc.patch && \ + pip install --no-cache-dir -r requirements/xpu.txt && \ + python3 setup.py install + +# ======= Add oneCCL build ======= +RUN apt-get update && apt-get install -y \ + cmake \ + g++ \ + && rm -rf /var/lib/apt/lists/* + +# Build 1ccl +RUN git clone https://github.com/oneapi-src/oneCCL.git && \ + cd oneCCL && \ + git checkout def870543749186b6f38cdc865b44d52174c7492 && \ + git apply 
/tmp/0001-oneccl-align-global-V0.1.1.patch && \ + mkdir build && cd build && \ + export IGC_VISAOptions=-activeThreadsOnlyBarrier && \ + /usr/bin/cmake .. \ + -DCMAKE_INSTALL_PREFIX=_install \ + -DCMAKE_C_COMPILER=icx \ + -DCMAKE_CXX_COMPILER=icpx \ + -DCOMPUTE_BACKEND=dpcpp \ + -DCCL_ENABLE_ARCB=1 && \ + make -j && make install && \ + mv _install /opt/intel/oneapi/ccl/2021.15.3 && \ + cd /opt/intel/oneapi/ccl/ && \ + ln -snf 2021.15.3 latest && \ + source /opt/intel/oneapi/setvars.sh --force + +WORKDIR /llm/vllm + +# Cleanup patch file +RUN rm -rf /tmp/* + +CMD ["/bin/bash"] + + + +# ======== OpenAI Serving Stage ======== +FROM vllm-base AS vllm-openai + +ARG http_proxy +ARG https_proxy + +# install additional dependencies for openai api server +RUN --mount=type=cache,target=/root/.cache/pip \ + pip install accelerate hf_transfer 'modelscope!=1.15.0' + +# Set additional environment for production usage +ENV VLLM_USAGE_SOURCE=production-docker-image +ENV TRITON_XPU_PROFILE=1 + +# install development dependencies (for testing) +RUN python3 -m pip install -e tests/vllm_test_utils +ENTRYPOINT ["python3", "-m", "vllm.entrypoints.openai.api_server"] \ No newline at end of file diff --git a/vllm/patches/0001-oneccl-align-global-V0.1.1.patch b/vllm/patches/0001-oneccl-align-global-V0.1.1.patch new file mode 100644 index 0000000..8f8a987 --- /dev/null +++ b/vllm/patches/0001-oneccl-align-global-V0.1.1.patch @@ -0,0 +1,125 @@ +From 7f7a3d65541828d9889bfdec799bc23339e8e520 Mon Sep 17 00:00:00 2001 +From: YongZhuIntel +Date: Wed, 21 May 2025 09:37:06 +0800 +Subject: [PATCH] oneccl align global V0.1.1 + +base on public branch release/ccl_2021.15.3-arc(def870543749186b6f38cdc865b44d52174c7492) + +Build: + 1. mkdir build; cd build + 2. source /opt/intel/oneapi/setvars.sh + 3. export IGC_VISAOptions=-activeThreadsOnlyBarrier + 4. cmake .. 
-DCMAKE_INSTALL_PREFIX=_install -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx -DCOMPUTE_BACKEND=dpcpp -DCCL_ENABLE_ARCB=1 && make -j && make install + +print bandwidth in benchmark +--- + examples/benchmark/include/benchmark.hpp | 40 +++++++++++++++++++++--- + examples/benchmark/src/benchmark.cpp | 7 +++-- + 2 files changed, 41 insertions(+), 6 deletions(-) + +diff --git a/examples/benchmark/include/benchmark.hpp b/examples/benchmark/include/benchmark.hpp +index 08a3625..bff6275 100644 +--- a/examples/benchmark/include/benchmark.hpp ++++ b/examples/benchmark/include/benchmark.hpp +@@ -377,7 +377,9 @@ void store_to_csv(const user_options_t& options, + double max_time, + double avg_time, + double stddev, +- double wait_avg_time) { ++ double wait_avg_time, ++ double algbw, ++ double busbw) { + std::ofstream csvf; + csvf.open(options.csv_filepath, std::ofstream::out | std::ofstream::app); + +@@ -396,7 +398,7 @@ void store_to_csv(const user_options_t& options, + << "," << ccl::get_datatype_size(dtype) << "," << elem_count << "," + << ccl::get_datatype_size(dtype) * elem_count << "," << buf_count << "," + << iter_count << "," << min_time << "," << max_time << "," << avg_time << "," +- << stddev << "," << wait_avg_time << std::endl; ++ << stddev << "," << wait_avg_time << "," << algbw << "," << busbw << std::endl; + } + csvf.close(); + } +@@ -472,13 +474,41 @@ void print_timings(const ccl::communicator& comm, + max_time /= iter_count; + + size_t bytes = elem_count * ccl::get_datatype_size(dtype) * buf_count; ++ ++ double algbw = bytes*1000/total_avg_time/1024/1024; ++ ++ if (ncolls == 1) { ++ if (options.coll_names.front() == "allgather" || ++ options.coll_names.front() == "allgatherv" || ++ options.coll_names.front() == "reducescatter" || ++ options.coll_names.front() == "alltoall" || ++ options.coll_names.front() == "alltoallv") { ++ algbw = algbw * nranks; ++ } ++ } ++ ++ double busbw = algbw; ++ if (ncolls == 1) { ++ if (options.coll_names.front() == "allreduce") { ++ busbw = algbw * 2 * (nranks -1) / nranks; ++ } else if (options.coll_names.front() == "allgather" || ++ options.coll_names.front() == "allgatherv" || ++ options.coll_names.front() == "reducescatter" || ++ options.coll_names.front() == "alltoall" || ++ options.coll_names.front() == "alltoallv") { ++ busbw = algbw * (nranks -1) / nranks; ++ } ++ } ++ + std::stringstream ss; + ss << std::right << std::fixed << std::setw(COL_WIDTH) << bytes << std::setw(COL_WIDTH) + << elem_count * buf_count << std::setw(COL_WIDTH) << iter_count << std::setw(COL_WIDTH) + << std::setprecision(COL_PRECISION) << min_time << std::setw(COL_WIDTH) + << std::setprecision(COL_PRECISION) << max_time << std::setw(COL_WIDTH) + << std::setprecision(COL_PRECISION) << total_avg_time << std::setw(COL_WIDTH - 3) +- << std::setprecision(COL_PRECISION) << stddev << std::setw(COL_WIDTH + 3); ++ << std::setprecision(COL_PRECISION) << stddev << std::setw(COL_WIDTH) ++ << std::setprecision(COL_PRECISION) << algbw << std::setw(COL_WIDTH) ++ << std::setprecision(COL_PRECISION) << busbw << std::setw(COL_WIDTH + 3); + + if (show_extened_info(options.show_additional_info)) { + ss << std::right << std::fixed << std::setprecision(COL_PRECISION) << wait_avg_time; +@@ -497,7 +527,9 @@ void print_timings(const ccl::communicator& comm, + max_time, + total_avg_time, + stddev, +- wait_avg_time); ++ wait_avg_time, ++ algbw, ++ busbw); + } + } + +diff --git a/examples/benchmark/src/benchmark.cpp b/examples/benchmark/src/benchmark.cpp +index d90fb9b..78957f2 100644 +--- 
a/examples/benchmark/src/benchmark.cpp ++++ b/examples/benchmark/src/benchmark.cpp +@@ -105,7 +105,8 @@ void run(ccl::communicator& service_comm, + << "#elem_count" << std::setw(COL_WIDTH) << "#repetitions" + << std::setw(COL_WIDTH) << "t_min[usec]" << std::setw(COL_WIDTH) << "t_max[usec]" + << std::setw(COL_WIDTH) << "t_avg[usec]" << std::setw(COL_WIDTH - 3) +- << "stddev[%]"; ++ << "stddev[%]" << std::setw(COL_WIDTH) << "algbw[GB/s]" << std::setw(COL_WIDTH) ++ << "busbw[GB/s]"; + + if (show_extened_info(options.show_additional_info)) { + ss << std::right << std::setw(COL_WIDTH + 3) << "wait_t_avg[usec]"; +@@ -435,7 +436,9 @@ int main(int argc, char* argv[]) { + << "t_max[usec]," + << "t_avg[usec]," + << "stddev[%]," +- << "wait_t_avg[usec]" << std::endl; ++ << "wait_t_avg[usec]," ++ << "algbw[GB/s]," ++ << "busbw[GB/s]" << std::endl; + csvf.close(); + } + +-- +2.25.1 + diff --git a/vllm/patches/oneapi-samples-enable-correctness-check.patch b/vllm/patches/oneapi-samples-enable-correctness-check.patch new file mode 100644 index 0000000..0bdaf0a --- /dev/null +++ b/vllm/patches/oneapi-samples-enable-correctness-check.patch @@ -0,0 +1,118 @@ +From 592d83da1e8d00b4436b51ccf17e7d0bc9564e24 Mon Sep 17 00:00:00 2001 +From: jilongW <109333127+jilongW@users.noreply.github.com> +Date: Fri, 6 Jun 2025 14:27:43 +0800 +Subject: [PATCH] add compare (#1) + +* add comparision + +* update compare + +* remove extra file +--- + .../oneMKL/matrix_mul_mkl/matrix_mul_mkl.cpp | 66 ++++++++++++++++--- + 1 file changed, 56 insertions(+), 10 deletions(-) + +diff --git a/Libraries/oneMKL/matrix_mul_mkl/matrix_mul_mkl.cpp b/Libraries/oneMKL/matrix_mul_mkl/matrix_mul_mkl.cpp +index 427505b7..989c00ad 100644 +--- a/Libraries/oneMKL/matrix_mul_mkl/matrix_mul_mkl.cpp ++++ b/Libraries/oneMKL/matrix_mul_mkl/matrix_mul_mkl.cpp +@@ -41,17 +41,55 @@ bool test(queue &Q, int M, int N, int K) + constexpr int rd_size = 1048576; + std::vector host_vector(rd_size); + auto host_data = host_vector.data(); +- ++ std::vector correct_host_vector(rd_size); ++ auto correct_host_data = correct_host_vector.data(); + /* Measure time for a given number of GEMM calls */ +- auto time_gemms = [=, &Q](int runs) -> double { ++ bool verify = false; ++ auto time_gemms = [=, &Q, &host_data](int runs, bool verify=false) -> std::tuple { + using namespace oneapi::mkl; + using namespace std::chrono; + auto start = steady_clock::now(); +- for (int i = 0; i < runs; i++) ++ int ok = 0; ++ if (verify == false){ ++ for (int i = 0; i < runs; i++) ++ blas::gemm(Q, transpose::N, transpose::N, M, N, K, 1, A, lda, B, ldb, 0, C, ldc); ++ Q.wait_and_throw(); ++ auto end = steady_clock::now(); ++ return std::make_tuple(duration(end - start).count(), ok); ++ } ++ else{ ++ size_t elems = std::min(ldc * N, rd_size); ++ + blas::gemm(Q, transpose::N, transpose::N, M, N, K, 1, A, lda, B, ldb, 0, C, ldc); +- Q.wait_and_throw(); +- auto end = steady_clock::now(); +- return duration(end - start).count(); ++ Q.wait_and_throw(); ++ Q.copy(C, correct_host_data, elems).wait(); ++ auto end = steady_clock::now(); ++ auto used_time = duration(end - start).count(); ++ ++ // correct_host_data[0] += 1.0; ++ for (int i = 1; i < runs; i++){ ++ start = steady_clock::now(); ++ blas::gemm(Q, transpose::N, transpose::N, M, N, K, 1, A, lda, B, ldb, 0, C, ldc); ++ Q.wait_and_throw(); ++ end = steady_clock::now(); ++ used_time += duration(end - start).count(); ++ Q.copy(C, host_data, elems).wait(); ++ int linear_id = 0; ++ for (size_t j = 0; j < N; j++) { ++ for (size_t k = 0; k < M; k++) { ++ 
linear_id = j*ldc + k; ++ if (linear_id >= elems) break; ++ if (host_data[linear_id] != correct_host_data[linear_id]) { ++ ok = i; ++ return std::make_tuple(duration(end - start).count(), ok); ++ } ++ } ++ if (linear_id >= elems) break; ++ } ++ ++ } ++ return std::make_tuple(used_time, ok); ++ } + }; + + /* Fill A/B with all ones to verify correctness */ +@@ -91,13 +129,15 @@ bool test(queue &Q, int M, int N, int K) + + /* Time one GEMM call, and estimate how many calls will be required to keep the + * GPU busy for 1s. */ +- auto tare = time_gemms(1); ++ auto [tare, _] = time_gemms(1, true); + int ncalls = std::max(4, std::min(1000, int(1. / tare))); + + /* Time that many GEMMs, subtracting the first call time to remove host overhead. + * This gives a better idea of device performance. */ + std::cout << " -> Timing...\n"; +- auto time = time_gemms(ncalls + 1) - tare; ++ auto [time, result] = time_gemms(ncalls + 1, true); ++ time -= tare; ++ + auto avg = time / ncalls; + + /* Calculate and display performance */ +@@ -114,8 +154,14 @@ bool test(queue &Q, int M, int N, int K) + flops *= 1e-3; + unit = 'P'; + } +- +- std::cout << "\nAverage performance: " << flops << unit << 'F' << "\n"; ++ if (result != 0){ ++ std::cout << "gemm FAILS" << " for type: " << type_string() << " on " << result <<" times run!"<< "\n"; ++ } ++ else{ ++ std::cout << "gemm Passes" << " for type: " << type_string() << "!\n"; ++ std::cout << "\nAverage performance: " << flops << unit << 'F' << "\n"; ++ } ++ + + /* Free data */ + free(C, Q); +-- +2.34.1 + diff --git a/vllm/patches/vllm_for_multi_arc.patch b/vllm/patches/vllm_for_multi_arc.patch new file mode 100644 index 0000000..2f47b4e --- /dev/null +++ b/vllm/patches/vllm_for_multi_arc.patch @@ -0,0 +1,5782 @@ +diff --git a/.buildkite/lm-eval-harness/run-lm-eval-gsm-vllm-baseline.sh b/.buildkite/lm-eval-harness/run-lm-eval-gsm-vllm-baseline.sh +index 65be3c5d9..7e4808151 100644 +--- a/.buildkite/lm-eval-harness/run-lm-eval-gsm-vllm-baseline.sh ++++ b/.buildkite/lm-eval-harness/run-lm-eval-gsm-vllm-baseline.sh +@@ -46,6 +46,6 @@ while getopts "m:b:l:f:t:" OPT; do + done + + lm_eval --model vllm \ +- --model_args "pretrained=$MODEL,tensor_parallel_size=$TP_SIZE,distributed_executor_backend=ray,trust_remote_code=true,max_model_len=4096" \ ++ --model_args "pretrained=$MODEL,tensor_parallel_size=$TP_SIZE,distributed_executor_backend=mp,trust_remote_code=true,max_model_len=4096,enforce_eager=true,max_num_batched_tokens=4096" \ + --tasks gsm8k --num_fewshot "$FEWSHOT" --limit "$LIMIT" \ + --batch_size "$BATCH_SIZE" +diff --git a/.buildkite/scripts/hardware_ci/run-xpu-test.sh b/.buildkite/scripts/hardware_ci/run-xpu-test.sh +index f54010c42..827649bfc 100644 +--- a/.buildkite/scripts/hardware_ci/run-xpu-test.sh ++++ b/.buildkite/scripts/hardware_ci/run-xpu-test.sh +@@ -28,4 +28,5 @@ docker run \ + sh -c ' + VLLM_USE_V1=0 python3 examples/offline_inference/basic/generate.py --model facebook/opt-125m + VLLM_USE_V1=0 python3 examples/offline_inference/basic/generate.py --model facebook/opt-125m -tp 2 ++ VLLM_USE_V1=1 python3 examples/offline_inference/basic/generate.py --model facebook/opt-125m --block-size 64 --enforce-eager + ' +diff --git a/benchmarks/benchmark_serving.py b/benchmarks/benchmark_serving.py +index 59648222e..fd3ce6505 100644 +--- a/benchmarks/benchmark_serving.py ++++ b/benchmarks/benchmark_serving.py +@@ -267,10 +267,10 @@ async def benchmark( + raise ValueError(f"Unknown backend: {backend}") + + print("Starting initial single prompt test run...") ++ # set 
test_output_len=10 to avoid long prompt test run + test_prompt, test_prompt_len, test_output_len, test_mm_content = \ + input_requests[0].prompt, input_requests[0].prompt_len, \ +- input_requests[0].expected_output_len, \ +- input_requests[0].multi_modal_data ++ 10, input_requests[0].multi_modal_data + + if backend != "openai-chat" and test_mm_content is not None: + # multi-modal benchmark is only available on OpenAI Chat backend. +diff --git a/benchmarks/benchmark_throughput.py b/benchmarks/benchmark_throughput.py +index d0d7dfa1d..5f2f730c2 100644 +--- a/benchmarks/benchmark_throughput.py ++++ b/benchmarks/benchmark_throughput.py +@@ -35,6 +35,7 @@ def run_vllm( + n: int, + engine_args: EngineArgs, + disable_detokenize: bool = False, ++ do_profile: bool = False, + ) -> tuple[float, Optional[list[RequestOutput]]]: + from vllm import LLM, SamplingParams + llm = LLM(**dataclasses.asdict(engine_args)) +@@ -72,11 +73,15 @@ def run_vllm( + outputs = None + if not use_beam_search: + start = time.perf_counter() ++ if do_profile: ++ llm.start_profile() + outputs = llm.generate(prompts, + sampling_params, + lora_request=lora_requests, + use_tqdm=True) + end = time.perf_counter() ++ if do_profile: ++ llm.stop_profile() + else: + assert lora_requests is None, "BeamSearch API does not support LoRA" + prompts = [request.prompt for request in requests] +@@ -368,7 +373,7 @@ def main(args: argparse.Namespace): + else: + elapsed_time, request_outputs = run_vllm( + requests, args.n, EngineArgs.from_cli_args(args), +- args.disable_detokenize) ++ args.disable_detokenize, args.profile) + elif args.backend == "hf": + assert args.tensor_parallel_size == 1 + elapsed_time = run_hf(requests, args.model, tokenizer, args.n, +@@ -562,6 +567,10 @@ if __name__ == "__main__": + type=int, + default=1000, + help="Number of prompts to process.") ++ parser.add_argument("--profile", ++ action='store_true', ++ default=False, ++ help="whether run with profiler.") + parser.add_argument("--hf-max-batch-size", + type=int, + default=None, +diff --git a/docker/Dockerfile.xpu b/docker/Dockerfile.xpu +index ad4abf16b..80cce43e4 100644 +--- a/docker/Dockerfile.xpu ++++ b/docker/Dockerfile.xpu +@@ -1,9 +1,14 @@ +-# oneapi 2025.0.2 docker base image use rolling 2448 package. https://dgpu-docs.intel.com/releases/packages.html?release=Rolling+2448.13&os=Ubuntu+22.04, and we don't need install driver manually. 
+-FROM intel/deep-learning-essentials:2025.0.2-0-devel-ubuntu22.04 AS vllm-base ++FROM intel/deep-learning-essentials:2025.0.2-0-devel-ubuntu24.04 AS vllm-base + +-RUN rm /etc/apt/sources.list.d/intel-graphics.list ++RUN wget -O- https://apt.repos.intel.com/intel-gpg-keys/GPG-PUB-KEY-INTEL-SW-PRODUCTS.PUB | gpg --dearmor | tee /usr/share/keyrings/oneapi-archive-keyring.gpg > /dev/null && \ ++ echo "deb [signed-by=/usr/share/keyrings/oneapi-archive-keyring.gpg] https://apt.repos.intel.com/oneapi all main" | tee /etc/apt/sources.list.d/oneAPI.list && \ ++ add-apt-repository -y ppa:kobuk-team/intel-graphics-testing + + RUN apt-get update -y && \ ++ apt-get install -y software-properties-common && \ ++ add-apt-repository ppa:deadsnakes/ppa && \ ++ apt-get install -y python3.10 python3.10-distutils && \ ++ curl -sS https://bootstrap.pypa.io/get-pip.py | python3.10 && \ + apt-get install -y --no-install-recommends --fix-missing \ + curl \ + ffmpeg \ +@@ -14,11 +19,16 @@ RUN apt-get update -y && \ + libgl1 \ + lsb-release \ + numactl \ +- python3 \ +- python3-dev \ +- python3-pip \ + wget + ++RUN update-alternatives --install /usr/bin/python3 python3 /usr/bin/python3.10 1 ++ ++RUN apt install -y libze1 libze-dev libze-intel-gpu1 intel-opencl-icd libze-intel-gpu-raytracing ++RUN wget https://github.com/uxlfoundation/oneCCL/releases/download/2021.15.3/intel-oneccl-2021.15.3.11_offline.sh ++RUN bash intel-oneccl-2021.15.3.11_offline.sh -a --silent --eula accept && echo "source /opt/intel/oneapi/setvars.sh --force" >> /root/.bashrc ++SHELL ["bash", "-c"] ++CMD ["bash", "-c", "source /root/.bashrc && exec bash"] ++ + WORKDIR /workspace/vllm + COPY requirements/xpu.txt /workspace/vllm/requirements/xpu.txt + COPY requirements/common.txt /workspace/vllm/requirements/common.txt +@@ -35,17 +45,12 @@ RUN --mount=type=bind,source=.git,target=.git \ + if [ "$GIT_REPO_CHECK" != 0 ]; then bash tools/check_repo.sh; fi + + ENV VLLM_TARGET_DEVICE=xpu ++ENV VLLM_WORKER_MULTIPROC_METHOD=spawn + + RUN --mount=type=cache,target=/root/.cache/pip \ + --mount=type=bind,source=.git,target=.git \ + python3 setup.py install + +-# Please refer xpu doc, we need manually install intel-extension-for-pytorch 2.6.10+xpu due to there are some conflict dependencies with torch 2.6.0+xpu +-# FIXME: This will be fix in ipex 2.7. just leave this here for awareness. +-RUN --mount=type=cache,target=/root/.cache/pip \ +- pip install intel-extension-for-pytorch==2.6.10+xpu \ +- --extra-index-url=https://pytorch-extension.intel.com/release-whl/stable/xpu/us/ +- + CMD ["/bin/bash"] + + FROM vllm-base AS vllm-openai +diff --git a/docs/source/design/v1/prefix_caching.md b/docs/source/design/v1/prefix_caching.md +index ec1f3cb8d..ec661d8ec 100644 +--- a/docs/source/design/v1/prefix_caching.md ++++ b/docs/source/design/v1/prefix_caching.md +@@ -16,7 +16,7 @@ In the example above, the KV cache in the first block can be uniquely identified + + * Parent hash value: The hash value of the parent hash block. + * Block tokens: A tuple of tokens in this block. The reason to include the exact tokens is to reduce potential hash value collision. +-* Extra hashes: Other values required to make this block unique, such as LoRA IDs and multi-modality input hashes (see the example below). ++* Extra hashes: Other values required to make this block unique, such as LoRA IDs, multi-modality input hashes (see the example below), and cache salts to isolate caches in multi-tenant environments. + + > **Note 1:** We only cache full blocks. 
+ +@@ -76,6 +76,24 @@ Block 3 + + In the rest of this document, we first introduce the data structure used for prefix caching in vLLM v1, followed by the prefix caching workflow of major KV cache operators (e.g., allocate, append, free, eviction). Finally, we use an example to illustrate the end to end prefix caching workflow. + ++**Cache Isolation for Security** ++To improve privacy in shared environments, vLLM supports isolating prefix cache reuse through optional per-request salting. By including a `cache_salt` in the request, this value is injected into the hash of the first block, ensuring that only requests with the same salt can reuse cached KV blocks. This prevents timing-based attacks where an adversary could infer cached content by observing latency differences. This offers protection without compromising performance. ++ ++```json ++{ ++ "messages": [ ++ {"role": "system", "content": "You are a helpful assistant."}, ++ {"role": "user", "content": "Here is a document with details about the world series: ..."}, ++ {"role": "user", "content": "Who won the world series in 2020?"} ++ ], ++ "cache_salt": "Z3V2bmV3aGxza3ZubGFoZ3Zud3V3ZWZ2bmd0b3V2bnZmc2xpZ3RoZ2x2aQ==" ++} ++``` ++ ++With this setup, cache sharing is limited to users or requests that explicitly agree on a common salt, enabling cache reuse within a trust group while isolating others. ++ ++> **Note:** Cache isolation is not supported in engine V0. ++ + ## Data Structure + + The prefix caching in vLLM v1 is implemented in the KV cache manager. The basic building block is the “Block” data class (simplified): +diff --git a/examples/offline_inference/multilora_inference.py b/examples/offline_inference/multilora_inference.py +index 4b0d115e6..88c0400f2 100644 +--- a/examples/offline_inference/multilora_inference.py ++++ b/examples/offline_inference/multilora_inference.py +@@ -91,7 +91,9 @@ def initialize_engine() -> LLMEngine: + max_loras=1, + max_lora_rank=8, + max_cpu_loras=2, +- max_num_seqs=256) ++ max_num_seqs=256, ++ enforce_eager=True, ++ block_size=64) + return LLMEngine.from_engine_args(engine_args) + + +diff --git a/examples/offline_inference/vision_language.py b/examples/offline_inference/vision_language.py +index 61d53dda1..9f3d4242c 100644 +--- a/examples/offline_inference/vision_language.py ++++ b/examples/offline_inference/vision_language.py +@@ -854,7 +854,7 @@ def run_qwen2_vl(questions: list[str], modality: str) -> ModelRequestData: + # Qwen2.5-VL + def run_qwen2_5_vl(questions: list[str], modality: str) -> ModelRequestData: + +- model_name = "Qwen/Qwen2.5-VL-3B-Instruct" ++ model_name = "Qwen/Qwen2.5-VL-7B-Instruct" + + engine_args = EngineArgs( + model=model_name, +@@ -865,6 +865,7 @@ def run_qwen2_5_vl(questions: list[str], modality: str) -> ModelRequestData: + "max_pixels": 1280 * 28 * 28, + "fps": 1, + }, ++ enforce_eager=True, + disable_mm_preprocessor_cache=args.disable_mm_preprocessor_cache, + ) + +diff --git a/examples/offline_inference/vision_language_multi_image.py b/examples/offline_inference/vision_language_multi_image.py +index e03ebe485..d3338f4eb 100644 +--- a/examples/offline_inference/vision_language_multi_image.py ++++ b/examples/offline_inference/vision_language_multi_image.py +@@ -552,12 +552,14 @@ def load_qwen2_5_vl(question: str, image_urls: list[str]) -> ModelRequestData: + '`pip install qwen-vl-utils`.') + process_vision_info = None + +- model_name = "Qwen/Qwen2.5-VL-3B-Instruct" ++ model_name = "Qwen/Qwen2.5-VL-7B-Instruct" + + engine_args = EngineArgs( + model=model_name, + 
max_model_len=32768 if process_vision_info is None else 4096, +- max_num_seqs=5, ++ max_num_seqs=2, ++ enforce_eager=True, ++ gpu_memory_utilization=0.8, + limit_mm_per_prompt={"image": len(image_urls)}, + ) + +diff --git a/requirements/common.txt b/requirements/common.txt +index 24a1e6d67..a247d182c 100644 +--- a/requirements/common.txt ++++ b/requirements/common.txt +@@ -22,7 +22,7 @@ lm-format-enforcer >= 0.10.11, < 0.11 + llguidance >= 0.7.9, < 0.8.0; platform_machine == "x86_64" or platform_machine == "arm64" or platform_machine == "aarch64" + outlines == 0.1.11 + lark == 1.2.2 +-xgrammar == 0.1.17; platform_machine == "x86_64" or platform_machine == "aarch64" ++xgrammar == 0.1.18; platform_machine == "x86_64" or platform_machine == "aarch64" + typing_extensions >= 4.10 + filelock >= 3.16.1 # need to contain https://github.com/tox-dev/filelock/pull/317 + partial-json-parser # used for parsing partial JSON outputs +diff --git a/requirements/xpu.txt b/requirements/xpu.txt +index fa09004d0..7bd3261eb 100644 +--- a/requirements/xpu.txt ++++ b/requirements/xpu.txt +@@ -9,15 +9,14 @@ setuptools>=75.8.0 + wheel + jinja2>=3.1.6 + datasets # for benchmark scripts ++numba == 0.60.0 # v0.61 doesn't support Python 3.9. Required for N-gram speculative decoding + +-torch==2.6.0+xpu ++torch == 2.7.0 + torchaudio + torchvision +-pytorch-triton-xpu + --extra-index-url=https://download.pytorch.org/whl/xpu + +-# Please refer xpu doc, we need manually install intel-extension-for-pytorch 2.6.10+xpu due to there are some conflict dependencies with torch 2.6.0+xpu +-# FIXME: This will be fix in ipex 2.7. just leave this here for awareness. +-# intel-extension-for-pytorch==2.6.10+xpu +-oneccl_bind_pt==2.6.0+xpu ++ ++intel-extension-for-pytorch @ https://download.pytorch-extension.intel.com/ipex_dev/xpu/intel_extension_for_pytorch-2.7.0.post0%2Bxpu-cp310-cp310-linux_x86_64.whl ++oneccl_bind_pt==2.7.0+xpu + --extra-index-url=https://pytorch-extension.intel.com/release-whl/stable/xpu/us/ +diff --git a/run_benchmark_with_profile.sh b/run_benchmark_with_profile.sh +new file mode 100644 +index 000000000..fe4dbc268 +--- /dev/null ++++ b/run_benchmark_with_profile.sh +@@ -0,0 +1,3 @@ ++export VLLM_TORCH_PROFILER_DIR=$PWD/profile ++ ++VLLM_USE_V1=1 python3 benchmarks/benchmark_throughput.py --model facebook/opt-125m --dataset_name random --enforce-eager --max-num-seqs 32 --gpu-memory-util 0.8 --num-prompts 16 --max-model-len 2000 --input-len 1024 --output-len 10 --max-num-batched-tokens 32768 --disable-sliding-window --dtype float16 --profile +\ No newline at end of file +diff --git a/tests/entrypoints/openai/test_serving_chat.py b/tests/entrypoints/openai/test_serving_chat.py +index 19d16713b..5e11af8cf 100644 +--- a/tests/entrypoints/openai/test_serving_chat.py ++++ b/tests/entrypoints/openai/test_serving_chat.py +@@ -272,3 +272,43 @@ def test_serving_chat_could_load_correct_generation_config(): + + assert mock_engine.generate.call_args.args[1].temperature == 0.0 + assert mock_engine.generate.call_args.args[1].repetition_penalty == 1.05 ++ ++ ++def test_serving_chat_did_set_correct_cache_salt(): ++ mock_model_config = MockModelConfig() ++ ++ mock_engine = MagicMock(spec=MQLLMEngineClient) ++ mock_engine.get_tokenizer.return_value = get_tokenizer(MODEL_NAME) ++ mock_engine.errored = False ++ ++ # Initialize the serving chat ++ models = OpenAIServingModels(engine_client=mock_engine, ++ base_model_paths=BASE_MODEL_PATHS, ++ model_config=mock_model_config) ++ serving_chat = OpenAIServingChat(mock_engine, ++ 
mock_model_config, ++ models, ++ response_role="assistant", ++ chat_template=CHAT_TEMPLATE, ++ chat_template_content_format="auto", ++ request_logger=None) ++ ++ # Test cache_salt ++ req = ChatCompletionRequest( ++ model=MODEL_NAME, ++ messages=[{ ++ "role": "user", ++ "content": "what is 1+1?" ++ }], ++ ) ++ ++ # By default cache_salt in the engine prompt is not set ++ with suppress(Exception): ++ asyncio.run(serving_chat.create_chat_completion(req)) ++ assert "cache_salt" not in mock_engine.generate.call_args.args[0] ++ ++ # Test with certain cache_salt ++ req.cache_salt = "test_salt" ++ with suppress(Exception): ++ asyncio.run(serving_chat.create_chat_completion(req)) ++ assert mock_engine.generate.call_args.args[0]["cache_salt"] == "test_salt" +diff --git a/tests/quantization/test_ipex_quant.py b/tests/quantization/test_ipex_quant.py +index 0e3913676..be861cd96 100644 +--- a/tests/quantization/test_ipex_quant.py ++++ b/tests/quantization/test_ipex_quant.py +@@ -24,7 +24,8 @@ DTYPE = ["bfloat16"] + @pytest.mark.parametrize("model", MODELS) + @pytest.mark.parametrize("dtype", DTYPE) + def test_ipex_quant(vllm_runner, model, dtype): +- with vllm_runner(model, dtype=dtype) as llm: ++ with vllm_runner(model, dtype=dtype, enforce_eager=True, ++ block_size=32) as llm: + output = llm.generate_greedy(["The capital of France is"], + max_tokens=32) + assert output +diff --git a/tests/tokenization/test_detokenize.py b/tests/tokenization/test_detokenize.py +index b1860e0bb..d1c4017f1 100644 +--- a/tests/tokenization/test_detokenize.py ++++ b/tests/tokenization/test_detokenize.py +@@ -38,26 +38,45 @@ TOKENIZERS = [ + ] + + +-def _run_incremental_decode(tokenizer, all_input_ids, +- skip_special_tokens: bool, starting_index: int): +- decoded_text = "" +- offset = 0 +- token_offset = 0 +- prev_tokens = None +- for i in range(starting_index, len(all_input_ids)): +- new_tokens, text, offset, token_offset = detokenize_incrementally( +- tokenizer, +- all_input_ids[:i + 1], +- prev_tokens, +- offset, +- token_offset, +- skip_special_tokens=skip_special_tokens) +- decoded_text += text +- if prev_tokens is None: +- prev_tokens = new_tokens +- else: +- prev_tokens += new_tokens +- return decoded_text ++def _run_incremental_decode(tokenizer, ++ all_input_ids, ++ skip_special_tokens: bool, ++ starting_index: int, ++ spaces_between_special_tokens: bool = True, ++ fast: Optional[bool] = None): ++ ++ prompt_token_ids = all_input_ids[:starting_index] ++ ++ params = SamplingParams( ++ skip_special_tokens=skip_special_tokens, ++ spaces_between_special_tokens=spaces_between_special_tokens, ++ ) ++ request = EngineCoreRequest("", ++ prompt_token_ids, ++ None, ++ None, ++ None, ++ params, ++ None, ++ 0.0, ++ None, ++ cache_salt=None) ++ ++ if fast is None: ++ detokenizer = IncrementalDetokenizer.from_new_request( ++ tokenizer, request) ++ elif fast: ++ detokenizer = FastIncrementalDetokenizer(tokenizer, request) ++ else: ++ detokenizer = SlowIncrementalDetokenizer(tokenizer, request) ++ ++ output_text = "" ++ for i, token_id in enumerate(all_input_ids[starting_index:]): ++ detokenizer.update([token_id], False) ++ finished = i == len(all_input_ids) - 1 ++ output_text += detokenizer.get_next_output_text(finished, delta=True) ++ ++ return output_text, detokenizer.output_token_ids + + + @pytest.fixture +diff --git a/tests/v1/core/test_kv_cache_utils.py b/tests/v1/core/test_kv_cache_utils.py +index 8362af24a..709ff1e4e 100644 +--- a/tests/v1/core/test_kv_cache_utils.py ++++ b/tests/v1/core/test_kv_cache_utils.py +@@ -26,7 
+26,8 @@ from vllm.v1.request import Request + def make_request(request_id, + prompt_token_ids, + mm_positions=None, +- mm_hashes=None): ++ mm_hashes=None, ++ cache_salt=None): + if mm_positions is None: + multi_modal_inputs = None + else: +@@ -43,6 +44,7 @@ def make_request(request_id, + eos_token_id=100, + arrival_time=0, + lora_request=None, ++ cache_salt=cache_salt, + ) + + +@@ -202,6 +204,45 @@ def test_generate_block_hash_extra_keys_no_mm_inputs(): + assert next_mm_idx == 0 + + ++def test_generate_block_hash_extra_keys_cache_salt(): ++ request = make_request( ++ request_id=0, ++ prompt_token_ids=[_ for _ in range(6)], ++ mm_positions=None, ++ mm_hashes=None, ++ cache_salt="salt", ++ ) ++ ++ # salt is added for the first token ++ extra_keys, _ = generate_block_hash_extra_keys(request, 0, 1, 0) ++ assert extra_keys == ('salt', ) ++ extra_keys, _ = generate_block_hash_extra_keys(request, 0, 10, 0) ++ assert extra_keys == ('salt', ) ++ ++ # no salt added for other tokens ++ extra_keys, _ = generate_block_hash_extra_keys(request, 1, 2, 0) ++ assert extra_keys is None ++ extra_keys, _ = generate_block_hash_extra_keys(request, 6, 10, 0) ++ assert extra_keys is None ++ ++ # works together with other extra keys ++ request_mm = make_request( ++ request_id=0, ++ prompt_token_ids=[_ for _ in range(20)], ++ mm_positions=[ ++ PlaceholderRange(offset=0, length=5), ++ ], ++ mm_hashes=["hash1"], ++ cache_salt="salt", ++ ) ++ ++ # Test with no extra keys ++ extra_keys, next_mm_idx = generate_block_hash_extra_keys( ++ request_mm, 0, 5, 0) ++ assert extra_keys == ("hash1", "salt") ++ assert next_mm_idx == 1 ++ ++ + @pytest.mark.parametrize("hash_fn", [sha256, hash]) + def test_hash_block_tokens(hash_fn): + parent_block_hash = 123 +diff --git a/tests/v1/core/test_prefix_caching.py b/tests/v1/core/test_prefix_caching.py +index 80dd275a9..84d64ac4b 100644 +--- a/tests/v1/core/test_prefix_caching.py ++++ b/tests/v1/core/test_prefix_caching.py +@@ -21,7 +21,8 @@ def make_request(request_id, + prompt_token_ids, + mm_positions=None, + mm_hashes=None, +- prompt_logprobs: Optional[int] = None): ++ prompt_logprobs: Optional[int] = None, ++ cache_salt: Optional[str] = None): + if mm_positions is None: + multi_modal_inputs = None + else: +@@ -39,6 +40,7 @@ def make_request(request_id, + eos_token_id=100, + arrival_time=0, + lora_request=None, ++ cache_salt=cache_salt, + ) + + +@@ -655,6 +657,66 @@ def test_mm_prefix_caching(): + assert num_computed_tokens == 3 * 16 + + ++def test_cache_key_salting(): ++ """ ++ This tests that cache salts are applied during hashing and the cache ++ is separated cache as expected. ++ """ ++ block_size = 16 ++ manager = KVCacheManager( ++ make_kv_cache_config(block_size, 11), ++ max_model_len=8192, ++ enable_caching=True, ++ ) ++ ++ # 3 complete blocks and an incomplete block with 11 tokens. ++ common_token_ids = [i for i in range(3) for _ in range(block_size)] ++ token_ids = common_token_ids + [3] * 11 ++ req0 = make_request("0", token_ids, cache_salt="salt1") ++ computed_blocks, num_computed_tokens = manager.get_computed_blocks(req0) ++ ++ # Completed block should have hashes with extra keys. 
++ assert not computed_blocks ++ assert num_computed_tokens == 0 ++ block_hashes = manager.req_to_block_hashes[req0.request_id] ++ assert len(block_hashes) == 3 ++ assert block_hashes[0].extra_keys == ("salt1", ) ++ assert block_hashes[1].extra_keys is None ++ assert block_hashes[2].extra_keys is None ++ ++ blocks = manager.allocate_slots(req0, 59, computed_blocks) ++ assert [b.block_id for b in blocks] == [1, 2, 3, 4] ++ req0.num_computed_tokens = 59 ++ ++ # Append slots without allocating a new block. ++ for _ in range(5): ++ req0.append_output_token_ids(8) ++ new_blocks = manager.allocate_slots(req0, 5) ++ assert new_blocks is not None and len(new_blocks) == 0 ++ ++ # Now one more block that should not have extra keys. ++ assert len(block_hashes) == 4 ++ assert block_hashes[3].extra_keys is None ++ ++ # Test cache hit with a new request that has the same salt. ++ token_ids = common_token_ids + [4] * 11 ++ req1 = make_request("1", token_ids, cache_salt="salt1") ++ computed_blocks, num_computed_tokens = manager.get_computed_blocks(req1) ++ # Should match only a prefix of 3 blocks. ++ assert len(computed_blocks) == 3 ++ assert num_computed_tokens == 3 * block_size ++ ++ # Test cache miss with same content but different salt. ++ token_ids = common_token_ids + [4] * 11 ++ req2 = make_request("2", token_ids, cache_salt="salt2") ++ computed_blocks, num_computed_tokens = manager.get_computed_blocks(req2) ++ assert len(computed_blocks) == 0 ++ assert num_computed_tokens == 0 ++ block_hashes = manager.req_to_block_hashes[req2.request_id] ++ assert len(block_hashes) == 3 ++ assert block_hashes[0].extra_keys == ("salt2", ) ++ ++ + def test_prefill_not_enough_free_blocks_with_computed_blocks(): + """ + This is a unit test that tests the correctness of the allocate_slots +diff --git a/tests/v1/e2e/test_correctness_sliding_window.py b/tests/v1/e2e/test_correctness_sliding_window.py +index a125d3fb7..5e85a46ae 100644 +--- a/tests/v1/e2e/test_correctness_sliding_window.py ++++ b/tests/v1/e2e/test_correctness_sliding_window.py +@@ -25,7 +25,7 @@ model_config = { + "model", + [ + "bigcode/starcoder2-3b", # sliding window only +- "google/gemma-2-2b-it", # sliding window + full attention ++ #"google/gemma-2-2b-it", # sliding window + full attention + ]) + @pytest.mark.parametrize("batch_size", [5]) + @pytest.mark.parametrize("seed", [1]) +@@ -41,7 +41,7 @@ def test_sliding_window_retrival(monkeypatch, model, batch_size, seed): + + test_config = model_config[model] + +- llm = LLM(model=model) ++ llm = LLM(model=model, enforce_eager=True, block_size=64) + sampling_params = SamplingParams(temperature=0.0, max_tokens=100) + + prompts, answer, indices = prep_prompts(batch_size, +diff --git a/tests/v1/e2e/test_ngram_spec_decode.py b/tests/v1/e2e/test_ngram_spec_decode.py +index 7c7c2f02c..e6a844aba 100644 +--- a/tests/v1/e2e/test_ngram_spec_decode.py ++++ b/tests/v1/e2e/test_ngram_spec_decode.py +@@ -66,7 +66,7 @@ def test_ngram_correctness( + with monkeypatch.context() as m: + m.setenv("VLLM_USE_V1", "1") + +- ref_llm = LLM(model=model_name, max_model_len=1024) ++ ref_llm = LLM(model=model_name, enforce_eager=True, block_size=32, dtype="float16") + ref_outputs = ref_llm.chat(test_prompts, sampling_config) + del ref_llm + +@@ -79,11 +79,17 @@ def test_ngram_correctness( + "num_speculative_tokens": 3, + }, + max_model_len=1024, ++ enforce_eager=True, ++ block_size=64, ++ dtype="float16", ++ gpu_memory_utilization=0.6, + ) + spec_outputs = spec_llm.chat(test_prompts, sampling_config) + matches = 0 + misses = 0 + 
for ref_output, spec_output in zip(ref_outputs, spec_outputs): ++ print(ref_output.outputs[0].text) ++ print(spec_output.outputs[0].text) + if ref_output.outputs[0].text == spec_output.outputs[0].text: + matches += 1 + else: +diff --git a/tests/v1/engine/test_engine_core.py b/tests/v1/engine/test_engine_core.py +index 3f3109c14..28f45b849 100644 +--- a/tests/v1/engine/test_engine_core.py ++++ b/tests/v1/engine/test_engine_core.py +@@ -42,6 +42,7 @@ def make_request() -> EngineCoreRequest: + eos_token_id=None, + arrival_time=time.time(), + lora_request=None, ++ cache_salt=None, + ) + + +diff --git a/tests/v1/engine/test_engine_core_client.py b/tests/v1/engine/test_engine_core_client.py +index 68844b877..32a0856f0 100644 +--- a/tests/v1/engine/test_engine_core_client.py ++++ b/tests/v1/engine/test_engine_core_client.py +@@ -42,6 +42,7 @@ def make_request(params: SamplingParams) -> EngineCoreRequest: + eos_token_id=None, + arrival_time=time.time(), + lora_request=None, ++ cache_salt=None, + ) + + +diff --git a/tests/v1/engine/test_output_processor.py b/tests/v1/engine/test_output_processor.py +index 9ac42dbc3..b927edeab 100644 +--- a/tests/v1/engine/test_output_processor.py ++++ b/tests/v1/engine/test_output_processor.py +@@ -58,6 +58,7 @@ def test_incremental_detokenization(request_output_kind: RequestOutputKind, + mm_placeholders=None, + eos_token_id=None, + lora_request=None, ++ cache_salt=None, + sampling_params=SamplingParams( + skip_special_tokens=False, + spaces_between_special_tokens=False, +@@ -406,6 +407,7 @@ def test_logprobs_processor(request_output_kind: RequestOutputKind, + mm_placeholders=None, + eos_token_id=None, + lora_request=None, ++ cache_salt=None, + sampling_params=SamplingParams( + skip_special_tokens=False, + spaces_between_special_tokens=False, +@@ -507,7 +509,7 @@ def test_stop_token(include_stop_str_in_output: bool, + reason should be "stop" (i.e. first control token causes stop + and is represented in output text) + +- * else, the detokenized string should be ++ * else, the detokenized string should be + ... and the finish reason should be "stop" + (i.e. first control token causes stop but is not represented + in output text.) 
+@@ -570,6 +572,7 @@ def test_stop_token(include_stop_str_in_output: bool, + mm_placeholders=None, + eos_token_id=eos_token_id, + lora_request=None, ++ cache_salt=None, + sampling_params=SamplingParams( + skip_special_tokens=False, + spaces_between_special_tokens=False, +@@ -667,6 +670,7 @@ def test_stop_string(include_stop_str_in_output: bool, + mm_placeholders=None, + eos_token_id=None, + lora_request=None, ++ cache_salt=None, + sampling_params=SamplingParams( + skip_special_tokens=False, + spaces_between_special_tokens=False, +@@ -782,6 +786,7 @@ def test_iteration_stats(dummy_test_vectors): + mm_placeholders=None, + eos_token_id=None, + lora_request=None, ++ cache_salt=None, + sampling_params=SamplingParams(), + ) for idx, (prompt, prompt_tokens) in enumerate( + zip(dummy_test_vectors.prompt_strings, +diff --git a/vllm/_custom_ops.py b/vllm/_custom_ops.py +index 719e02ecd..f770b3bae 100644 +--- a/vllm/_custom_ops.py ++++ b/vllm/_custom_ops.py +@@ -14,7 +14,8 @@ from vllm.scalar_type import ScalarType + + logger = init_logger(__name__) + +-if not current_platform.is_tpu() and not current_platform.is_hpu(): ++if not current_platform.is_tpu() and not current_platform.is_hpu()\ ++ and not current_platform.is_xpu(): + try: + import vllm._C + except ImportError as e: +@@ -981,7 +982,10 @@ def scaled_fp8_quant( + output, input, scale, scale_ub) + else: + scale = torch.zeros(1, device=input.device, dtype=torch.float32) +- torch.ops._C.dynamic_scaled_fp8_quant(output, input, scale) ++ if current_platform.is_xpu(): ++ torch.ops.torch_ipex.dynamic_scaled_fp8_quant(output, input, scale) ++ else: ++ torch.ops._C.dynamic_scaled_fp8_quant(output, input, scale) + else: + # num_token_padding not implemented for this case + assert (scale.numel() == 1 or num_token_padding is None) +diff --git a/vllm/_ipex_ops.py b/vllm/_ipex_ops.py +index c3d210c27..620fd54f3 100644 +--- a/vllm/_ipex_ops.py ++++ b/vllm/_ipex_ops.py +@@ -1,6 +1,6 @@ + # SPDX-License-Identifier: Apache-2.0 + +-from typing import Optional ++from typing import Optional, List + + import torch + +@@ -177,6 +177,7 @@ class ipex_ops: + out: torch.Tensor, + seqlen_q: torch.Tensor, + seqlen_k: torch.Tensor, ++ alibi_slopes: torch.Tensor, + max_seqlen_q: int, + max_seqlen_k: int, + pdropout: float, +@@ -185,6 +186,8 @@ class ipex_ops: + is_causal: bool, + return_softmax: bool, + gen_: torch.Generator, ++ window_size_left: float, ++ window_size_right: float, + logits_soft_cap: float, + ) -> None: + if ipex.__version__.endswith("cpu"): +@@ -200,15 +203,12 @@ class ipex_ops: + is_causal, return_softmax, + gen_) + else: # XPU build +- ipex.llm.functional.varlen_attention(query.contiguous(), +- key.contiguous(), +- value.contiguous(), out, +- seqlen_q.int(), +- seqlen_k.int(), max_seqlen_q, +- max_seqlen_k, pdropout, +- softmax_scale, zero_tensors, +- is_causal, return_softmax, +- gen_, logits_soft_cap) ++ ipex.llm.functional.varlen_attention( ++ query.contiguous(), key.contiguous(), value.contiguous(), out, ++ seqlen_q.int(), seqlen_k.int(), alibi_slopes, max_seqlen_q, ++ max_seqlen_k, pdropout, softmax_scale, zero_tensors, is_causal, ++ return_softmax, gen_, window_size_left, window_size_right, ++ logits_soft_cap) + + @staticmethod + def reshape_and_cache( +@@ -225,6 +225,64 @@ class ipex_ops: + ipex.llm.modules.PagedAttention.reshape_and_cache( + key, value, key_cache, value_cache, slot_mapping) + ++ @staticmethod ++ def reshape_and_cache_flash( ++ key: torch.Tensor, ++ value: torch.Tensor, ++ key_cache: torch.Tensor, ++ value_cache: torch.Tensor, ++ 
slot_mapping: torch.Tensor, ++ kv_cache_dtype: str, ++ k_scale: float, ++ v_scale: float, ++ ) -> None: ++ ipex.llm.modules.PagedAttention.reshape_and_cache_flash( ++ key, value, key_cache, value_cache, slot_mapping, kv_cache_dtype, ++ k_scale, v_scale) ++ ++ @staticmethod ++ def chunked_prefill( ++ query: torch.Tensor, ++ key_cache: torch.Tensor, ++ value_cache: torch.Tensor, ++ output: torch.Tensor, ++ cu_seqlens_q: torch.Tensor, ++ cu_seqlens_k: torch.Tensor, ++ seq_used_k: Optional[torch.Tensor], ++ block_table: torch.Tensor, ++ alibi_slopes: Optional[torch.Tensor], ++ max_seqlen_q: int, ++ max_seqlen_k: int, ++ p_dropout: float, ++ softmax_scale: float, ++ zero_tensors: bool, ++ window_size_left: int, ++ window_size_right: int, ++ is_causal: bool, ++ return_softmax: bool, ++ gen_: Optional[torch.Generator], ++ kv_cache_dtype: str, ++ ): ++ return ipex.llm.modules.PagedAttention.flash_attn_varlen_func( ++ output, ++ query.contiguous(), ++ key_cache, ++ value_cache, ++ cu_seqlens_q, ++ cu_seqlens_k, ++ max_seqlen_q, ++ max_seqlen_k, ++ softmax_scale, ++ is_causal, ++ block_table, ++ alibi_slopes, ++ kv_cache_dtype=kv_cache_dtype, # "fp8" ++ window_size_left=window_size_left, ++ window_size_right=window_size_right, ++ k_scale=1.0, ++ v_scale=1.0, ++ ) ++ + @staticmethod + def copy_blocks(key_caches: list[torch.Tensor], + value_caches: list[torch.Tensor], +@@ -239,3 +297,125 @@ class ipex_ops: + def swap_blocks(src: torch.Tensor, dst: torch.Tensor, + block_mapping: torch.Tensor) -> None: + torch.xpu.swap_blocks(src, dst, block_mapping) # type: ignore ++ ++ @staticmethod ++ def bgmv_shrink(inputs: torch.Tensor, ++ lora_a_weights: torch.Tensor, ++ output_tensor: torch.Tensor, ++ lora_indices_tensor: torch.Tensor, ++ scaling: float = 1.0) -> None: ++ ipex.llm.functional.bgmv_shrink(inputs, lora_a_weights, output_tensor, ++ lora_indices_tensor, scaling) ++ ++ @staticmethod ++ def bgmv_expand(inputs: torch.Tensor, ++ lora_b_weights: torch.Tensor, ++ output_tensor: torch.Tensor, ++ lora_indices_tensor: torch.Tensor, ++ add_inputs: bool = True) -> None: ++ ipex.llm.functional.bgmv_expand(inputs, lora_b_weights, output_tensor, ++ lora_indices_tensor, add_inputs) ++ ++ @staticmethod ++ def bgmv_expand_slice(inputs: torch.Tensor, ++ lora_b_weights: torch.Tensor, ++ output_tensor: torch.Tensor, ++ lora_indices_tensor: torch.Tensor, ++ slice_offset: int, ++ slice_size: int, ++ add_inputs: bool = True) -> None: ++ ipex.llm.functional.bgmv_expand_slice(inputs, lora_b_weights, ++ output_tensor, ++ lora_indices_tensor, ++ slice_offset, slice_size, ++ add_inputs) ++ ++ @staticmethod ++ def sgmv_shrink(inputs: torch.Tensor, ++ lora_a_weights: torch.Tensor, ++ output_tensor: torch.Tensor, ++ b_seq_start_loc: torch.Tensor, ++ seq_len_tensor: torch.Tensor, ++ lora_indices_tensor: torch.Tensor, ++ batches: int, ++ max_seq_length: int, ++ token_nums: int, ++ scaling: float = 1.0) -> None: ++ assert inputs.size(0) == token_nums ++ ipex.llm.functional.sgmv_shrink(inputs, lora_a_weights, output_tensor, ++ b_seq_start_loc, seq_len_tensor, ++ lora_indices_tensor, batches, ++ max_seq_length, scaling) ++ ++ @staticmethod ++ def sgmv_expand(inputs: torch.Tensor, ++ lora_b_weights: torch.Tensor, ++ output_tensor: torch.Tensor, ++ b_seq_start_loc: torch.Tensor, ++ seq_len_tensor: torch.Tensor, ++ lora_indices_tensor: torch.Tensor, ++ batches: int, ++ max_seq_length: int, ++ token_nums: int, ++ add_inputs: bool = False) -> None: ++ assert inputs.size(0) == token_nums ++ ipex.llm.functional.sgmv_expand(inputs, 
lora_b_weights, output_tensor, ++ b_seq_start_loc, seq_len_tensor, ++ lora_indices_tensor, batches, ++ max_seq_length, add_inputs) ++ ++ @staticmethod ++ def sgmv_expand_slice(inputs: torch.Tensor, ++ lora_b_weights: torch.Tensor, ++ output_tensor: torch.Tensor, ++ b_seq_start_loc: torch.Tensor, ++ seq_len_tensor: torch.Tensor, ++ lora_indices_tensor: torch.Tensor, ++ batches: int, ++ max_seq_length: int, ++ token_nums: int, ++ slice_offset: int, ++ slice_size: int, ++ add_inputs: bool = False) -> None: ++ assert inputs.size(0) == token_nums ++ ipex.llm.functional.sgmv_expand_slice(inputs, lora_b_weights, ++ output_tensor, b_seq_start_loc, ++ seq_len_tensor, ++ lora_indices_tensor, batches, ++ max_seq_length, slice_offset, ++ slice_size, add_inputs) ++ ++ # @staticmethod ++ # def lora_expand(inputs: torch.Tensor, ++ # lora_b_weights: List[torch.Tensor], ++ # output_tensor: torch.Tensor, ++ # token_lora_mapping: torch.Tensor, ++ # token_indices_sorted_by_lora_ids: torch.Tensor, ++ # num_tokens_per_lora: torch.Tensor, ++ # lora_token_start_loc: torch.Tensor, ++ # lora_ids: torch.Tensor, ++ # offset_start: int = 0, ++ # add_inputs: bool = False) -> None: ++ # ipex.llm.functional.lora_expand(inputs, lora_b_weights, ++ # output_tensor, token_lora_mapping, ++ # token_indices_sorted_by_lora_ids, ++ # num_tokens_per_lora, num_tokens_per_lora, ++ # lora_token_start_loc, lora_ids, ++ # offset_start, add_inputs) ++ ++ # @staticmethod ++ # def lora_shrink(inputs: torch.Tensor, ++ # lora_a_weights: List[torch.Tensor], ++ # output_tensor: torch.Tensor, ++ # token_lora_mapping: torch.Tensor, ++ # token_indices_sorted_by_lora_ids: torch.Tensor, ++ # num_tokens_per_lora: torch.Tensor, ++ # lora_token_start_loc: torch.Tensor, ++ # lora_ids: torch.Tensor, ++ # scaling: float) -> None: ++ # ipex.llm.functional.lora_shrink(inputs, lora_a_weights, ++ # output_tensor, token_lora_mapping, ++ # token_indices_sorted_by_lora_ids, ++ # num_tokens_per_lora, num_tokens_per_lora, ++ # lora_token_start_loc, lora_ids, ++ # scaling) +diff --git a/vllm/attention/backends/ipex_attn.py b/vllm/attention/backends/ipex_attn.py +index d3c61ea26..09b2540b1 100644 +--- a/vllm/attention/backends/ipex_attn.py ++++ b/vllm/attention/backends/ipex_attn.py +@@ -135,10 +135,9 @@ class IpexAttnBackendImpl(AttentionImpl[IpexAttnMetadata]): + + assert self.num_heads % self.num_kv_heads == 0 + self.num_queries_per_kv = self.num_heads // self.num_kv_heads +- self.need_mask = (self.alibi_slopes is not None +- or self.sliding_window is not None) ++ self.need_mask = (self.sliding_window is not None) + if logits_soft_cap is None: +- logits_soft_cap = 0 ++ logits_soft_cap = -1 + self.logits_soft_cap = logits_soft_cap + + supported_head_sizes = PagedAttention.get_supported_head_sizes() +@@ -212,8 +211,8 @@ class IpexAttnBackendImpl(AttentionImpl[IpexAttnMetadata]): + value_cache, + attn_metadata.slot_mapping.flatten(), + self.kv_cache_dtype, +- layer._k_scale, +- layer._v_scale, ++ layer._k_scale_float, ++ layer._v_scale_float, + ) + + if attn_metadata.is_prompt: +@@ -226,11 +225,7 @@ class IpexAttnBackendImpl(AttentionImpl[IpexAttnMetadata]): + dim=1) + + if attn_metadata.attn_bias is None: +- if self.alibi_slopes is not None: +- att_masks = _make_alibi_bias( +- self.alibi_slopes, query.dtype, +- attn_metadata.seq_lens) # type: ignore +- elif self.sliding_window is not None: ++ if self.sliding_window is not None: + att_masks = _make_sliding_window_bias( + attn_metadata.seq_lens, self.sliding_window, + query.dtype) # type: ignore +@@ -250,6 +245,7 
@@ class IpexAttnBackendImpl(AttentionImpl[IpexAttnMetadata]): + output, + attn_metadata.seqlen_q, + attn_metadata.seqlen_q, ++ self.alibi_slopes, + attn_metadata.max_seqlen, + attn_metadata.max_seqlen, + pdropout=0.0, +@@ -258,6 +254,8 @@ class IpexAttnBackendImpl(AttentionImpl[IpexAttnMetadata]): + is_causal=True, + return_softmax=False, + gen_=None, ++ window_size_left=-1, ++ window_size_right=-1, + logits_soft_cap=self.logits_soft_cap, + ) + else: +@@ -298,8 +296,8 @@ class IpexAttnBackendImpl(AttentionImpl[IpexAttnMetadata]): + max_seq_len, + self.alibi_slopes, + self.kv_cache_dtype, +- layer._k_scale, +- layer._v_scale, ++ layer._k_scale_float, ++ layer._v_scale_float, + ) + else: + # Run PagedAttention V2. +@@ -331,8 +329,8 @@ class IpexAttnBackendImpl(AttentionImpl[IpexAttnMetadata]): + max_seq_len, + self.alibi_slopes, + self.kv_cache_dtype, +- layer._k_scale, +- layer._v_scale, ++ layer._k_scale_float, ++ layer._v_scale_float, + ) + + # Reshape the output tensor. +diff --git a/vllm/config.py b/vllm/config.py +index bd52fc90b..71dcb40ba 100644 +--- a/vllm/config.py ++++ b/vllm/config.py +@@ -636,7 +636,7 @@ class ModelConfig: + supported_quantization = QUANTIZATION_METHODS + optimized_quantization_methods = [ + "fp8", "marlin", "modelopt", "gptq_marlin_24", "gptq_marlin", +- "awq_marlin", "fbgemm_fp8", "compressed_tensors", ++ "awq_marlin", "fbgemm_fp8", "compressed_tensors", "ipex", + "compressed-tensors", "experts_int8", "quark", "nvfp4" + ] + if self.quantization is not None: +@@ -1195,6 +1195,7 @@ class CacheConfig: + # Will be set after profiling. + self.num_gpu_blocks: Optional[int] = None + self.num_cpu_blocks: Optional[int] = None ++ self.threshold_mem: Optional[int] = None + + # Set calculate_kv_scales to False if the value is unset. + if self.calculate_kv_scales is None: +diff --git a/vllm/engine/arg_utils.py b/vllm/engine/arg_utils.py +index 89c9b6747..02b0ef0be 100644 +--- a/vllm/engine/arg_utils.py ++++ b/vllm/engine/arg_utils.py +@@ -1409,6 +1409,8 @@ class EngineArgs: + and not envs.is_set("VLLM_ATTENTION_BACKEND") + ) or envs.VLLM_ATTENTION_BACKEND == "FLASH_ATTN_VLLM_V1" + supported = False ++ if current_platform.is_xpu(): ++ supported = True + if fp8_attention and will_use_fa: + from vllm.vllm_flash_attn.fa_utils import ( + flash_attn_supports_fp8) +@@ -1496,7 +1498,7 @@ class EngineArgs: + # No FlashInfer or XFormers so far. 
+ V1_BACKENDS = [ + "FLASH_ATTN_VLLM_V1", "FLASH_ATTN", "PALLAS", "PALLAS_VLLM_V1", +- "TRITON_ATTN_VLLM_V1", "TRITON_MLA", "FLASHMLA" ++ "TRITON_ATTN_VLLM_V1", "TRITON_MLA", "FLASHMLA", "IPEX_V1" + ] + if (envs.is_set("VLLM_ATTENTION_BACKEND") + and envs.VLLM_ATTENTION_BACKEND not in V1_BACKENDS): +diff --git a/vllm/entrypoints/openai/protocol.py b/vllm/entrypoints/openai/protocol.py +index 7cbd9d7ce..7c26528a9 100644 +--- a/vllm/entrypoints/openai/protocol.py ++++ b/vllm/entrypoints/openai/protocol.py +@@ -13,6 +13,7 @@ from pydantic import (BaseModel, ConfigDict, Field, TypeAdapter, + ValidationInfo, field_validator, model_validator) + from typing_extensions import TypeAlias + ++from vllm import envs + from vllm.entrypoints.chat_utils import ChatCompletionMessageParam + from vllm.logger import init_logger + from vllm.pooling_params import PoolingParams +@@ -384,6 +385,15 @@ class ChatCompletionRequest(OpenAIBaseModel): + "If specified with 'logprobs', tokens are represented " + " as strings of the form 'token_id:{token_id}' so that tokens " + "that are not JSON-encodable can be identified.")) ++ cache_salt: Optional[str] = Field( ++ default=None, ++ description=( ++ "If specified, the prefix cache will be salted with the provided " ++ "string to prevent an attacker to guess prompts in multi-user " ++ "environments. The salt should be random, protected from " ++ "access by 3rd parties, and long enough to be " ++ "unpredictable (e.g., 43 characters base64-encoded, corresponding " ++ "to 256 bit). Not supported by vLLM engine V0.")) + + # doc: end-chat-completion-extra-params + +@@ -697,6 +707,20 @@ class ChatCompletionRequest(OpenAIBaseModel): + "`add_generation_prompt` to True.") + return data + ++ @model_validator(mode="before") ++ @classmethod ++ def check_cache_salt_support(cls, data): ++ if data.get("cache_salt") is not None: ++ if not envs.VLLM_USE_V1: ++ raise ValueError( ++ "Parameter 'cache_salt' is not supported with " ++ "this instance of vLLM, which uses engine V0.") ++ if not isinstance(data["cache_salt"], ++ str) or not data["cache_salt"]: ++ raise ValueError("Parameter 'cache_salt' must be a " ++ "non-empty string if provided.") ++ return data ++ + + class CompletionRequest(OpenAIBaseModel): + # Ordered by official OpenAI API documentation +@@ -1597,13 +1621,50 @@ class TranscriptionRequest(OpenAIBaseModel): + """ + + stream: Optional[bool] = False +- """Custom field not present in the original OpenAI definition. When set, ++ """Custom field not present in the original OpenAI definition. When set, + it will enable output to be streamed in a similar fashion as the Chat +- Completion endpoint. ++ Completion endpoint. + """ + # Flattened stream option to simplify form data. + stream_include_usage: Optional[bool] = False + stream_continuous_usage_stats: Optional[bool] = False ++ # doc: end-transcription-extra-params ++ ++ # doc: begin-transcription-sampling-params ++ temperature: float = Field(default=0.0) ++ """The sampling temperature, between 0 and 1. ++ ++ Higher values like 0.8 will make the output more random, while lower values ++ like 0.2 will make it more focused / deterministic. If set to 0, the model ++ will use [log probability](https://en.wikipedia.org/wiki/Log_probability) ++ to automatically increase the temperature until certain thresholds are hit. ++ """ ++ ++ top_p: Optional[float] = None ++ """Enables nucleus (top-p) sampling, where tokens are selected from the ++ smallest possible set whose cumulative probability exceeds `p`. 
++ """ ++ ++ top_k: Optional[int] = None ++ """Limits sampling to the `k` most probable tokens at each step.""" ++ ++ min_p: Optional[float] = None ++ """Filters out tokens with a probability lower than `min_p`, ensuring a ++ minimum likelihood threshold during sampling. ++ """ ++ ++ seed: Optional[int] = Field(None, ge=_LONG_INFO.min, le=_LONG_INFO.max) ++ """The seed to use for sampling.""" ++ ++ frequency_penalty: Optional[float] = 0.0 ++ """The frequency penalty to use for sampling.""" ++ ++ repetition_penalty: Optional[float] = None ++ """The repetition penalty to use for sampling.""" ++ ++ presence_penalty: Optional[float] = 0.0 ++ """The presence penalty to use for sampling.""" ++ # doc: end-transcription-sampling-params + + # Default sampling parameters for transcription requests. + _DEFAULT_SAMPLING_PARAMS: dict = { +diff --git a/vllm/entrypoints/openai/serving_engine.py b/vllm/entrypoints/openai/serving_engine.py +index bbc8eddd8..d31dbb3b2 100644 +--- a/vllm/entrypoints/openai/serving_engine.py ++++ b/vllm/entrypoints/openai/serving_engine.py +@@ -458,6 +458,9 @@ class OpenAIServing: + if request.mm_processor_kwargs is not None: + engine_prompt["mm_processor_kwargs"] = request.mm_processor_kwargs + ++ if hasattr(request, "cache_salt") and request.cache_salt is not None: ++ engine_prompt["cache_salt"] = request.cache_salt ++ + return conversation, [request_prompt], [engine_prompt] + + def _log_inputs( +diff --git a/vllm/envs.py b/vllm/envs.py +index 6067f5bdd..4f0909105 100644 +--- a/vllm/envs.py ++++ b/vllm/envs.py +@@ -106,6 +106,8 @@ if TYPE_CHECKING: + VLLM_TPU_DISABLE_TOPK_TOPP_OPTIMIZATION: bool = False + VLLM_TPU_BUCKET_PADDING_GAP: int = 0 + VLLM_USE_DEEP_GEMM: bool = False ++ VLLM_XPU_FP8_DTYPE: str = "e5m2" ++ VLLM_XGRAMMAR_CACHE_MB: int = 0 + + + def get_default_cache_root(): +@@ -692,6 +694,15 @@ environment_variables: dict[str, Callable[[], Any]] = { + # Allow use of DeepGemm kernels for fused moe ops. + "VLLM_USE_DEEP_GEMM": + lambda: bool(int(os.getenv("VLLM_USE_DEEP_GEMM", "0"))), ++ ++ # fp8 dtype for XPU platform ++ "VLLM_XPU_FP8_DTYPE": ++ lambda: os.environ.get("VLLM_XPU_FP8_DTYPE", "e5m2"), ++ # Control the cache sized used by the xgrammar compiler. The default ++ # of 512 MB should be enough for roughly 1000 JSON schemas. ++ # It can be changed with this variable if needed for some reason. ++ "VLLM_XGRAMMAR_CACHE_MB": ++ lambda: int(os.getenv("VLLM_XGRAMMAR_CACHE_MB", "512")), + } + + # end-env-vars-definition +diff --git a/vllm/executor/ray_distributed_executor.py b/vllm/executor/ray_distributed_executor.py +index 9b0b98731..2c91b85e2 100644 +--- a/vllm/executor/ray_distributed_executor.py ++++ b/vllm/executor/ray_distributed_executor.py +@@ -72,7 +72,7 @@ class RayDistributedExecutor(DistributedExecutorBase): + + def _init_executor(self) -> None: + self.forward_dag: Optional[ray.dag.CompiledDAG] = None +- if envs.VLLM_USE_V1: ++ if envs.VLLM_USE_V1 and not current_platform.is_xpu(): + # V1 uses SPMD worker and compiled DAG + os.environ["VLLM_USE_RAY_SPMD_WORKER"] = "1" + os.environ["VLLM_USE_RAY_COMPILED_DAG"] = "1" +diff --git a/vllm/inputs/data.py b/vllm/inputs/data.py +index 138a8f611..aefdf52df 100644 +--- a/vllm/inputs/data.py ++++ b/vllm/inputs/data.py +@@ -34,6 +34,11 @@ class TextPrompt(TypedDict): + to pass the mm_processor_kwargs to each of them. + """ + ++ cache_salt: NotRequired[str] ++ """ ++ Optional cache salt to be used for prefix caching. 
++ """ ++ + + class TokensPrompt(TypedDict): + """Schema for a tokenized prompt.""" +@@ -58,6 +63,11 @@ class TokensPrompt(TypedDict): + to pass the mm_processor_kwargs to each of them. + """ + ++ cache_salt: NotRequired[str] ++ """ ++ Optional cache salt to be used for prefix caching. ++ """ ++ + + SingletonPrompt = Union[str, TextPrompt, TokensPrompt] + """ +@@ -176,6 +186,10 @@ class TokenInputs(TypedDict): + have registered mappers etc for the model being considered, we attempt + to pass the mm_processor_kwargs to each of them. + """ ++ cache_salt: NotRequired[str] ++ """ ++ Optional cache salt to be used for prefix caching. ++ """ + + + def token_inputs( +@@ -187,6 +201,7 @@ def token_inputs( + multi_modal_hashes: Optional[list[str]] = None, + multi_modal_placeholders: Optional["MultiModalPlaceholderDict"] = None, + mm_processor_kwargs: Optional[dict[str, Any]] = None, ++ cache_salt: Optional[str] = None, + ) -> TokenInputs: + """Construct :class:`TokenInputs` from optional values.""" + inputs = TokenInputs(type="token", prompt_token_ids=prompt_token_ids) +@@ -205,6 +220,8 @@ def token_inputs( + inputs["multi_modal_placeholders"] = multi_modal_placeholders + if mm_processor_kwargs is not None: + inputs["mm_processor_kwargs"] = mm_processor_kwargs ++ if cache_salt is not None: ++ inputs["cache_salt"] = cache_salt + + return inputs + +@@ -374,7 +391,7 @@ def zip_enc_dec_prompts( + """ + Zip encoder and decoder prompts together into a list of + :class:`ExplicitEncoderDecoderPrompt` instances. +- ++ + ``mm_processor_kwargs`` may also be provided; if a dict is passed, the same + dictionary will be used for every encoder/decoder prompt. If an iterable is + provided, it will be zipped with the encoder/decoder prompts. +diff --git a/vllm/inputs/preprocess.py b/vllm/inputs/preprocess.py +index 669fb96e6..2aa624c54 100644 +--- a/vllm/inputs/preprocess.py ++++ b/vllm/inputs/preprocess.py +@@ -17,7 +17,8 @@ from vllm.transformers_utils.tokenizer_group import BaseTokenizerGroup + + from .data import (DecoderOnlyInputs, EncoderDecoderInputs, ProcessorInputs, + PromptType, SingletonInputs, SingletonPrompt, token_inputs) +-from .parse import is_explicit_encoder_decoder_prompt, parse_singleton_prompt ++from .parse import (ParsedStrPrompt, ParsedTextPrompt, ParsedTokensPrompt, ++ is_explicit_encoder_decoder_prompt, parse_singleton_prompt) + + logger = init_logger(__name__) + +@@ -302,6 +303,29 @@ class InputPreprocessor: + return mm_processor.apply(prompt, mm_data, mm_processor_kwargs, + return_mm_hashes) + ++ def _get_prompt_data(self, parsed_prompt: Union[ParsedStrPrompt, ++ ParsedTextPrompt, ++ ParsedTokensPrompt]): ++ prompt_text = None ++ prompt_token_ids = None ++ token_type_ids = None ++ cache_salt = None ++ ++ if parsed_prompt["type"] == "str": ++ prompt_text = parsed_prompt["content"] ++ else: ++ cache_salt = parsed_prompt["content"].get("cache_salt") ++ if parsed_prompt["type"] == "text": ++ prompt_text = parsed_prompt["content"]["prompt"] ++ elif parsed_prompt["type"] == "tokens": ++ prompt_token_ids = parsed_prompt["content"].get( ++ "prompt_token_ids") ++ token_type_ids = parsed_prompt["content"].get("token_type_ids") ++ else: ++ assert_never(parsed_prompt) ++ ++ return prompt_text, prompt_token_ids, token_type_ids, cache_salt ++ + def _prompt_to_llm_inputs( + self, + prompt: SingletonPrompt, +@@ -322,72 +346,35 @@ class InputPreprocessor: + * :class:`SingletonInputs` instance + """ + parsed = parse_singleton_prompt(prompt) +- +- if parsed["type"] == "str": +- prompt_text = 
parsed["content"] +- prompt_token_ids = self._tokenize_prompt( +- prompt_text, ++ prompt_text, prompt_token_ids, token_type_ids, cache_salt = \ ++ self._get_prompt_data(parsed) ++ ++ # If multimodal data is present, process and return immediately ++ if parsed["type"] != "str" and parsed["content"].get( ++ "multi_modal_data") is not None: ++ inputs = self._process_multimodal( ++ prompt_text if prompt_text is not None else prompt_token_ids, ++ parsed["content"]["multi_modal_data"], ++ parsed["content"].get("mm_processor_kwargs"), + lora_request=lora_request, ++ return_mm_hashes=return_mm_hashes, + ) ++ if cache_salt is not None: ++ inputs["cache_salt"] = cache_salt ++ return inputs + +- return token_inputs( +- prompt=prompt_text, +- prompt_token_ids=prompt_token_ids, +- ) +- +- if parsed["type"] == "tokens": +- tokens_content = parsed["content"] +- +- prompt_token_ids = tokens_content["prompt_token_ids"] +- token_type_ids = tokens_content.get("token_type_ids") +- multi_modal_data = tokens_content.get("multi_modal_data") +- mm_processor_kwargs = tokens_content.get("mm_processor_kwargs") +- +- if multi_modal_data is not None and self._can_process_multimodal(): +- return self._process_multimodal( +- prompt_token_ids, +- multi_modal_data, +- mm_processor_kwargs, +- lora_request=lora_request, +- return_mm_hashes=return_mm_hashes, +- ) +- +- return token_inputs( +- prompt_token_ids=prompt_token_ids, +- token_type_ids=token_type_ids, +- multi_modal_data=multi_modal_data, +- mm_processor_kwargs=mm_processor_kwargs, +- ) +- +- if parsed["type"] == "text": +- text_content = parsed["content"] +- +- prompt_text = text_content["prompt"] +- multi_modal_data = text_content.get("multi_modal_data") +- mm_processor_kwargs = text_content.get("mm_processor_kwargs") +- +- if multi_modal_data is not None and self._can_process_multimodal(): +- return self._process_multimodal( +- prompt_text, +- multi_modal_data, +- mm_processor_kwargs, +- lora_request=lora_request, +- return_mm_hashes=return_mm_hashes, +- ) +- ++ if prompt_token_ids is None: + prompt_token_ids = self._tokenize_prompt( + prompt_text, + lora_request=lora_request, + ) + +- return token_inputs( +- prompt=prompt_text, +- prompt_token_ids=prompt_token_ids, +- multi_modal_data=multi_modal_data, +- mm_processor_kwargs=mm_processor_kwargs, +- ) +- +- assert_never(parsed) ++ return token_inputs( ++ prompt=prompt_text, ++ prompt_token_ids=prompt_token_ids, ++ token_type_ids=token_type_ids, ++ cache_salt=cache_salt, ++ ) + + async def _prompt_to_llm_inputs_async( + self, +@@ -398,69 +385,34 @@ class InputPreprocessor: + """Async version of :meth:`_extract_prompt_components`.""" + parsed = parse_singleton_prompt(prompt) + +- if parsed["type"] == "str": +- prompt_text = parsed["content"] +- prompt_token_ids = await self._tokenize_prompt_async( +- prompt_text, +- lora_request=lora_request, +- ) +- +- return token_inputs( +- prompt=prompt_text, +- prompt_token_ids=prompt_token_ids, +- ) +- +- if parsed["type"] == "tokens": +- tokens_content = parsed["content"] +- +- prompt_token_ids = tokens_content["prompt_token_ids"] +- multi_modal_data = tokens_content.get("multi_modal_data") +- mm_processor_kwargs = tokens_content.get("mm_processor_kwargs") +- +- if multi_modal_data is not None and self._can_process_multimodal(): +- return await self._process_multimodal_async( +- prompt_token_ids, +- multi_modal_data, +- mm_processor_kwargs, +- lora_request=lora_request, +- return_mm_hashes=return_mm_hashes, +- ) ++ prompt_text, prompt_token_ids, token_type_ids, cache_salt = 
\ ++ self._get_prompt_data(parsed) + +- return token_inputs( +- prompt_token_ids=prompt_token_ids, +- multi_modal_data=multi_modal_data, +- mm_processor_kwargs=mm_processor_kwargs, ++ if parsed["type"] != "str" and parsed["content"].get( ++ "multi_modal_data") is not None: ++ inputs = await self._process_multimodal_async( ++ prompt_token_ids if prompt_text is None else prompt_text, ++ parsed["content"]["multi_modal_data"], ++ parsed["content"].get("mm_processor_kwargs"), ++ lora_request=lora_request, ++ return_mm_hashes=return_mm_hashes, + ) ++ if cache_salt is not None: ++ inputs["cache_salt"] = cache_salt ++ return inputs + +- if parsed["type"] == "text": +- text_content = parsed["content"] +- +- prompt_text = text_content["prompt"] +- multi_modal_data = text_content.get("multi_modal_data") +- mm_processor_kwargs = text_content.get("mm_processor_kwargs") +- +- if multi_modal_data is not None and self._can_process_multimodal(): +- return await self._process_multimodal_async( +- prompt_text, +- multi_modal_data, +- mm_processor_kwargs, +- lora_request=lora_request, +- return_mm_hashes=return_mm_hashes, +- ) +- ++ if prompt_token_ids is None: + prompt_token_ids = await self._tokenize_prompt_async( + prompt_text, + lora_request=lora_request, + ) + +- return token_inputs( +- prompt=prompt_text, +- prompt_token_ids=prompt_token_ids, +- multi_modal_data=multi_modal_data, +- mm_processor_kwargs=mm_processor_kwargs, +- ) +- +- assert_never(parsed) ++ return token_inputs( ++ prompt=prompt_text, ++ prompt_token_ids=prompt_token_ids, ++ token_type_ids=token_type_ids, ++ cache_salt=cache_salt, ++ ) + + def _build_enc_dec_llm_inputs( + self, +@@ -540,6 +492,11 @@ class InputPreprocessor: + mm_hashes=inputs["mm_hashes"], + mm_placeholders=inputs["mm_placeholders"], + ) ++ ++ cache_salt = inputs.get("cache_salt") ++ if cache_salt is not None: ++ decoder_inputs["cache_salt"] = cache_salt ++ + elif inputs["type"] == "token": + # Text-only inputs + encoder_inputs = token_inputs(prompt="", prompt_token_ids=[]) +diff --git a/vllm/lora/punica.py b/vllm/lora/punica.py +new file mode 100644 +index 000000000..5f711bfe5 +--- /dev/null ++++ b/vllm/lora/punica.py +@@ -0,0 +1,616 @@ ++""" ++Based on: ++Chen, L., Ye, Z., Wu, Y., Zhuo, D., Ceze, L., & Krishnamurthy, A. (2023). ++Punica: Multi-Tenant LoRA Serving. 
++https://arxiv.org/abs/2310.18547 ++""" ++ ++from typing import TYPE_CHECKING, Callable, List, Optional, Tuple, Union ++ ++import torch ++ ++from vllm.triton_utils import HAS_TRITON ++from vllm.utils import is_xpu ++ ++if HAS_TRITON and not is_xpu(): ++ from vllm.lora.ops.bgmv_expand import bgmv_expand ++ from vllm.lora.ops.bgmv_expand_slice import bgmv_expand_slice ++ from vllm.lora.ops.bgmv_shrink import bgmv_shrink ++ from vllm.lora.ops.sgmv_expand import sgmv_expand ++ from vllm.lora.ops.sgmv_expand_slice import sgmv_expand_slice ++ from vllm.lora.ops.sgmv_shrink import sgmv_shrink ++elif is_xpu(): ++ from vllm._ipex_ops import ipex_ops ++ bgmv_expand = ipex_ops.bgmv_expand ++ bgmv_expand_slice = ipex_ops.bgmv_expand_slice ++ bgmv_shrink = ipex_ops.bgmv_shrink ++ sgmv_expand = ipex_ops.sgmv_expand ++ sgmv_expand_slice = ipex_ops.sgmv_expand_slice ++ sgmv_shrink = ipex_ops.sgmv_shrink ++ ++if TYPE_CHECKING: ++ # avoid circuit import ++ from vllm.lora.layers import LoRAMapping ++ from vllm.lora.models import LongContextLoRAContext ++ ++ ++def compute_meta( ++ token_lora_tensor: torch.Tensor ++) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor, int, int, bool]: ++ """ ++ Get the information required for the sgmv kernel. With the features: ++ 1. If consecutive requests in the batch use the same LoRA, this function ++ will combine them into a single request, improving sgmv kernel inference ++ performance. ++ 2. At the beginning of each prefill stage inference, recalculations are ++ needed based on the input, but only once. ++ """ ++ ++ lora_indices_tensor, seq_length_tensor = torch.unique_consecutive( ++ token_lora_tensor, return_counts=True) ++ cum_result = torch.cumsum(seq_length_tensor, dim=0) ++ b_seq_start_tensor = torch.zeros_like(seq_length_tensor) ++ b_seq_start_tensor[1:].copy_(cum_result[:-1]) ++ max_length = seq_length_tensor.max().item() ++ ++ batch_size = lora_indices_tensor.size(0) ++ no_lora = False ++ # -1 means no lora should be applied. Use `no_lora` to determine whether ++ # the current step requires LoRA. If LoRA is not needed, the prefill stage ++ # does not need to launch the triton kernel, which can improve performance ++ if batch_size == 1 and lora_indices_tensor == -1: ++ no_lora = True ++ return (b_seq_start_tensor, seq_length_tensor, lora_indices_tensor, ++ batch_size, max_length, no_lora) ++ ++ ++# TODO see if this can be vectorized ++def convert_mapping( ++ mapping: "LoRAMapping", ++ lora_index_to_id: List[Optional[int]], ++ max_loras: int, ++ vocab_size: int, ++ extra_vocab_size: int, ++ long_lora_context: Optional["LongContextLoRAContext"] = None, ++ device: torch.device = "cuda", ++) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor, ++ Optional[torch.Tensor], List[int]]: ++ """Converts LoRAMapping to index tensors. ++ ++ Args: ++ mapping: LoRAMapping mapping rows in a batch to LoRA ids. ++ lora_index_to_id: List mapping LoRA ids to LoRA indices. ++ max_loras: Maximum number of LoRAs. ++ vocab_size: Model vocab size. ++ extra_vocab_size: Extra vocab size each LoRA can have. ++ long_lora_context: Passed if there are long context lora in a batch. ++ ++ Returns: ++ A tuple of tensors: ++ base_indices: Tensor of shape [batch_size] mapping batch rows to ++ LoRA indices. ++ sampler_indices: Tensor of shape [batch_size] mapping requests to ++ LoRA indices for sampler. For generation, this will be the ++ same as base_indicies. For prefill, this will map requests ++ to LoRA indices. 
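To make the grouping that compute_meta performs concrete, here is a standalone illustration of the same torch.unique_consecutive bookkeeping (not part of the patch):

    import torch

    token_lora = torch.tensor([0, 0, 0, 1, 1, -1, -1, -1])  # per-token LoRA ids
    lora_ids, seq_lens = torch.unique_consecutive(token_lora, return_counts=True)
    starts = torch.zeros_like(seq_lens)
    starts[1:] = torch.cumsum(seq_lens, dim=0)[:-1]
    # lora_ids == tensor([ 0,  1, -1]), seq_lens == tensor([3, 2, 3]), starts == tensor([0, 3, 5]):
    # three consecutive runs are handed to the sgmv kernel instead of eight per-token launches.
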
++ sampler_indices_padded: Tensor of shape [batch_size] mapping ++ requests to LoRA indices for sampler with padding. ++ Same as sampler_indicies, but -1 is replaced with ++ max_loras. ++ embeddings_indices: Tensor of shape [2, batch_size] mapping ++ requests to embedding indices. First row is for embeddings ++ added by the LoRAs, second row is for the LoRA.lora_a ++ embeddings. ++ long_lora_indices: Tensor of shape [batch_size] mapping ++ requests to RoPE offsets and rot dims for long LoRAs. ++ None if long context lora doesn't exist. ++ indices_len: List of lengths of the above tensors. It contains ++ (base_indices, sampler_indices, sampler_indices_padded, ++ embeddings_indices, long_lora_indices). ++ """ ++ index_mapping_indices: List[int] = list(mapping.index_mapping).copy() ++ embedding_indices = index_mapping_indices.copy() ++ lora_indices = index_mapping_indices.copy() ++ long_lora_offsets: Optional[torch.Tensor] = None ++ if long_lora_context: ++ long_lora_offsets = torch.zeros(len(index_mapping_indices), ++ device=device, ++ dtype=torch.long) ++ prompt_mapping: List[int] = [ ++ lora_index_to_id.index(x) if x > 0 else -1 ++ for x in mapping.prompt_mapping ++ ] ++ lora_idx = None ++ for i in range(len(index_mapping_indices)): ++ # TODO index can be slow. optimize ++ lora_idx = (lora_index_to_id.index(index_mapping_indices[i]) ++ if index_mapping_indices[i] > 0 else -1) ++ embedding_indices[i] = lora_idx if index_mapping_indices[i] > 0 else 0 ++ lora_indices[i] = lora_idx ++ if long_lora_context: ++ assert long_lora_offsets is not None ++ lora_offset: int = long_lora_context.offsets_by_lora_id.get( ++ index_mapping_indices[i], 0) ++ long_lora_offsets[i] = lora_offset ++ ++ indices_list: List[Union[List[int], torch.Tensor]] = [ ++ index_mapping_indices, ++ lora_indices, ++ embedding_indices, ++ ] ++ if long_lora_context: ++ assert long_lora_offsets is not None ++ indices_list.append(long_lora_offsets) ++ indices = torch.tensor(indices_list, dtype=torch.long, device=device) ++ prompt_mapping_tensor = torch.tensor(prompt_mapping, ++ device=device, ++ dtype=torch.long) ++ embeddings_indices = torch.stack([ ++ indices[2] * extra_vocab_size, ++ indices[2] * (vocab_size + extra_vocab_size), ++ ]) ++ embeddings_indices[embeddings_indices == -1] = max_loras - 1 ++ base_indices = indices[1] ++ sampler_indices = prompt_mapping_tensor ++ sampler_indices_padded = sampler_indices.clone() ++ sampler_indices_padded[sampler_indices_padded == -1] = max_loras - 1 ++ sampler_indices_padded = torch.arange( ++ 0, len(sampler_indices_padded), device=device, dtype=torch.long) + ( ++ sampler_indices_padded * len(sampler_indices_padded)) ++ long_lora_indices = None ++ long_lora_indices_len: Optional[int] = None ++ if long_lora_context: ++ long_lora_indices = indices[3] ++ long_lora_indices_len = long_lora_indices.shape[-1] ++ # Contain length of indices tensors. Used to index into each tensor. ++ indices_len = [ ++ base_indices.shape[-1], ++ sampler_indices.shape[-1], ++ sampler_indices_padded.shape[-1], ++ embeddings_indices.shape[-1], ++ ] ++ if long_lora_indices_len is not None: ++ indices_len.append(long_lora_indices_len) ++ else: ++ # If long_lora doesn't exist,append None ++ indices_len.append(None) ++ ++ return ( ++ base_indices, ++ sampler_indices, ++ sampler_indices_padded, ++ embeddings_indices, ++ long_lora_indices, ++ indices_len, ++ ) ++ ++ ++class PunicaWrapper: ++ """ ++ PunicaWrapper is designed to manage and provide metadata for the punica ++ kernel. 
The main function is to maintain the state information for ++ Multi-LoRA, and to provide the interface for the punica kernel. ++ """ ++ ++ def __init__(self, max_num_batched_tokens: int, max_batches: int, ++ device: torch.device): ++ self.device = device ++ self._token_lora_indices = torch.empty(max_num_batched_tokens, ++ dtype=torch.long, ++ device=device) ++ self._sampler_indices = torch.empty(max_num_batched_tokens, ++ dtype=torch.long, ++ device=device) ++ self._sampler_indices_padded = torch.empty(max_num_batched_tokens, ++ dtype=torch.long, ++ device=device) ++ self._embeddings_indices = torch.empty(2, ++ max_num_batched_tokens, ++ dtype=torch.long, ++ device=device) ++ self._long_lora_indices = torch.empty(max_num_batched_tokens, ++ dtype=torch.long, ++ device=device) ++ ++ # 5 is the number of indicies tensors. ++ # base_indices, sampler_indices, sampler_indices_padded, ++ # embeddings_indices,long_lora_indices ++ self.indices_len: List[Optional[int]] = [None] * 5 ++ # these attributes are the information required for sgmv kernel ++ self._seq_start_locs = torch.empty(max_batches, ++ dtype=torch.long, ++ device=device) ++ self._seq_lengths = torch.empty(max_batches, ++ dtype=torch.long, ++ device=device) ++ self._lora_indices_per_batch = torch.empty(max_batches, ++ dtype=torch.long, ++ device=device) ++ self.max_length: int = 0 ++ self.batch_size: int = -1 ++ self.is_prefill = False ++ self.no_lora = False ++ ++ def update_metadata( ++ self, ++ mapping: "LoRAMapping", ++ lora_index_to_id: List[Optional[int]], ++ max_loras: int, ++ vocab_size: int, ++ extra_vocab_size: int, ++ long_lora_context: Optional["LongContextLoRAContext"] = None, ++ ): ++ ++ self._update_base_metadata(mapping, lora_index_to_id, max_loras, ++ vocab_size, extra_vocab_size, ++ long_lora_context) ++ if mapping.is_prefill: ++ # Update metadata required for prefill-related operators. ++ self._update_prefill_metada(self.token_lora_indices) ++ self.is_prefill = True ++ else: ++ self.is_prefill = False ++ ++ def _update_base_metadata( ++ self, ++ mapping: "LoRAMapping", ++ lora_index_to_id: List[Optional[int]], ++ max_loras: int, ++ vocab_size: int, ++ extra_vocab_size: int, ++ long_lora_context: Optional["LongContextLoRAContext"] = None, ++ ): ++ ( ++ base_indices, ++ sampler_indices, ++ sampler_indices_padded, ++ embeddings_indices, ++ long_lora_offsets_tensor, ++ indices_len, ++ ) = convert_mapping( ++ mapping, ++ lora_index_to_id, ++ max_loras, ++ vocab_size, ++ extra_vocab_size, ++ long_lora_context, ++ self.device, ++ ) ++ self._token_lora_indices[:base_indices.shape[0]].copy_(base_indices) ++ self._sampler_indices[:sampler_indices.shape[0]].copy_(sampler_indices) ++ self._sampler_indices_padded[:sampler_indices_padded.shape[0]].copy_( ++ sampler_indices_padded) ++ self._embeddings_indices[:embeddings_indices. 
++ shape[0], :embeddings_indices.shape[1]].copy_( ++ embeddings_indices) ++ if long_lora_offsets_tensor is not None: ++ self._long_lora_indices[:long_lora_offsets_tensor.shape[0]].copy_( ++ long_lora_offsets_tensor) ++ else: ++ self._long_lora_indices.zero_() ++ ++ self.indices_len[:] = indices_len ++ ++ def _update_prefill_metada(self, token_lora_tensor: torch.Tensor) -> None: ++ ++ (b_seq_start_tensor, seq_length_tensor, lora_indices_tensor, ++ batch_size, max_length, no_lora) = compute_meta(token_lora_tensor) ++ ++ self._seq_start_locs[:b_seq_start_tensor.shape[0]].copy_( ++ b_seq_start_tensor) ++ self._seq_lengths[:seq_length_tensor.shape[0]].copy_(seq_length_tensor) ++ self._lora_indices_per_batch[:lora_indices_tensor.shape[0]].copy_( ++ lora_indices_tensor) ++ self.batch_size = batch_size ++ self.max_length = max_length ++ self.no_lora = no_lora ++ ++ @property ++ def prefill_metadata( ++ self) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor, int, int]: ++ """ ++ This property provides a convenient way to access the necessary ++ metadata for prefill-related kernel computations. ++ 1. seq_start_locs: Tensor of sequence start positions ++ 2. seq_lengths: Tensor of sequence lengths ++ 3. lora_indices_per_batch: Tensor of lora indices, and an index of ++ -1 means no lora should be applied. ++ 4. batch_size: batch size after clustering identical lora indices ++ 5. max_length: The maximum sequence length in the batch ++ """ ++ return (self._seq_start_locs[:self.batch_size], ++ self._seq_lengths[:self.batch_size], ++ self._lora_indices_per_batch[:self.batch_size], ++ self.batch_size, self.max_length) ++ ++ @property ++ def token_lora_indices(self) -> torch.Tensor: ++ """ ++ This property provides the lora indices corresponding to each token ++ in the batch. An index of -1 means no lora should be applied. 
++ """ ++ token_lora_len = self.indices_len[0] ++ return self._token_lora_indices[:token_lora_len] ++ ++ @property ++ def sampler_indices(self) -> torch.Tensor: ++ """ ++ This property is used to access the lora indices specifically for ++ LogitsProcessorWithLoRA ++ """ ++ sampler_indices_len = self.indices_len[1] ++ return self._sampler_indices[:sampler_indices_len] ++ ++ @property ++ def sampler_indices_padded(self) -> torch.Tensor: ++ """ ++ This property provides access to padded sampler indices ++ """ ++ indices_padded_len = self.indices_len[2] ++ return self._sampler_indices_padded[:indices_padded_len] ++ ++ @property ++ def embeddings_indices(self) -> torch.Tensor: ++ """ ++ This property provides access to the indices used for lora embeddings, ++ specifically for VocabParallelEmbeddingWithLoRA ++ """ ++ embeddings_indices_len = self.indices_len[3] ++ return self._embeddings_indices[:, :embeddings_indices_len] ++ ++ @property ++ def long_lora_indices(self) -> torch.Tensor: ++ """ ++ This property provides access to the indices used for long context ++ lora, specifically for LinearScalingRotaryEmbeddingWithLora ++ """ ++ long_lora_len = self.indices_len[4] ++ return self._long_lora_indices[:long_lora_len] ++ ++ def shrink_prefill( ++ self, ++ y: torch.Tensor, ++ x: torch.Tensor, ++ w_t_all: torch.Tensor, ++ scale: float, ++ ): ++ #No LoRA request, so return directly ++ if self.no_lora: ++ return ++ sgmv_shrink( ++ x, ++ w_t_all, ++ y, ++ *self.prefill_metadata, ++ scale, ++ ) ++ ++ def shrink_decode( ++ self, ++ y: torch.Tensor, ++ x: torch.Tensor, ++ w_t_all: torch.Tensor, ++ scale: float, ++ ): ++ bgmv_shrink(x, w_t_all, y, self.token_lora_indices, scale) ++ ++ def expand_prefill( ++ self, ++ y: torch.Tensor, ++ x: torch.Tensor, ++ w_t_all: torch.Tensor, ++ add_input: bool, ++ ): ++ #No LoRA request, so return directly ++ if self.no_lora: ++ return ++ sgmv_expand( ++ x, ++ w_t_all, ++ y, ++ *self.prefill_metadata, ++ add_input, ++ ) ++ ++ def expand_decode( ++ self, ++ y: torch.Tensor, ++ x: torch.Tensor, ++ w_t_all: torch.Tensor, ++ add_input: bool, ++ ): ++ bgmv_expand(x, w_t_all, y, self.token_lora_indices, add_input) ++ ++ def expand_slice_prefill( ++ self, ++ y: torch.Tensor, ++ x: torch.Tensor, ++ w_t_all: torch.Tensor, ++ y_offset: Optional[int], ++ y_slice_size: Optional[int], ++ add_input: bool, ++ ): ++ #No LoRA request, so return directly ++ if self.no_lora: ++ return ++ sgmv_expand_slice( ++ x, ++ w_t_all, ++ y, ++ *self.prefill_metadata, ++ y_offset, ++ y_slice_size, ++ add_input, ++ ) ++ ++ def expand_slice_decode( ++ self, ++ y: torch.Tensor, ++ x: torch.Tensor, ++ w_t_all: torch.Tensor, ++ y_offset: Optional[int], ++ y_slice_size: Optional[int], ++ add_input: bool, ++ ): ++ bgmv_expand_slice(x, w_t_all, y, self.token_lora_indices, y_offset, ++ y_slice_size, add_input) ++ ++ def add_shrink( ++ self, ++ y: torch.Tensor, ++ x: torch.Tensor, ++ w_t_all: torch.Tensor, ++ scale: float, ++ ): ++ """ ++ Perform the ` y+=x@w_t_all` computation, which is suitable for the ++ GEMM of lora'a. ++ When `is_prefill is` true, it indicates that it is currently the ++ prefill stage, and the `shrink_prefill` function should be called. ++ Otherwise, it is the decode stage, and the shrink_decode function ++ should be called. 
++ """ ++ shrink_fun: Callable = (self.shrink_prefill ++ if self.is_prefill else self.shrink_decode) ++ shrink_fun(y, x, w_t_all, scale) ++ ++ def add_expand( ++ self, ++ y: torch.Tensor, ++ x: torch.Tensor, ++ w_t_all: torch.Tensor, ++ add_input: bool = True, ++ ): ++ """ ++ Perform the ` y+=x@w_t_all` computation, which is suitable for the ++ GEMM of lora'b. ++ When `is_prefill` is true, it indicates that it is currently the ++ prefill stage, and the `expand_prefill` function should be called. ++ Otherwise, it is the decode stage, and the expand_decode function ++ should be called. ++ """ ++ ++ expand_fun: Callable = (self.expand_prefill ++ if self.is_prefill else self.expand_decode) ++ expand_fun(y, x, w_t_all, add_input) ++ ++ def add_expand_slice(self, ++ y: torch.Tensor, ++ x: torch.Tensor, ++ w_t_all: torch.Tensor, ++ y_offset: Optional[int], ++ y_slice_size: Optional[int], ++ add_input: bool = True): ++ """ ++ Similar to `add_expand` ++ """ ++ ++ expand_slice_fun: Callable = (self.expand_slice_prefill ++ if self.is_prefill else ++ self.expand_slice_decode) ++ expand_slice_fun(y, x, w_t_all, y_offset, y_slice_size, add_input) ++ ++ def add_lora(self, ++ y: torch.Tensor, ++ x: torch.Tensor, ++ wa_t_all: torch.Tensor, ++ wb_t_all: torch.Tensor, ++ scale: float, ++ y_offset: Optional[int] = None, ++ y_slice_size: Optional[int] = None, ++ *, ++ buffer: Optional[torch.Tensor] = None) -> None: ++ """ ++ Semantics: ++ y[i] += ( ++ x[i].unsqueeze(0) ++ @ wa_t_all[indices[i], layer_idx, :, :].transpose(-1, -2) ++ @ wb_t_all[indices[i], layer_idx, :, :].transpose(-1, -2) ++ * scale ++ ).squeeze(0) ++ Args: ++ y (torch.Tensor): Output tensor. Will be changed in-place. ++ x (torch.Tensor): Input tensor ++ wa_t_all (torch.Tensor): lora_a's weight ++ wb_t_all (torch.Tensor): lora_b's weight ++ scale (float): Scaling factor. ++ y_offset (Optional[int], optional): Offset to apply to the starting ++ column of y. ++ y_slice_size (Optional[int], optional): Size of the y column slice.. ++ buffer (Optional[torch.Tensor], optional): Defaults to None. ++ """ ++ y_org = y ++ y = y.view(-1, y.shape[-1]) ++ x = x.view(-1, x.shape[-1]) ++ r = wb_t_all.size(-1) ++ if buffer is None: ++ # We set the buffer to be float32 by default ,refer to: ++ # https://github.com/triton-lang/triton/issues/1387 ++ buffer = torch.zeros((x.size(0), r), ++ dtype=torch.float32, ++ device=x.device) ++ ++ self.add_shrink(buffer, x, wa_t_all, scale) ++ if y_offset is None and y_slice_size is None: ++ self.add_expand(y, buffer, wb_t_all, add_input=True) ++ else: ++ self.add_expand_slice(y, ++ buffer, ++ wb_t_all, ++ y_offset, ++ y_slice_size, ++ add_input=True) ++ y = y.view_as(y_org) ++ ++ def add_lora_packed_nslice(self, y: torch.Tensor, x: torch.Tensor, ++ lora_a_stacked: Tuple[torch.Tensor, ++ torch.Tensor, ++ torch.Tensor], ++ lora_b_stacked: Tuple[torch.Tensor, ++ torch.Tensor, ++ torch.Tensor], ++ scale: float, ++ output_slices: Tuple[int, ...]) -> None: ++ """ ++ Applies lora to each input. Similar to add_lora, This method is ++ used for layers that are composed of multiple sublayers ++ (slices) packed together. 
++ """ ++ y_org = y ++ x = x.view(-1, x.shape[-1]) ++ y = y.view(-1, y.shape[-1]) ++ offset_left = 0 ++ # TODO fuse these kernels ++ for slice_idx in range(len(output_slices)): ++ self.add_lora(y, x, lora_a_stacked[slice_idx], ++ lora_b_stacked[slice_idx], scale, offset_left, ++ output_slices[slice_idx]) ++ offset_left += output_slices[slice_idx] ++ ++ y = y.view_as(y_org) ++ ++ def add_lora_logits(self, ++ y: torch.Tensor, ++ x: torch.Tensor, ++ wa_t_all: torch.Tensor, ++ wb_t_all: torch.Tensor, ++ scale, ++ *, ++ buffer: Optional[torch.Tensor] = None) -> None: ++ """ ++ LogitsProcessorWithLoRA always using bgmv ++ """ ++ y_org = y ++ y = y.view(-1, y.shape[-1]) ++ x = x.view(-1, x.shape[-1]) ++ r = wb_t_all.size(-1) ++ if buffer is None: ++ # We set the buffer to be float32 by default ,refer to: ++ # https://github.com/triton-lang/triton/issues/1387 ++ buffer = torch.zeros((x.size(0), r), ++ dtype=torch.float32, ++ device=x.device) ++ ++ bgmv_shrink(x, wa_t_all, buffer, self.sampler_indices, scale) ++ bgmv_expand(buffer, wb_t_all, y, self.sampler_indices, add_inputs=True) ++ y = y.view_as(y_org) +diff --git a/vllm/lora/punica_wrapper/punica_gpu.py b/vllm/lora/punica_wrapper/punica_gpu.py +index bb6d2808e..8fdb391f7 100644 +--- a/vllm/lora/punica_wrapper/punica_gpu.py ++++ b/vllm/lora/punica_wrapper/punica_gpu.py +@@ -12,11 +12,28 @@ import torch + + import vllm.envs as envs + from vllm.lora.layers import LoRAMapping ++from vllm.platforms import current_platform + from vllm.triton_utils import HAS_TRITON + +-if HAS_TRITON: ++if HAS_TRITON and not current_platform.is_xpu(): + from vllm.lora.ops.triton_ops import (LoRAKernelMeta, lora_expand, + lora_shrink) ++elif current_platform.is_xpu(): ++ from vllm._ipex_ops import ipex_ops ++ try: ++ lora_expand = ipex_ops.lora_expand ++ lora_shrink = ipex_ops.lora_shrink ++ XPU_KERNEL_V = 1 ++ except AttributeError: ++ from vllm._ipex_ops import ipex_ops ++ bgmv_expand = ipex_ops.bgmv_expand ++ bgmv_expand_slice = ipex_ops.bgmv_expand_slice ++ bgmv_shrink = ipex_ops.bgmv_shrink ++ sgmv_expand = ipex_ops.sgmv_expand ++ sgmv_expand_slice = ipex_ops.sgmv_expand_slice ++ sgmv_shrink = ipex_ops.sgmv_shrink ++ XPU_KERNEL_V = 0 ++ + + from .punica_base import PunicaWrapperBase + +@@ -40,9 +57,10 @@ class PunicaWrapperGPU(PunicaWrapperBase): + + self.max_loras = kwargs['max_loras'] + +- self.token_mapping_meta = LoRAKernelMeta.make(self.max_loras, +- max_num_batched_tokens, +- device=device) ++ if not (current_platform.is_xpu() and XPU_KERNEL_V == 0): ++ self.token_mapping_meta = LoRAKernelMeta.make(self.max_loras, ++ max_num_batched_tokens, ++ device=device) + + # When cudagraph capture size is greater than max_num_seqs (max_batches, + # here), V0 captures the graph as if max_num_seqs is set to +@@ -50,9 +68,10 @@ class PunicaWrapperGPU(PunicaWrapperBase): + # V1 doesn't have this problem and always respects max_num_seqs. 
+ max_num_prompts = (max_batches + if envs.VLLM_USE_V1 else max_num_batched_tokens) +- self.prompt_mapping_meta = LoRAKernelMeta.make(self.max_loras, +- max_num_prompts, +- device=device) ++ if not (current_platform.is_xpu() and XPU_KERNEL_V == 0): ++ self.prompt_mapping_meta = LoRAKernelMeta.make(self.max_loras, ++ max_num_prompts, ++ device=device) + + def update_metadata( + self, +@@ -65,13 +84,79 @@ class PunicaWrapperGPU(PunicaWrapperBase): + **kwargs): + + self.is_prefill = mapping.is_prefill +- self._update_base_metadata(mapping, lora_index_to_id, max_loras, +- vocab_size, extra_vocab_size, +- long_lora_context) ++ if current_platform.is_xpu() and XPU_KERNEL_V == 0: ++ PunicaWrapperBase.update_metadata(self, mapping, lora_index_to_id, ++ max_loras, vocab_size, ++ extra_vocab_size, ++ long_lora_context, **kwargs) ++ else: ++ self._update_base_metadata(mapping, lora_index_to_id, max_loras, ++ vocab_size, extra_vocab_size, ++ long_lora_context) ++ # Prepare cuda kernel metadata tensors ++ self.token_mapping_meta.prepare_tensors(self.token_lora_indices) ++ self.prompt_mapping_meta.prepare_tensors(self.sampler_indices) ++ ++ def _apply_shrink_prefill( ++ self, ++ y: torch.Tensor, ++ x: torch.Tensor, ++ w_t_all: Tuple[torch.Tensor, ...], ++ scale: float, ++ ): ++ #No LoRA request, so return directly ++ if self.no_lora: ++ return ++ sgmv_shrink( ++ x, ++ w_t_all, ++ y, ++ *self.prefill_metadata, ++ scale, ++ ) ++ ++ def _apply_shrink_decode( ++ self, ++ y: torch.Tensor, ++ x: torch.Tensor, ++ w_t_all: torch.Tensor, ++ scale: float, ++ ): ++ bgmv_shrink(x, w_t_all, y, self.token_lora_indices, scale) ++ ++ def _apply_expand_prefill( ++ self, ++ y: torch.Tensor, ++ x: torch.Tensor, ++ w_t_all: Tuple[torch.Tensor, ...], ++ offset_start: int, ++ add_inputs: bool, ++ ): ++ #No LoRA request, so return directly ++ if self.no_lora: ++ return ++ ++ sgmv_expand( ++ x, ++ w_t_all, ++ y, ++ *self.prefill_metadata, ++ offset_start=offset_start, ++ add_inputs=add_inputs, ++ ) ++ ++ def _apply_expand_decode( ++ self, ++ y: torch.Tensor, ++ x: torch.Tensor, ++ w_t_all: torch.Tensor, ++ y_offset: Optional[int], ++ y_slice_size: Optional[int], ++ add_inputs: bool, ++ ): ++ bgmv_expand_slice(x, w_t_all, y, self.token_lora_indices, y_offset, ++ y_slice_size, add_inputs) + +- # Prepare cuda kernel metadata tensors +- self.token_mapping_meta.prepare_tensors(self.token_lora_indices) +- self.prompt_mapping_meta.prepare_tensors(self.sampler_indices) + + def add_shrink(self, y: torch.Tensor, x: torch.Tensor, + lora_a_stacked: Tuple[torch.Tensor, +@@ -91,13 +176,20 @@ class PunicaWrapperGPU(PunicaWrapperBase): + """ + + x = x.view(-1, x.shape[-1]) +- lora_shrink( +- x, +- lora_a_stacked, +- y, +- *self.token_mapping_meta.meta_args(x.size(0)), +- scale, +- ) ++ if current_platform.is_xpu() and XPU_KERNEL_V == 0: ++ for slice_idx in range(len(lora_a_stacked)): ++ self._apply_shrink_decode(y[slice_idx], x, ++ lora_a_stacked[slice_idx], scale) ++ else: ++ meta_args = self.token_mapping_meta.meta_args(x.size(0)) ++ ++ lora_shrink( ++ x, ++ lora_a_stacked, ++ y, ++ *self.token_mapping_meta.meta_args(x.size(0)), ++ scale, ++ ) + + def add_expand(self, + y: torch.Tensor, +@@ -137,17 +229,29 @@ class PunicaWrapperGPU(PunicaWrapperBase): + + assert x.ndim == 3 + assert x.size(0) == len(output_slices) +- num_tokens = x.size(1) # first dimension is the num slices +- +- lora_expand( +- x, +- lora_b_stacked, +- y, +- *self.token_mapping_meta.meta_args(num_tokens), +- offset_start=offset_start, +- add_inputs=True, +- ) + ++ if 
current_platform.is_xpu() and XPU_KERNEL_V == 0: ++ # TODO fuse these kernels ++ for slice_idx in range(len(lora_b_stacked)): ++ self._apply_expand_decode( ++ y, ++ x[slice_idx], ++ lora_b_stacked[slice_idx], ++ offset_start, ++ output_slices[slice_idx], ++ add_inputs=add_inputs, ++ ) ++ offset_start += output_slices[slice_idx] ++ else: ++ num_tokens = x.size(1) # first dimension is the num slices ++ lora_expand( ++ x, ++ lora_b_stacked, ++ y, ++ *self.token_mapping_meta.meta_args(num_tokens), ++ offset_start=offset_start, ++ add_inputs=True, ++ ) + y = y.view_as(y_org) + + def add_lora_embedding(self, +@@ -169,14 +273,18 @@ class PunicaWrapperGPU(PunicaWrapperBase): + add_inputs (bool): Default to True. + """ + +- lora_expand( +- x.unsqueeze(dim=0), +- (lora_b_stacked, ), +- y, +- *self.token_mapping_meta.meta_args(x.size(0)), +- offset_start=0, +- add_inputs=add_inputs, +- ) ++ if current_platform.is_xpu() and XPU_KERNEL_V == 0: ++ bgmv_expand(x, lora_b_stacked, y, self.token_lora_indices, ++ add_inputs) ++ else: ++ lora_expand( ++ x.unsqueeze(dim=0), ++ (lora_b_stacked, ), ++ y, ++ *self.token_mapping_meta.meta_args(x.size(0)), ++ offset_start=0, ++ add_inputs=add_inputs, ++ ) + + def add_lora_linear(self, + y: torch.Tensor, +@@ -279,11 +387,19 @@ class PunicaWrapperGPU(PunicaWrapperBase): + dtype=torch.float32, + device=x.device) + +- lora_shrink(x, [lora_a_stacked], buffer.unsqueeze(dim=0), +- *self.prompt_mapping_meta.meta_args(x.size(0)), scale) +- +- lora_expand(buffer.unsqueeze(dim=0), [lora_b_stacked], +- y, +- *self.prompt_mapping_meta.meta_args(buffer.size(0)), +- add_inputs=True) ++ if current_platform.is_xpu() and XPU_KERNEL_V == 0: ++ bgmv_shrink(x, lora_a_stacked, buffer, self.sampler_indices, scale) ++ bgmv_expand(buffer, ++ lora_b_stacked, ++ y, ++ self.sampler_indices, ++ add_inputs=True) ++ else: ++ lora_shrink(x, [lora_a_stacked], buffer.unsqueeze(dim=0), ++ *self.prompt_mapping_meta.meta_args(x.size(0)), scale) ++ ++ lora_expand(buffer.unsqueeze(dim=0), [lora_b_stacked], ++ y, ++ *self.prompt_mapping_meta.meta_args(buffer.size(0)), ++ add_inputs=True) + y = y.view_as(y_org) +diff --git a/vllm/model_executor/guided_decoding/xgrammar_decoding.py b/vllm/model_executor/guided_decoding/xgrammar_decoding.py +index b44301f1a..d7e600e9b 100644 +--- a/vllm/model_executor/guided_decoding/xgrammar_decoding.py ++++ b/vllm/model_executor/guided_decoding/xgrammar_decoding.py +@@ -10,6 +10,7 @@ from typing import TYPE_CHECKING, Any, List + + import torch + ++import vllm.envs + from vllm.logger import init_logger + + try: +@@ -131,8 +132,13 @@ class GrammarCompilerCache: + encoded_vocab=config_data.encoded_vocab, + metadata=config_data.metadata, + ) ++ cache_size = vllm.envs.VLLM_XGRAMMAR_CACHE_MB * 1024 * 1024 + cls._cache[cache_key] = xgr.GrammarCompiler( +- tokenizer_info, max_threads=config.max_threads) ++ tokenizer_info, ++ max_threads=config.max_threads, ++ cache_enabled=True, ++ cache_limit_bytes=cache_size, ++ ) + + return cls._cache[cache_key] + +diff --git a/vllm/model_executor/layers/fused_moe/layer.py b/vllm/model_executor/layers/fused_moe/layer.py +index 0e35d8a80..c4edc39c1 100644 +--- a/vllm/model_executor/layers/fused_moe/layer.py ++++ b/vllm/model_executor/layers/fused_moe/layer.py +@@ -132,7 +132,14 @@ class UnquantizedFusedMoEMethod(FusedMoEMethodBase, CustomOp): + layer.w2_weight = torch.nn.Parameter(shuffled_w2, + requires_grad=False) + +- if current_platform.is_cpu(): ++ if current_platform.is_xpu(): ++ import intel_extension_for_pytorch as ipex ++ 
layer.ipex_fusion = ipex.llm.modules.GatedMLPMOE( ++ layer.w13_weight, ++ layer.w2_weight, ++ use_prepack=True, ++ ) ++ elif current_platform.is_cpu(): + if current_platform.get_cpu_architecture() == CpuArchEnum.X86: + import intel_extension_for_pytorch as ipex + layer.ipex_fusion = ipex.llm.modules.GatedMLPMOE( +@@ -287,6 +294,30 @@ class UnquantizedFusedMoEMethod(FusedMoEMethodBase, CustomOp): + return layer.hpu_fused_moe(x, layer.w13_weight, layer.w2_weight, + router_logits, top_k) + ++ def forward_xpu( ++ self, ++ layer: torch.nn.Module, ++ x: torch.Tensor, ++ use_grouped_topk: bool, ++ top_k: int, ++ router_logits: torch.Tensor, ++ renormalize: bool, ++ topk_group: Optional[int] = None, ++ num_expert_group: Optional[int] = None, ++ custom_routing_function: Optional[Callable] = None, ++ **kwargs, ++ ): ++ assert custom_routing_function is None ++ return layer.ipex_fusion( ++ x, ++ use_grouped_topk, ++ top_k, ++ router_logits, ++ renormalize, ++ topk_group, ++ num_expert_group, ++ ) ++ + def forward_tpu( + self, + layer: torch.nn.Module, +diff --git a/vllm/model_executor/layers/quantization/fp8.py b/vllm/model_executor/layers/quantization/fp8.py +index 4435644c4..c2320a345 100644 +--- a/vllm/model_executor/layers/quantization/fp8.py ++++ b/vllm/model_executor/layers/quantization/fp8.py +@@ -169,7 +169,7 @@ class Fp8LinearMethod(LinearMethodBase): + self.use_marlin = (not current_platform.has_device_capability(89) + or envs.VLLM_TEST_FORCE_FP8_MARLIN) + # Disable marlin for rocm +- if current_platform.is_rocm(): ++ if current_platform.is_rocm() or current_platform.is_xpu(): + self.use_marlin = False + + self.block_quant = self.quant_config.weight_block_size is not None +@@ -326,7 +326,10 @@ class Fp8LinearMethod(LinearMethodBase): + layer.logical_widths) + + # Update the layer with the new values. 
+- layer.weight = Parameter(qweight.t(), requires_grad=False) ++ if current_platform.is_xpu(): ++ layer.weight = Parameter(qweight, requires_grad=False) ++ else: ++ layer.weight = Parameter(qweight.t(), requires_grad=False) + layer.weight_scale = Parameter(weight_scale, requires_grad=False) + layer.input_scale = None + +@@ -385,6 +388,11 @@ class Fp8LinearMethod(LinearMethodBase): + layer: torch.nn.Module, + x: torch.Tensor, + bias: Optional[torch.Tensor] = None) -> torch.Tensor: ++ if current_platform.is_xpu(): ++ weight = layer.weight.data ++ scale = layer.weight_scale.data ++ output = torch.ops.torch_ipex.fp8_gemm2(x, False, weight, True, None, x.dtype, None, scale, bias, False) ++ return output + + if self.use_marlin: + return apply_fp8_marlin_linear( +@@ -667,6 +675,21 @@ class Fp8MoEMethod(FusedMoEMethodBase): + requires_grad=False) + layer.w2_weight = torch.nn.Parameter(shuffled_w2, + requires_grad=False) ++ ++ if current_platform.is_xpu(): ++ import intel_extension_for_pytorch as ipex ++ layer.ipex_fusion = ipex.llm.modules.GatedMLPMOE( ++ layer.w13_weight, ++ layer.w2_weight, ++ w1_scale_inv=(layer.w13_weight_scale_inv ++ if self.block_quant else layer.w13_weight_scale), ++ w2_scale_inv=(layer.w2_weight_scale_inv ++ if self.block_quant else layer.w2_weight_scale), ++ a1_scale_inv=layer.w13_input_scale, ++ a2_scale_inv=layer.w2_input_scale, ++ use_prepack=True, ++ ) ++ + return + + # If checkpoint is fp8, we need to handle that the +@@ -776,6 +799,23 @@ class Fp8MoEMethod(FusedMoEMethodBase): + apply_router_weight_on_input: bool = False, + activation: str = "silu", + ) -> torch.Tensor: ++ if current_platform.is_xpu(): ++ return self.forward_xpu( ++ x=x, ++ layer=layer, ++ router_logits=router_logits, ++ top_k=top_k, ++ renormalize=renormalize, ++ use_grouped_topk=use_grouped_topk, ++ topk_group=topk_group, ++ num_expert_group=num_expert_group, ++ global_num_experts=global_num_experts, ++ expert_map=expert_map, ++ custom_routing_function=custom_routing_function, ++ scoring_func=scoring_func, ++ e_score_correction_bias=e_score_correction_bias, ++ activation=activation, ++ apply_router_weight_on_input=apply_router_weight_on_input) + from vllm.model_executor.layers.fused_moe import fused_experts + + topk_weights, topk_ids = FusedMoE.select_experts( +@@ -813,6 +853,29 @@ class Fp8MoEMethod(FusedMoEMethodBase): + allow_deep_gemm=self.allow_deep_gemm, + ) + ++ def forward_xpu( ++ self, ++ layer: torch.nn.Module, ++ x: torch.Tensor, ++ use_grouped_topk: bool, ++ top_k: int, ++ router_logits: torch.Tensor, ++ renormalize: bool, ++ topk_group: Optional[int] = None, ++ num_expert_group: Optional[int] = None, ++ custom_routing_function: Optional[Callable] = None, ++ **kwargs, ++ ): ++ assert custom_routing_function is None ++ return layer.ipex_fusion( ++ x, ++ use_grouped_topk, ++ top_k, ++ router_logits, ++ renormalize, ++ topk_group, ++ num_expert_group, ++ ) + + class Fp8KVCacheMethod(BaseKVCacheMethod): + """ +diff --git a/vllm/model_executor/layers/quantization/ipex_quant.py b/vllm/model_executor/layers/quantization/ipex_quant.py +index c09cc13cb..309664e8d 100644 +--- a/vllm/model_executor/layers/quantization/ipex_quant.py ++++ b/vllm/model_executor/layers/quantization/ipex_quant.py +@@ -3,6 +3,7 @@ + from typing import Any, Dict, List, Optional + + import torch ++import time + + from vllm.model_executor.layers.linear import (LinearBase, LinearMethodBase, + UnquantizedLinearMethod) +@@ -11,11 +12,19 @@ from vllm.model_executor.layers.quantization.awq import (AWQLinearMethod, + from 
vllm.model_executor.layers.quantization.base_config import ( + QuantizationConfig) + from vllm.model_executor.layers.quantization.gptq import GPTQLinearMethod ++from vllm.model_executor.parameter import (ModelWeightParameter, ++ PerTensorScaleParameter) ++from vllm.model_executor.utils import set_weight_attrs + from vllm.platforms import current_platform + +-MIN_IPEX_VERSION = "2.5.0" ++from vllm.model_executor.layers.quantization import register_quantization_config ++from vllm.model_executor.layers.quantization import get_quantization_config ++ + ++MIN_IPEX_VERSION = "2.5.0" ++ACTIVATION_SCHEMES = ["static", "dynamic"] + ++@register_quantization_config("auto-round") + class IPEXConfig(QuantizationConfig): + """INT8 quantization config class using IPEX for the CPU/XPU backend, + including AWQ, GPTQ. +@@ -34,6 +43,7 @@ class IPEXConfig(QuantizationConfig): + modules_to_not_convert: Optional[List[str]] = None, + desc_act: Optional[bool] = None, + lm_head_quantized: Optional[bool] = None, ++ is_checkpoint_fp8_serialized: bool = False, + ) -> None: + super().__init__() + self.method = method +@@ -43,14 +53,15 @@ class IPEXConfig(QuantizationConfig): + self.desc_act = desc_act + self.lm_head_quantized = lm_head_quantized + self.pack_factor = 32 // self.weight_bits +- +- if self.weight_bits not in [4]: +- raise ValueError(f"IPEX quantization supports weight bits [4], " +- f"but got {self.weight_bits}.") +- +- if self.method not in ["awq", "gptq"]: +- raise ValueError(f"IPEX quantization supports [awq, gptq], " ++ self.is_checkpoint_fp8_serialized = is_checkpoint_fp8_serialized ++ if self.method not in ["awq", "gptq", "auto-round", "fp8"]: ++ raise ValueError(f"IPEX quantization supports [awq, gptq, auto-round, fp8], " + f"but got {self.method}.") ++ if is_checkpoint_fp8_serialized: ++ self.quant_method = "fp8" ++ print("Detected fp8 checkpoint. 
Please note that the " ++ "format is experimental and subject to change.") ++ self.activation_scheme = "dynamic" + + def __repr__(self) -> str: + return (f"IPEXConfig(method={self.method}," +@@ -92,9 +103,13 @@ class IPEXConfig(QuantizationConfig): + group_size = cls.get_from_keys(config, ["group_size"]) + lm_head_quantized = cls.get_from_keys_or(config, ["lm_head"], + default=False) ++ data_type = cls.get_from_keys_or(config, ["data_type"], ++ default="int4") ++ is_checkpoint_fp8_serialized = ("fp8" in data_type) ++ + desc_act = cls.get_from_keys_or(config, ["desc_act"], default=False) + return cls(method, weight_bits, group_size, [], desc_act, +- lm_head_quantized) ++ lm_head_quantized, is_checkpoint_fp8_serialized) + + @classmethod + def override_quantization_method(cls, hf_quant_cfg, +@@ -104,7 +119,7 @@ class IPEXConfig(QuantizationConfig): + + quant_method = hf_quant_cfg.get("quant_method", "").lower() + +- if quant_method in ["awq", "gptq"]: ++ if quant_method in ["awq", "gptq", "auto-round", "fp8"]: + return cls.get_name() + + return None +@@ -118,8 +133,86 @@ class IPEXConfig(QuantizationConfig): + return IPEXAWQLinearMethod(self) + if self.method == "gptq": + return IPEXGPTQLinearMethod(self) ++ if self.method == "auto-round" or self.method == "fp8": ++ return IPEXAutoRoundLinearMethod(self) + return None + ++get_quantization_config("auto-round") ++ ++class IPEXAutoRoundLinearMethod(LinearMethodBase): ++ def __init__(self, quant_config: IPEXConfig): ++ self.quant_config = quant_config ++ self.out_dtype = torch.get_default_dtype() ++ ++ def create_weights( ++ self, ++ layer: torch.nn.Module, ++ input_size_per_partition: int, ++ output_partition_sizes: List[int], ++ input_size: int, ++ output_size: int, ++ params_dtype: torch.dtype, ++ **extra_weight_attrs, ++ ): ++ # maybe_create_device_identity() ++ ++ output_size_per_partition = sum(output_partition_sizes) ++ weight_loader = extra_weight_attrs.get("weight_loader") ++ layer.logical_widths = output_partition_sizes ++ ++ layer.input_size_per_partition = input_size_per_partition ++ layer.output_size_per_partition = output_size_per_partition ++ layer.orig_dtype = params_dtype ++ # WEIGHT ++ weight_dtype = (torch.float8_e5m2 ++ if self.quant_config.is_checkpoint_fp8_serialized else ++ params_dtype) ++ ++ weight = ModelWeightParameter(data=torch.empty( ++ output_size_per_partition, ++ input_size_per_partition, ++ dtype=weight_dtype), ++ input_dim=1, ++ output_dim=0, ++ weight_loader=weight_loader) ++ layer.register_parameter("weight", weight) ++ ++ # If checkpoint is serialized fp8, load them. ++ # Otherwise, wait until process_weights_after_loading. 
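With the register_quantization_config("auto-round") hook and the fp8/auto-round handling above, such a checkpoint could be requested explicitly; a hedged sketch, where the model id is a placeholder and an IPEX build providing torch_ipex.fp8_gemm2 is assumed:

    from vllm import LLM

    # On XPU this should route linear layers through IPEXAutoRoundLinearMethod.
    llm = LLM(model="org/some-auto-round-quantized-model",  # placeholder model id
              quantization="auto-round")
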
++ if self.quant_config.is_checkpoint_fp8_serialized: ++ # WEIGHT SCALE ++ scale = PerTensorScaleParameter( ++ data=torch.empty(len(output_partition_sizes), ++ dtype=torch.float32), ++ weight_loader=weight_loader, ++ ) ++ scale[:] = torch.finfo(torch.float32).min ++ set_weight_attrs(scale, {"scale_type": "weight_scale"}) ++ set_weight_attrs(scale, {"needs_scalar_to_array": True}) ++ layer.register_parameter("weight_scale", scale) ++ # INPUT ACTIVATION SCALE ++ if self.quant_config.activation_scheme == "static": ++ scale = PerTensorScaleParameter(data=torch.empty( ++ 1, dtype=torch.float32), ++ weight_loader=weight_loader) ++ ++ scale[:] = torch.finfo(torch.float32).min ++ set_weight_attrs(scale, {"scale_type": "input_scale"}) ++ layer.register_parameter("input_scale", scale) ++ else: ++ layer.register_parameter("input_scale", None) ++ ++ def process_weights_after_loading(self, layer: torch.nn.Module) -> None: ++ pass ++ ++ def apply(self, ++ layer: torch.nn.Module, ++ x: torch.Tensor, ++ bias: Optional[torch.Tensor] = None) -> torch.Tensor: ++ weight = layer.weight.data ++ scale = layer.weight_scale.data ++ output = torch.ops.torch_ipex.fp8_gemm2(x, False, weight, True, None, x.dtype, None, scale, bias, False) ++ return output + + class IPEXGPTQLinearMethod(GPTQLinearMethod): + """GPTQ linear method using IPEX for the CPU/XPU backend. +@@ -129,8 +222,6 @@ class IPEXGPTQLinearMethod(GPTQLinearMethod): + self.quant_config = quant_config # type: ignore + + def process_weights_after_loading(self, layer: torch.nn.Module) -> None: +- bias = layer.bias if not layer.skip_bias_add else None +- + try: + import intel_extension_for_pytorch as ipex + if ipex.__version__ < MIN_IPEX_VERSION: +@@ -169,7 +260,7 @@ class IPEXGPTQLinearMethod(GPTQLinearMethod): + layer.ipex_output_size, + qconfig=qconfig, + g_idx=g_idx, +- bias=bias, ++ bias=None, + group_size=self.quant_config.group_size, + quant_method=IPEXConfig.IPEX_QUANT_METHOD_MAP["gptq"] + ) +diff --git a/vllm/model_executor/models/baichuan.py b/vllm/model_executor/models/baichuan.py +index 6a3112b5f..bb22a9b7f 100644 +--- a/vllm/model_executor/models/baichuan.py ++++ b/vllm/model_executor/models/baichuan.py +@@ -42,7 +42,8 @@ from vllm.model_executor.layers.rotary_embedding import get_rope + from vllm.model_executor.layers.sampler import SamplerOutput, get_sampler + from vllm.model_executor.layers.vocab_parallel_embedding import ( + ParallelLMHead, VocabParallelEmbedding) +-from vllm.model_executor.model_loader.weight_utils import default_weight_loader ++from vllm.model_executor.model_loader.weight_utils import ( ++ default_weight_loader, row_parallel_weight_loader) + from vllm.model_executor.sampling_metadata import SamplingMetadata + from vllm.sequence import IntermediateTensors + +@@ -384,7 +385,7 @@ class BaiChuanBaseForCausalLM(nn.Module, SupportsLoRA, SupportsPP, + lora_config = vllm_config.lora_config + self.config = config + self.lora_config = lora_config +- ++ self.tp_size = get_tensor_model_parallel_world_size() + self.quant_config = quant_config + self.model = BaiChuanModel(vllm_config=vllm_config, + prefix=prefix, +@@ -447,8 +448,10 @@ class BaiChuanBaseForCausalLM(nn.Module, SupportsLoRA, SupportsPP, + is_baichuan2 = self.config.vocab_size == 125696 + if is_baichuan2: + loaded_weight = torch.nn.functional.normalize(loaded_weight) +- +- default_weight_loader(param, loaded_weight) ++ if self.tp_size > 1: ++ row_parallel_weight_loader(param, loaded_weight) ++ else: ++ default_weight_loader(param, loaded_weight) + + + class 
BaichuanForCausalLM(BaiChuanBaseForCausalLM): +diff --git a/vllm/model_executor/models/gemma2.py b/vllm/model_executor/models/gemma2.py +index d125c666f..6fd3f1a02 100644 +--- a/vllm/model_executor/models/gemma2.py ++++ b/vllm/model_executor/models/gemma2.py +@@ -146,10 +146,7 @@ class Gemma2Attention(nn.Module): + # reference: + # https://github.com/huggingface/transformers/blob/54be2d7ae87e873482b984cc956e165ca4dc0ba3/src/transformers/models/gemma2/modeling_gemma2.py#L312 # noqa + layer_idx = extract_layer_index(prefix) +- use_sliding_window = (layer_idx % 2 == 0 and +- config.interleaved_sliding_window is not None) +- sliding_window = config.interleaved_sliding_window if \ +- use_sliding_window else None ++ sliding_window = config.sliding_window if not bool(layer_idx % 2) else None + self.attn = Attention(self.num_heads, + self.head_dim, + self.scaling, +diff --git a/vllm/model_executor/models/gemma3.py b/vllm/model_executor/models/gemma3.py +index fb8eccc55..875f0a80c 100644 +--- a/vllm/model_executor/models/gemma3.py ++++ b/vllm/model_executor/models/gemma3.py +@@ -153,7 +153,7 @@ class Gemma3Attention(nn.Module): + # Local attention. Override the values in config.json. + self.rope_theta = config.rope_local_base_freq + self.rope_scaling = {"rope_type": "default"} +- self.sliding_window = config.interleaved_sliding_window ++ self.sliding_window = config.sliding_window + else: + # Global attention. Use the values in config.json. + self.rope_theta = config.rope_theta +diff --git a/vllm/model_executor/models/llama.py b/vllm/model_executor/models/llama.py +index 4a5982ecb..440246f96 100644 +--- a/vllm/model_executor/models/llama.py ++++ b/vllm/model_executor/models/llama.py +@@ -402,6 +402,9 @@ class LlamaModel(nn.Module): + if "scale" in name: + # Remapping the name of FP8 kv-scale. 
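The decoder_layer_type hook added to Qwen2Model above is what lets a derived model reuse the Qwen2 backbone while swapping in its own layer class; a minimal sketch of that pattern (the My* class names are hypothetical):

    from vllm.model_executor.models.qwen2 import Qwen2DecoderLayer, Qwen2Model

    class MyDecoderLayer(Qwen2DecoderLayer):
        """Hypothetical layer that tweaks attention or MLP wiring."""

    class MyModel(Qwen2Model):
        def __init__(self, *, vllm_config, prefix: str = ""):
            super().__init__(vllm_config=vllm_config,
                             prefix=prefix,
                             decoder_layer_type=MyDecoderLayer)
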
+ name = maybe_remap_kv_scale_name(name, params_dict) ++ # temp fix for unit scale INC model, will can be removed ++ if "proj.scale" in name and not "scales" in name: ++ name = name.replace("scale", "weight_scale") + if name is None: + continue + for param_name, weight_name, shard_id in stacked_params_mapping: +diff --git a/vllm/model_executor/models/qwen2.py b/vllm/model_executor/models/qwen2.py +index c4d02e5dd..950a09a8e 100644 +--- a/vllm/model_executor/models/qwen2.py ++++ b/vllm/model_executor/models/qwen2.py +@@ -263,7 +263,11 @@ class Qwen2DecoderLayer(nn.Module): + }) + class Qwen2Model(nn.Module): + +- def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): ++ def __init__(self, ++ *, ++ vllm_config: VllmConfig, ++ prefix: str = "", ++ decoder_layer_type: type[nn.Module] = Qwen2DecoderLayer): + super().__init__() + + config = vllm_config.model_config.hf_config +@@ -297,9 +301,11 @@ class Qwen2Model(nn.Module): + else: + self.embed_tokens = PPMissingLayer() + ++ # Use the provided decoder layer type or default to Qwen2DecoderLayer ++ decoder_layer_type = decoder_layer_type or Qwen2DecoderLayer + self.start_layer, self.end_layer, self.layers = make_layers( + config.num_hidden_layers, +- lambda prefix: Qwen2DecoderLayer(config=config, ++ lambda prefix: decoder_layer_type(config=config, + cache_config=cache_config, + quant_config=quant_config, + prefix=prefix), +diff --git a/vllm/model_executor/models/qwen2_5_vl.py b/vllm/model_executor/models/qwen2_5_vl.py +index 1e6ff1fec..0c50c7343 100644 +--- a/vllm/model_executor/models/qwen2_5_vl.py ++++ b/vllm/model_executor/models/qwen2_5_vl.py +@@ -1073,10 +1073,10 @@ class Qwen2_5_VLForConditionalGeneration(nn.Module, SupportsMultiModal, + if image_input is None and video_input is None: + inputs_embeds = None + else: +- if uses_mrope(self.config): +- assert positions.ndim == 2 and positions.size(0) == 3, ( +- "multimodal section rotary embedding requires " +- f"(3, seq_len) positions, but got {positions.size()}") ++ # if uses_mrope(self.config): ++ # assert positions.ndim == 2 and positions.size(0) == 3, ( ++ # "multimodal section rotary embedding requires " ++ # f"(3, seq_len) positions, but got {positions.size()}") + inputs_embeds = self.get_input_embeddings_v0( + input_ids, + image_input=image_input, +diff --git a/vllm/model_executor/models/qwen3.py b/vllm/model_executor/models/qwen3.py +new file mode 100644 +index 000000000..9c14038e6 +--- /dev/null ++++ b/vllm/model_executor/models/qwen3.py +@@ -0,0 +1,329 @@ ++# SPDX-License-Identifier: Apache-2.0 ++ ++# Copyright 2024 The Qwen team. ++# Copyright 2023 The vLLM team. ++# Copyright 2022 EleutherAI and the HuggingFace Inc. team. All rights reserved. ++# ++# This code is based on EleutherAI's GPT-NeoX library and the GPT-NeoX ++# and OPT implementations in this library. It has been modified from its ++# original forms to accommodate minor architectural differences compared ++# to GPT-NeoX and OPT used by the Meta AI team that trained the model. ++# ++# Licensed under the Apache License, Version 2.0 (the "License"); ++# you may not use this file except in compliance with the License. ++# You may obtain a copy of the License at ++# ++# http://www.apache.org/licenses/LICENSE-2.0 ++# ++# Unless required by applicable law or agreed to in writing, software ++# distributed under the License is distributed on an "AS IS" BASIS, ++# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
++# See the License for the specific language governing permissions and ++# limitations under the License. ++"""Inference-only Qwen3 model compatible with HuggingFace weights.""" ++from typing import Iterable, Optional, Set, Tuple, Union ++ ++import torch ++from torch import nn ++from transformers import Qwen3Config ++ ++from vllm.attention import Attention, AttentionType ++from vllm.compilation.decorators import support_torch_compile ++from vllm.config import CacheConfig, VllmConfig ++from vllm.distributed import get_pp_group, get_tensor_model_parallel_world_size ++from vllm.logger import init_logger ++from vllm.model_executor.layers.layernorm import RMSNorm ++from vllm.model_executor.layers.linear import (QKVParallelLinear, ++ RowParallelLinear) ++from vllm.model_executor.layers.logits_processor import LogitsProcessor ++from vllm.model_executor.layers.quantization import QuantizationConfig ++from vllm.model_executor.layers.rotary_embedding import get_rope ++from vllm.model_executor.layers.sampler import SamplerOutput, get_sampler ++from vllm.model_executor.layers.vocab_parallel_embedding import ParallelLMHead ++from vllm.model_executor.sampling_metadata import SamplingMetadata ++from vllm.sequence import IntermediateTensors ++ ++from .interfaces import SupportsLoRA, SupportsPP ++from .qwen2 import Qwen2MLP as Qwen3MLP ++from .qwen2 import Qwen2Model ++from .utils import AutoWeightsLoader, PPMissingLayer, maybe_prefix ++ ++logger = init_logger(__name__) ++ ++ ++class Qwen3Attention(nn.Module): ++ ++ def __init__(self, ++ hidden_size: int, ++ num_heads: int, ++ num_kv_heads: int, ++ max_position: int = 4096 * 32, ++ head_dim: Optional[int] = None, ++ rms_norm_eps: float = 1e-06, ++ qkv_bias: bool = False, ++ rope_theta: float = 10000, ++ cache_config: Optional[CacheConfig] = None, ++ quant_config: Optional[QuantizationConfig] = None, ++ rope_scaling: Optional[Tuple] = None, ++ prefix: str = "", ++ attn_type: str = AttentionType.DECODER) -> None: ++ super().__init__() ++ self.hidden_size = hidden_size ++ tp_size = get_tensor_model_parallel_world_size() ++ self.total_num_heads = num_heads ++ assert self.total_num_heads % tp_size == 0 ++ self.num_heads = self.total_num_heads // tp_size ++ self.total_num_kv_heads = num_kv_heads ++ if self.total_num_kv_heads >= tp_size: ++ # Number of KV heads is greater than TP size, so we partition ++ # the KV heads across multiple tensor parallel GPUs. ++ assert self.total_num_kv_heads % tp_size == 0 ++ else: ++ # Number of KV heads is less than TP size, so we replicate ++ # the KV heads across multiple tensor parallel GPUs. 
++ assert tp_size % self.total_num_kv_heads == 0 ++ self.num_kv_heads = max(1, self.total_num_kv_heads // tp_size) ++ self.head_dim = head_dim or hidden_size // self.total_num_heads ++ self.q_size = self.num_heads * self.head_dim ++ self.kv_size = self.num_kv_heads * self.head_dim ++ self.scaling = self.head_dim**-0.5 ++ self.rope_theta = rope_theta ++ ++ self.qkv_proj = QKVParallelLinear( ++ hidden_size, ++ self.head_dim, ++ self.total_num_heads, ++ self.total_num_kv_heads, ++ bias=qkv_bias, ++ quant_config=quant_config, ++ prefix=f"{prefix}.qkv_proj", ++ ) ++ self.o_proj = RowParallelLinear( ++ self.total_num_heads * self.head_dim, ++ hidden_size, ++ bias=False, ++ quant_config=quant_config, ++ prefix=f"{prefix}.o_proj", ++ ) ++ ++ self.rotary_emb = get_rope( ++ self.head_dim, ++ rotary_dim=self.head_dim, ++ max_position=max_position, ++ base=self.rope_theta, ++ rope_scaling=rope_scaling, ++ ) ++ self.attn = Attention(self.num_heads, ++ self.head_dim, ++ self.scaling, ++ num_kv_heads=self.num_kv_heads, ++ cache_config=cache_config, ++ quant_config=quant_config, ++ prefix=f"{prefix}.attn", ++ attn_type=attn_type) ++ self.q_norm = RMSNorm(self.head_dim, eps=rms_norm_eps) ++ self.k_norm = RMSNorm(self.head_dim, eps=rms_norm_eps) ++ ++ def forward( ++ self, ++ positions: torch.Tensor, ++ hidden_states: torch.Tensor, ++ ) -> torch.Tensor: ++ qkv, _ = self.qkv_proj(hidden_states) ++ q, k, v = qkv.split([self.q_size, self.kv_size, self.kv_size], dim=-1) ++ # Add qk-norm ++ q_by_head = q.view(*q.shape[:-1], q.shape[-1] // self.head_dim, ++ self.head_dim) ++ q_by_head = self.q_norm.forward_native(q_by_head) ++ q = q_by_head.view(q.shape) ++ k_by_head = k.view(*k.shape[:-1], k.shape[-1] // self.head_dim, ++ self.head_dim) ++ k_by_head = self.k_norm.forward_native(k_by_head) ++ k = k_by_head.view(k.shape) ++ q, k = self.rotary_emb(positions, q, k) ++ attn_output = self.attn(q, k, v) ++ output, _ = self.o_proj(attn_output) ++ return output ++ ++ ++class Qwen3DecoderLayer(nn.Module): ++ ++ def __init__( ++ self, ++ config: Qwen3Config, ++ cache_config: Optional[CacheConfig] = None, ++ quant_config: Optional[QuantizationConfig] = None, ++ prefix: str = "", ++ ) -> None: ++ super().__init__() ++ self.hidden_size = config.hidden_size ++ # Requires transformers > 4.32.0 ++ rope_theta = getattr(config, "rope_theta", 1000000) ++ rope_scaling = getattr(config, "rope_scaling", None) ++ ++ # By default, Qwen3 uses causal attention as it is a decoder-only model. ++ # You can override the HF config with `is_causal=False` to enable ++ # bidirectional attention, which is used in some embedding models ++ # (e.g. 
Alibaba-NLP/gte-Qwen3-7B-instruct) ++ if getattr(config, "is_causal", True): ++ attn_type = AttentionType.DECODER ++ else: ++ attn_type = AttentionType.ENCODER_ONLY ++ ++ self.self_attn = Qwen3Attention( ++ hidden_size=self.hidden_size, ++ num_heads=config.num_attention_heads, ++ max_position=config.max_position_embeddings, ++ num_kv_heads=config.num_key_value_heads, ++ rope_theta=rope_theta, ++ rms_norm_eps=config.rms_norm_eps, ++ qkv_bias=getattr(config, 'attention_bias', False), ++ head_dim=getattr(config, 'head_dim', None), ++ cache_config=cache_config, ++ quant_config=quant_config, ++ rope_scaling=rope_scaling, ++ prefix=f"{prefix}.self_attn", ++ attn_type=attn_type, ++ ) ++ self.mlp = Qwen3MLP( ++ hidden_size=self.hidden_size, ++ intermediate_size=config.intermediate_size, ++ hidden_act=config.hidden_act, ++ quant_config=quant_config, ++ prefix=f"{prefix}.mlp", ++ ) ++ self.input_layernorm = RMSNorm(config.hidden_size, ++ eps=config.rms_norm_eps) ++ self.post_attention_layernorm = RMSNorm(config.hidden_size, ++ eps=config.rms_norm_eps) ++ ++ def forward( ++ self, ++ positions: torch.Tensor, ++ hidden_states: torch.Tensor, ++ residual: Optional[torch.Tensor], ++ ) -> Tuple[torch.Tensor, torch.Tensor]: ++ # Self Attention ++ if residual is None: ++ residual = hidden_states ++ hidden_states = self.input_layernorm(hidden_states) ++ else: ++ hidden_states, residual = self.input_layernorm( ++ hidden_states, residual) ++ hidden_states = self.self_attn( ++ positions=positions, ++ hidden_states=hidden_states, ++ ) ++ ++ # Fully Connected ++ hidden_states, residual = self.post_attention_layernorm( ++ hidden_states, residual) ++ hidden_states = self.mlp(hidden_states) ++ return hidden_states, residual ++ ++ ++ALL_DECODER_LAYER_TYPES = { ++ "attention": Qwen3DecoderLayer, ++} ++ ++ ++@support_torch_compile( ++ dynamic_arg_dims={ ++ "input_ids": 0, ++ # positions is of shape (3, seq_len) if mrope is enabled for qwen2-vl, ++ # otherwise (seq_len, ). 
++ "positions": -1, ++ "intermediate_tensors": 0, ++ "inputs_embeds": 0, ++ }) ++class Qwen3Model(Qwen2Model): ++ ++ def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): ++ super().__init__(vllm_config=vllm_config, ++ prefix=prefix, ++ decoder_layer_type=Qwen3DecoderLayer) ++ ++ ++class Qwen3ForCausalLM(nn.Module, SupportsLoRA, SupportsPP): ++ packed_modules_mapping = { ++ "qkv_proj": [ ++ "q_proj", ++ "k_proj", ++ "v_proj", ++ ], ++ "gate_up_proj": [ ++ "gate_proj", ++ "up_proj", ++ ], ++ } ++ ++ def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): ++ super().__init__() ++ config = vllm_config.model_config.hf_config ++ quant_config = vllm_config.quant_config ++ lora_config = vllm_config.lora_config ++ ++ self.config = config ++ self.lora_config = lora_config ++ ++ self.quant_config = quant_config ++ self.model = Qwen3Model(vllm_config=vllm_config, ++ prefix=maybe_prefix(prefix, "model")) ++ ++ if get_pp_group().is_last_rank: ++ if config.tie_word_embeddings: ++ self.lm_head = self.model.embed_tokens ++ else: ++ self.lm_head = ParallelLMHead(config.vocab_size, ++ config.hidden_size, ++ quant_config=quant_config, ++ prefix=maybe_prefix( ++ prefix, "lm_head")) ++ else: ++ self.lm_head = PPMissingLayer() ++ ++ self.logits_processor = LogitsProcessor(config.vocab_size) ++ self.sampler = get_sampler() ++ ++ self.make_empty_intermediate_tensors = ( ++ self.model.make_empty_intermediate_tensors) ++ ++ def get_input_embeddings(self, input_ids: torch.Tensor) -> torch.Tensor: ++ return self.model.get_input_embeddings(input_ids) ++ ++ def forward( ++ self, ++ input_ids: torch.Tensor, ++ positions: torch.Tensor, ++ intermediate_tensors: Optional[IntermediateTensors] = None, ++ inputs_embeds: Optional[torch.Tensor] = None, ++ ) -> Union[torch.Tensor, IntermediateTensors]: ++ hidden_states = self.model(input_ids, positions, intermediate_tensors, ++ inputs_embeds) ++ return hidden_states ++ ++ def compute_logits( ++ self, ++ hidden_states: torch.Tensor, ++ sampling_metadata: SamplingMetadata, ++ ) -> Optional[torch.Tensor]: ++ logits = self.logits_processor(self.lm_head, hidden_states, ++ sampling_metadata) ++ return logits ++ ++ def sample( ++ self, ++ logits: torch.Tensor, ++ sampling_metadata: SamplingMetadata, ++ ) -> Optional[SamplerOutput]: ++ next_tokens = self.sampler(logits, sampling_metadata) ++ return next_tokens ++ ++ def load_weights(self, weights: Iterable[Tuple[str, ++ torch.Tensor]]) -> Set[str]: ++ loader = AutoWeightsLoader( ++ self, ++ skip_prefixes=(["lm_head."] ++ if self.config.tie_word_embeddings else None), ++ ) ++ return loader.load_weights(weights) +diff --git a/vllm/model_executor/models/qwen3_moe.py b/vllm/model_executor/models/qwen3_moe.py +new file mode 100644 +index 000000000..390bb7adf +--- /dev/null ++++ b/vllm/model_executor/models/qwen3_moe.py +@@ -0,0 +1,531 @@ ++# SPDX-License-Identifier: Apache-2.0 ++ ++# Copyright 2024 The Qwen team. ++# Copyright 2023 The vLLM team. ++# Copyright 2022 EleutherAI and the HuggingFace Inc. team. All rights reserved. ++# ++# This code is based on EleutherAI's GPT-NeoX library and the GPT-NeoX ++# and OPT implementations in this library. It has been modified from its ++# original forms to accommodate minor architectural differences compared ++# to GPT-NeoX and OPT used by the Meta AI team that trained the model. ++# ++# Licensed under the Apache License, Version 2.0 (the "License"); ++# you may not use this file except in compliance with the License. 
++# You may obtain a copy of the License at ++# ++# http://www.apache.org/licenses/LICENSE-2.0 ++# ++# Unless required by applicable law or agreed to in writing, software ++# distributed under the License is distributed on an "AS IS" BASIS, ++# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. ++# See the License for the specific language governing permissions and ++# limitations under the License. ++"""Inference-only Qwen3MoE model compatible with HuggingFace weights.""" ++from typing import Any, Dict, Iterable, Optional, Set, Tuple, Union ++ ++import torch ++from torch import nn ++from transformers import PretrainedConfig ++ ++from vllm.attention import Attention ++from vllm.compilation.decorators import support_torch_compile ++from vllm.config import CacheConfig, VllmConfig ++from vllm.distributed import (get_pp_group, ++ get_tensor_model_parallel_world_size, ++ tensor_model_parallel_all_reduce) ++from vllm.logger import init_logger ++from vllm.model_executor.layers.activation import SiluAndMul ++from vllm.model_executor.layers.fused_moe import FusedMoE ++from vllm.model_executor.layers.layernorm import RMSNorm ++from vllm.model_executor.layers.linear import (MergedColumnParallelLinear, ++ QKVParallelLinear, ++ ReplicatedLinear, ++ RowParallelLinear) ++from vllm.model_executor.layers.logits_processor import LogitsProcessor ++from vllm.model_executor.layers.quantization import QuantizationConfig ++from vllm.model_executor.layers.rotary_embedding import get_rope ++from vllm.model_executor.layers.sampler import SamplerOutput, get_sampler ++from vllm.model_executor.layers.vocab_parallel_embedding import ( ++ ParallelLMHead, VocabParallelEmbedding) ++from vllm.model_executor.model_loader.weight_utils import default_weight_loader ++from vllm.model_executor.sampling_metadata import SamplingMetadata ++from vllm.sequence import IntermediateTensors ++ ++from .interfaces import SupportsPP ++from .utils import (extract_layer_index, is_pp_missing_parameter, ++ make_empty_intermediate_tensors_factory, make_layers, ++ maybe_prefix) ++ ++logger = init_logger(__name__) ++ ++ ++class Qwen3MoeMLP(nn.Module): ++ ++ def __init__( ++ self, ++ hidden_size: int, ++ intermediate_size: int, ++ hidden_act: str, ++ quant_config: Optional[QuantizationConfig] = None, ++ reduce_results: bool = True, ++ prefix: str = "", ++ ) -> None: ++ super().__init__() ++ self.gate_up_proj = MergedColumnParallelLinear( ++ hidden_size, [intermediate_size] * 2, ++ bias=False, ++ quant_config=quant_config, ++ prefix=f"{prefix}.gate_up_proj") ++ self.down_proj = RowParallelLinear(intermediate_size, ++ hidden_size, ++ bias=False, ++ quant_config=quant_config, ++ reduce_results=reduce_results, ++ prefix=f"{prefix}.down_proj") ++ if hidden_act != "silu": ++ raise ValueError(f"Unsupported activation: {hidden_act}. 
" ++ "Only silu is supported for now.") ++ self.act_fn = SiluAndMul() ++ ++ def forward(self, x): ++ gate_up, _ = self.gate_up_proj(x) ++ x = self.act_fn(gate_up) ++ x, _ = self.down_proj(x) ++ return x ++ ++ ++class Qwen3MoeSparseMoeBlock(nn.Module): ++ ++ def __init__( ++ self, ++ config: PretrainedConfig, ++ quant_config: Optional[QuantizationConfig] = None, ++ prefix: str = "", ++ ): ++ super().__init__() ++ self.tp_size = get_tensor_model_parallel_world_size() ++ ++ if self.tp_size > config.num_experts: ++ raise ValueError( ++ f"Tensor parallel size {self.tp_size} is greater than " ++ f"the number of experts {config.num_experts}.") ++ ++ self.experts = FusedMoE(num_experts=config.num_experts, ++ top_k=config.num_experts_per_tok, ++ hidden_size=config.hidden_size, ++ intermediate_size=config.moe_intermediate_size, ++ reduce_results=False, ++ renormalize=config.norm_topk_prob, ++ quant_config=quant_config, ++ prefix=f"{prefix}.experts") ++ ++ self.gate = ReplicatedLinear(config.hidden_size, ++ config.num_experts, ++ bias=False, ++ quant_config=None, ++ prefix=f"{prefix}.gate") ++ ++ def forward(self, hidden_states: torch.Tensor) -> torch.Tensor: ++ # NOTE: hidden_states can have either 1D or 2D shape. ++ orig_shape = hidden_states.shape ++ hidden_dim = hidden_states.shape[-1] ++ hidden_states = hidden_states.view(-1, hidden_dim) ++ ++ # router_logits: (num_tokens, n_experts) ++ router_logits, _ = self.gate(hidden_states) ++ final_hidden_states = self.experts(hidden_states=hidden_states, ++ router_logits=router_logits) ++ final_hidden_states = final_hidden_states ++ if self.tp_size > 1: ++ final_hidden_states = tensor_model_parallel_all_reduce( ++ final_hidden_states) ++ ++ return final_hidden_states.view(orig_shape) ++ ++ ++class Qwen3MoeAttention(nn.Module): ++ ++ def __init__( ++ self, ++ hidden_size: int, ++ num_heads: int, ++ num_kv_heads: int, ++ rope_theta: float = 10000, ++ rope_scaling: Optional[Dict[str, Any]] = None, ++ max_position_embeddings: int = 8192, ++ head_dim: Optional[int] = None, ++ rms_norm_eps: float = 1e-06, ++ qkv_bias: bool = False, ++ cache_config: Optional[CacheConfig] = None, ++ quant_config: Optional[QuantizationConfig] = None, ++ prefix: str = "", ++ ) -> None: ++ super().__init__() ++ self.hidden_size = hidden_size ++ tp_size = get_tensor_model_parallel_world_size() ++ self.total_num_heads = num_heads ++ assert self.total_num_heads % tp_size == 0 ++ self.num_heads = self.total_num_heads // tp_size ++ self.total_num_kv_heads = num_kv_heads ++ if self.total_num_kv_heads >= tp_size: ++ # Number of KV heads is greater than TP size, so we partition ++ # the KV heads across multiple tensor parallel GPUs. ++ assert self.total_num_kv_heads % tp_size == 0 ++ else: ++ # Number of KV heads is less than TP size, so we replicate ++ # the KV heads across multiple tensor parallel GPUs. 
++ assert tp_size % self.total_num_kv_heads == 0 ++ self.num_kv_heads = max(1, self.total_num_kv_heads // tp_size) ++ self.head_dim = head_dim or (hidden_size // self.total_num_heads) ++ self.q_size = self.num_heads * self.head_dim ++ self.kv_size = self.num_kv_heads * self.head_dim ++ self.scaling = self.head_dim**-0.5 ++ self.rope_theta = rope_theta ++ self.max_position_embeddings = max_position_embeddings ++ ++ self.qkv_proj = QKVParallelLinear(hidden_size, ++ self.head_dim, ++ self.total_num_heads, ++ self.total_num_kv_heads, ++ bias=qkv_bias, ++ quant_config=quant_config, ++ prefix=f"{prefix}.qkv_proj") ++ ++ self.o_proj = RowParallelLinear(self.total_num_heads * self.head_dim, ++ hidden_size, ++ bias=False, ++ quant_config=quant_config, ++ prefix=f"{prefix}.o_proj") ++ ++ self.rotary_emb = get_rope( ++ self.head_dim, ++ rotary_dim=self.head_dim, ++ max_position=max_position_embeddings, ++ base=rope_theta, ++ rope_scaling=rope_scaling, ++ ) ++ self.attn = Attention(self.num_heads, ++ self.head_dim, ++ self.scaling, ++ num_kv_heads=self.num_kv_heads, ++ cache_config=cache_config, ++ quant_config=quant_config, ++ prefix=f"{prefix}.attn") ++ ++ self.q_norm = RMSNorm(self.head_dim, eps=rms_norm_eps) ++ self.k_norm = RMSNorm(self.head_dim, eps=rms_norm_eps) ++ ++ def forward( ++ self, ++ positions: torch.Tensor, ++ hidden_states: torch.Tensor, ++ ) -> torch.Tensor: ++ qkv, _ = self.qkv_proj(hidden_states) ++ q, k, v = qkv.split([self.q_size, self.kv_size, self.kv_size], dim=-1) ++ # Add qk-norm ++ q_by_head = q.view(*q.shape[:-1], q.shape[-1] // self.head_dim, ++ self.head_dim) ++ q_by_head = self.q_norm.forward_native(q_by_head) ++ q = q_by_head.view(q.shape) ++ ++ k_by_head = k.view(*k.shape[:-1], k.shape[-1] // self.head_dim, ++ self.head_dim) ++ k_by_head = self.k_norm.forward_native(k_by_head) ++ k = k_by_head.view(k.shape) ++ q, k = self.rotary_emb(positions, q, k) ++ attn_output = self.attn(q, k, v) ++ output, _ = self.o_proj(attn_output) ++ return output ++ ++ ++class Qwen3MoeDecoderLayer(nn.Module): ++ ++ def __init__( ++ self, ++ config: PretrainedConfig, ++ cache_config: Optional[CacheConfig] = None, ++ quant_config: Optional[QuantizationConfig] = None, ++ prefix: str = "", ++ ) -> None: ++ super().__init__() ++ self.hidden_size = config.hidden_size ++ rope_theta = getattr(config, "rope_theta", 10000) ++ rope_scaling = getattr(config, "rope_scaling", None) ++ max_position_embeddings = getattr(config, "max_position_embeddings", ++ 8192) ++ self.self_attn = Qwen3MoeAttention( ++ hidden_size=self.hidden_size, ++ num_heads=config.num_attention_heads, ++ num_kv_heads=config.num_key_value_heads, ++ rope_theta=rope_theta, ++ rope_scaling=rope_scaling, ++ max_position_embeddings=max_position_embeddings, ++ rms_norm_eps=config.rms_norm_eps, ++ qkv_bias=getattr(config, 'attention_bias', False), ++ head_dim=getattr(config, 'head_dim', None), ++ cache_config=cache_config, ++ quant_config=quant_config, ++ prefix=f"{prefix}.self_attn", ++ ) ++ ++ # `mlp_only_layers` in the config. 
++ layer_idx = extract_layer_index(prefix) ++ mlp_only_layers = ([] if not hasattr(config, "mlp_only_layers") else ++ config.mlp_only_layers) ++ if (layer_idx not in mlp_only_layers) and ( ++ config.num_experts > 0 and ++ (layer_idx + 1) % config.decoder_sparse_step == 0): ++ self.mlp = Qwen3MoeSparseMoeBlock(config=config, ++ quant_config=quant_config, ++ prefix=f"{prefix}.mlp") ++ else: ++ self.mlp = Qwen3MoeMLP(hidden_size=config.hidden_size, ++ intermediate_size=config.intermediate_size, ++ hidden_act=config.hidden_act, ++ quant_config=quant_config, ++ prefix=f"{prefix}.mlp") ++ self.input_layernorm = RMSNorm(config.hidden_size, ++ eps=config.rms_norm_eps) ++ self.post_attention_layernorm = RMSNorm(config.hidden_size, ++ eps=config.rms_norm_eps) ++ ++ def forward( ++ self, ++ positions: torch.Tensor, ++ hidden_states: torch.Tensor, ++ residual: Optional[torch.Tensor], ++ ) -> torch.Tensor: ++ # Self Attention ++ if residual is None: ++ residual = hidden_states ++ hidden_states = self.input_layernorm(hidden_states) ++ else: ++ hidden_states, residual = self.input_layernorm( ++ hidden_states, residual) ++ hidden_states = self.self_attn( ++ positions=positions, ++ hidden_states=hidden_states, ++ ) ++ ++ # Fully Connected ++ hidden_states, residual = self.post_attention_layernorm( ++ hidden_states, residual) ++ hidden_states = self.mlp(hidden_states) ++ return hidden_states, residual ++ ++ ++@support_torch_compile ++class Qwen3MoeModel(nn.Module): ++ ++ def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): ++ super().__init__() ++ ++ config = vllm_config.model_config.hf_config ++ cache_config = vllm_config.cache_config ++ quant_config = vllm_config.quant_config ++ ++ self.padding_idx = config.pad_token_id ++ self.vocab_size = config.vocab_size ++ ++ self.embed_tokens = VocabParallelEmbedding( ++ config.vocab_size, ++ config.hidden_size, ++ prefix=f"{prefix}.embed_tokens") ++ self.start_layer, self.end_layer, self.layers = make_layers( ++ config.num_hidden_layers, ++ lambda prefix: Qwen3MoeDecoderLayer(config=config, ++ cache_config=cache_config, ++ quant_config=quant_config, ++ prefix=prefix), ++ prefix=f"{prefix}.layers", ++ ) ++ self.norm = RMSNorm(config.hidden_size, eps=config.rms_norm_eps) ++ self.make_empty_intermediate_tensors = ( ++ make_empty_intermediate_tensors_factory( ++ ["hidden_states", "residual"], config.hidden_size)) ++ ++ def get_input_embeddings(self, input_ids: torch.Tensor) -> torch.Tensor: ++ return self.embed_tokens(input_ids) ++ ++ def forward( ++ self, ++ input_ids: torch.Tensor, ++ positions: torch.Tensor, ++ intermediate_tensors: Optional[IntermediateTensors] = None, ++ inputs_embeds: Optional[torch.Tensor] = None, ++ ) -> Union[torch.Tensor, IntermediateTensors]: ++ if get_pp_group().is_first_rank: ++ if inputs_embeds is not None: ++ hidden_states = inputs_embeds ++ else: ++ hidden_states = self.get_input_embeddings(input_ids) ++ residual = None ++ else: ++ assert intermediate_tensors is not None ++ hidden_states = intermediate_tensors["hidden_states"] ++ residual = intermediate_tensors["residual"] ++ for i in range(self.start_layer, self.end_layer): ++ layer = self.layers[i] ++ hidden_states, residual = layer(positions, hidden_states, residual) ++ if not get_pp_group().is_last_rank: ++ return IntermediateTensors({ ++ "hidden_states": hidden_states, ++ "residual": residual ++ }) ++ hidden_states, _ = self.norm(hidden_states, residual) ++ return hidden_states ++ ++ ++class Qwen3MoeForCausalLM(nn.Module, SupportsPP): ++ ++ fall_back_to_pt_during_load 
= False ++ ++ def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): ++ super().__init__() ++ config = vllm_config.model_config.hf_config ++ quant_config = vllm_config.quant_config ++ self.config = config ++ self.quant_config = quant_config ++ self.model = Qwen3MoeModel(vllm_config=vllm_config, ++ prefix=maybe_prefix(prefix, "model")) ++ self.lm_head = ParallelLMHead(config.vocab_size, ++ config.hidden_size, ++ quant_config=quant_config) ++ if self.config.tie_word_embeddings: ++ self.lm_head.weight = self.model.embed_tokens.weight ++ self.logits_processor = LogitsProcessor(config.vocab_size) ++ self.sampler = get_sampler() ++ self.make_empty_intermediate_tensors = ( ++ self.model.make_empty_intermediate_tensors) ++ ++ def get_input_embeddings(self, input_ids: torch.Tensor) -> torch.Tensor: ++ return self.model.get_input_embeddings(input_ids) ++ ++ def forward( ++ self, ++ input_ids: torch.Tensor, ++ positions: torch.Tensor, ++ intermediate_tensors: Optional[IntermediateTensors] = None, ++ inputs_embeds: Optional[torch.Tensor] = None, ++ ) -> Union[torch.Tensor, IntermediateTensors]: ++ hidden_states = self.model(input_ids, positions, intermediate_tensors, ++ inputs_embeds) ++ return hidden_states ++ ++ def compute_logits( ++ self, ++ hidden_states: torch.Tensor, ++ sampling_metadata: SamplingMetadata, ++ ) -> Optional[torch.Tensor]: ++ logits = self.logits_processor(self.lm_head, hidden_states, ++ sampling_metadata) ++ return logits ++ ++ def sample( ++ self, ++ logits: Optional[torch.Tensor], ++ sampling_metadata: SamplingMetadata, ++ ) -> Optional[SamplerOutput]: ++ next_tokens = self.sampler(logits, sampling_metadata) ++ return next_tokens ++ ++ def load_weights(self, weights: Iterable[Tuple[str, ++ torch.Tensor]]) -> Set[str]: ++ stacked_params_mapping = [ ++ # (param_name, shard_name, shard_id) ++ ("qkv_proj", "q_proj", "q"), ++ ("qkv_proj", "k_proj", "k"), ++ ("qkv_proj", "v_proj", "v"), ++ ("gate_up_proj", "gate_proj", 0), ++ ("gate_up_proj", "up_proj", 1), ++ ] ++ ++ # Params for weights, fp8 weight scales, fp8 activation scales ++ # (param_name, weight_name, expert_id, shard_id) ++ expert_params_mapping = FusedMoE.make_expert_params_mapping( ++ ckpt_gate_proj_name="gate_proj", ++ ckpt_down_proj_name="down_proj", ++ ckpt_up_proj_name="up_proj", ++ num_experts=self.config.num_experts) ++ ++ params_dict = dict(self.named_parameters()) ++ loaded_params: Set[str] = set() ++ for name, loaded_weight in weights: ++ if "rotary_emb.inv_freq" in name: ++ continue ++ for (param_name, weight_name, shard_id) in stacked_params_mapping: ++ # Skip non-stacked layers and experts (experts handled below). ++ if weight_name not in name: ++ continue ++ # We have mlp.experts[0].gate_proj in the checkpoint. ++ # Since we handle the experts below in expert_params_mapping, ++ # we need to skip here BEFORE we update the name, otherwise ++ # name will be updated to mlp.experts[0].gate_up_proj, which ++ # will then be updated below in expert_params_mapping ++ # for mlp.experts[0].gate_gate_up_proj, which breaks load. ++ if "mlp.experts" in name: ++ continue ++ name = name.replace(weight_name, param_name) ++ # Skip loading extra bias for GPTQ models. ++ if ((name.endswith(".bias") or name.endswith("_bias")) ++ and name not in params_dict): ++ continue ++ # Skip layers on other devices. 
++ if is_pp_missing_parameter(name, self): ++ continue ++ if name not in params_dict: ++ continue ++ ++ param = params_dict[name] ++ weight_loader = param.weight_loader ++ weight_loader(param, loaded_weight, shard_id) ++ break ++ else: ++ for mapping in expert_params_mapping: ++ param_name, weight_name, expert_id, shard_id = mapping ++ if weight_name not in name: ++ continue ++ name = name.replace(weight_name, param_name) ++ # Skip layers on other devices. ++ if is_pp_missing_parameter(name, self): ++ continue ++ # Skip loading extra bias for GPTQ models. ++ if ((name.endswith(".bias") or name.endswith("_bias")) ++ and name not in params_dict): ++ continue ++ param = params_dict[name] ++ weight_loader = param.weight_loader ++ weight_loader(param, ++ loaded_weight, ++ name, ++ shard_id=shard_id, ++ expert_id=expert_id) ++ break ++ else: ++ # Skip loading extra bias for GPTQ models. ++ if ((name.endswith(".bias") or name.endswith("_bias")) ++ and name not in params_dict): ++ continue ++ # Skip layers on other devices. ++ if is_pp_missing_parameter(name, self): ++ continue ++ # Remapping the name of FP8 kv-scale. ++ if name.endswith("kv_scale"): ++ remapped_kv_scale_name = name.replace( ++ ".kv_scale", ".attn.kv_scale") ++ if remapped_kv_scale_name not in params_dict: ++ logger.warning_once( ++ "Found kv scale in the checkpoint " ++ f"(e.g. {name}), but not found the expected " ++ f"name in the model " ++ f"(e.g. {remapped_kv_scale_name}). " ++ "kv-scale is not loaded.") ++ continue ++ else: ++ name = remapped_kv_scale_name ++ param = params_dict[name] ++ weight_loader = getattr(param, "weight_loader", ++ default_weight_loader) ++ weight_loader(param, loaded_weight) ++ loaded_params.add(name) ++ return loaded_params +diff --git a/vllm/model_executor/models/registry.py b/vllm/model_executor/models/registry.py +index c0a3c59ba..d63750e03 100644 +--- a/vllm/model_executor/models/registry.py ++++ b/vllm/model_executor/models/registry.py +@@ -101,6 +101,8 @@ _TEXT_GENERATION_MODELS = { + "QWenLMHeadModel": ("qwen", "QWenLMHeadModel"), + "Qwen2ForCausalLM": ("qwen2", "Qwen2ForCausalLM"), + "Qwen2MoeForCausalLM": ("qwen2_moe", "Qwen2MoeForCausalLM"), ++ "Qwen3ForCausalLM": ("qwen3", "Qwen3ForCausalLM"), ++ "Qwen3MoeForCausalLM": ("qwen3_moe", "Qwen3MoeForCausalLM"), + "RWForCausalLM": ("falcon", "FalconForCausalLM"), + "StableLMEpochForCausalLM": ("stablelm", "StablelmForCausalLM"), + "StableLmForCausalLM": ("stablelm", "StablelmForCausalLM"), +@@ -140,6 +142,7 @@ _EMBEDDING_MODELS = { + "Qwen2ForRewardModel": ("qwen2_rm", "Qwen2ForRewardModel"), + "Qwen2ForProcessRewardModel": ("qwen2_rm", "Qwen2ForProcessRewardModel"), + "TeleChat2ForCausalLM": ("telechat2", "TeleChat2ForCausalLM"), ++ "TeleChatForCausalLM": ("telechat2", "TeleChat2ForCausalLM"), + # [Multimodal] + "LlavaNextForConditionalGeneration": ("llava_next", "LlavaNextForConditionalGeneration"), # noqa: E501 + "Phi3VForCausalLM": ("phi3v", "Phi3VForCausalLM"), +diff --git a/vllm/model_executor/models/telechat2.py b/vllm/model_executor/models/telechat2.py +index a38035e37..379e19e1b 100644 +--- a/vllm/model_executor/models/telechat2.py ++++ b/vllm/model_executor/models/telechat2.py +@@ -22,11 +22,13 @@ + from typing import Iterable, Set, Tuple + + import torch ++import torch.nn as nn + + from vllm.config import VllmConfig + from vllm.model_executor.model_loader.weight_utils import default_weight_loader + from vllm.model_executor.models.llama import LlamaForCausalLM, LlamaModel + ++from .llama import LlamaDecoderLayer + from .utils import 
(AutoWeightsLoader, PPMissingLayer, WeightsMapper, + is_pp_missing_parameter) + +@@ -120,7 +122,10 @@ class TeleChat2ForCausalLM(LlamaForCausalLM): + }, + ) + +- def _init_model(self, vllm_config: VllmConfig, prefix: str = ""): ++ def _init_model(self, ++ vllm_config: VllmConfig, ++ prefix: str = "", ++ layer_type: type[nn.Module] = LlamaDecoderLayer): + return TeleChat2Model(vllm_config=vllm_config, prefix=prefix) + + def load_weights(self, weights: Iterable[Tuple[str, +diff --git a/vllm/multimodal/inputs.py b/vllm/multimodal/inputs.py +index 81d72ff19..19c1cc1c7 100644 +--- a/vllm/multimodal/inputs.py ++++ b/vllm/multimodal/inputs.py +@@ -752,6 +752,11 @@ class MultiModalInputs(TypedDict): + :code:`prompt_token_ids`. + """ + ++ cache_salt: NotRequired[str] ++ """ ++ Optional cache salt to be used for prefix caching. ++ """ ++ + + class MultiModalEncDecInputs(MultiModalInputs): + """ +diff --git a/vllm/multimodal/processing.py b/vllm/multimodal/processing.py +index c8864c33f..26b225763 100644 +--- a/vllm/multimodal/processing.py ++++ b/vllm/multimodal/processing.py +@@ -1649,7 +1649,7 @@ class EncDecMultiModalProcessor(BaseMultiModalProcessor[_I]): + mm_data: MultiModalDataDict, + ) -> Union[str, list[int]]: + """ +- Create input prompt for the encoder. HF processor will be applied on ++ Create input prompt for the encoder. HF processor will be applied on + this prompt during profiling and generation. + """ + raise NotImplementedError +diff --git a/vllm/platforms/interface.py b/vllm/platforms/interface.py +index b6f6029de..7503af993 100644 +--- a/vllm/platforms/interface.py ++++ b/vllm/platforms/interface.py +@@ -41,6 +41,7 @@ class _Backend(enum.Enum): + PALLAS = enum.auto() + PALLAS_VLLM_V1 = enum.auto() + IPEX = enum.auto() ++ IPEX_V1 = enum.auto() + BLOCK_SPARSE_FLASH_ATTN = enum.auto() + NO_ATTENTION = enum.auto() + +diff --git a/vllm/platforms/xpu.py b/vllm/platforms/xpu.py +index 225e756cd..ad7d56c79 100644 +--- a/vllm/platforms/xpu.py ++++ b/vllm/platforms/xpu.py +@@ -4,13 +4,15 @@ from typing import TYPE_CHECKING, Optional + + import torch + ++import vllm.envs as envs + from vllm.logger import init_logger + + from .interface import DeviceCapability, Platform, PlatformEnum, _Backend + + if TYPE_CHECKING: +- from vllm.config import VllmConfig ++ from vllm.config import ModelConfig, VllmConfig + else: ++ ModelConfig = None + VllmConfig = None + + logger = init_logger(__name__) +@@ -31,10 +33,22 @@ class XPUPlatform(Platform): + dtype: torch.dtype, kv_cache_dtype: Optional[str], + block_size: int, use_v1: bool, + use_mla: bool) -> str: +- if selected_backend != _Backend.IPEX: +- logger.info("Cannot use %s backend on XPU.", selected_backend) +- logger.info("Using IPEX attention backend.") +- return "vllm.attention.backends.ipex_attn.IpexAttnBackend" ++ if selected_backend not in [_Backend.IPEX, _Backend.IPEX_V1]: ++ logger.warning_once( ++ f"Cannot use {selected_backend} backend on XPU.") ++ use_v1 = envs.VLLM_USE_V1 ++ if use_v1: ++ if selected_backend == _Backend.IPEX: ++ logger.warning_once("For v1 on XPU, should use " ++ "IPEX_V1 attention backend.") ++ logger.info_once("Using IPEX_V1 attention backend.") ++ return "vllm.v1.attention.backends.ipex_attn.IPEXAttentionBackend" ++ else: ++ if selected_backend == _Backend.IPEX: ++ logger.warning_once("For v0 on XPU, should use " ++ "IPEX attention backend.") ++ logger.info_once("Using IPEX attention backend.") ++ return "vllm.attention.backends.ipex_attn.IpexAttnBackend" + + @staticmethod + def get_device_capability( +@@ -47,6 +61,10 
@@ class XPUPlatform(Platform): + def get_device_name(device_id: int = 0) -> str: + return torch.xpu.get_device_name(device_id) + ++ @classmethod ++ def get_punica_wrapper(cls) -> str: ++ return "vllm.lora.punica_wrapper.punica_gpu.PunicaWrapperGPU" ++ + @classmethod + def get_device_total_memory(cls, device_id: int = 0) -> int: + device_props = torch.xpu.get_device_properties(device_id) +@@ -64,49 +82,50 @@ class XPUPlatform(Platform): + def check_and_update_config(cls, vllm_config: VllmConfig) -> None: + cache_config = vllm_config.cache_config + if cache_config and cache_config.block_size is None: +- cache_config.block_size = 16 ++ if envs.VLLM_USE_V1: ++ cache_config.block_size = 64 ++ else: ++ cache_config.block_size = 16 + + # check and update model config + model_config = vllm_config.model_config +- if model_config.dtype == torch.bfloat16: +- bf16_supported = cls.device_support_bf16() +- if not bf16_supported: +- logger.warning( +- "bfloat16 is only supported on Intel Data Center GPU, " +- "Intel Arc GPU is not supported yet. Your device is %s," +- " which is not supported. will fallback to float16", +- cls.get_device_name()) +- model_config.dtype = torch.float16 + if not model_config.enforce_eager: + logger.warning( + "CUDA graph is not supported on XPU, fallback to the eager " + "mode.") + model_config.enforce_eager = True + +- if vllm_config.speculative_config is not None: +- raise NotImplementedError( +- "XPU does not support speculative decoding") +- + if vllm_config.device_config is not None: + assert vllm_config.device_config.device_type == "xpu" + + # check and update parallel config + parallel_config = vllm_config.parallel_config +- if parallel_config.worker_cls == "auto": ++ if vllm_config.speculative_config: ++ if envs.VLLM_USE_V1: ++ parallel_config.worker_cls = \ ++ "vllm.v1.worker.gpu_worker.Worker" ++ else: ++ raise NotImplementedError( ++ "XPU v0 does not support speculative decoding") ++ if envs.VLLM_USE_V1: ++ parallel_config.worker_cls =\ ++ "vllm.v1.worker.xpu_worker.XPUWorker" ++ else: + parallel_config.worker_cls = "vllm.worker.xpu_worker.XPUWorker" + + if parallel_config.distributed_executor_backend is None: +- parallel_config.distributed_executor_backend = "ray" ++ if parallel_config.world_size > 1: ++ parallel_config.distributed_executor_backend = "ray" ++ else: ++ parallel_config.distributed_executor_backend = "uni" + elif parallel_config.distributed_executor_backend == "mp": + # FIXME(kunshang): + # spawn needs calling `if __name__ == '__main__':`` + # fork is not supported for xpu start new process. 
+- logger.error( +- "Both start methods (spawn and fork) have issue " +- "on XPU if you use mp backend, setting it to ray instead.") +- parallel_config.distributed_executor_backend = "ray" +- +- elif parallel_config.distributed_executor_backend != "ray": ++ logger.warning( ++ "Please use spawn as start method if you want to use mp.") ++ elif parallel_config.distributed_executor_backend != "ray" and \ ++ parallel_config.distributed_executor_backend != "uni": + logger.warning( + "%s is not supported on XPU, fallback to ray distributed" + " executor backend.", +@@ -115,8 +134,7 @@ class XPUPlatform(Platform): + + @classmethod + def is_pin_memory_available(cls): +- logger.warning("Pin memory is not supported on XPU.") +- return False ++ return True + + @classmethod + def get_current_memory_usage(cls, +@@ -126,17 +144,21 @@ class XPUPlatform(Platform): + return torch.xpu.max_memory_allocated(device) + + @classmethod +- def device_support_bf16(cls) -> bool: +- device_name = cls.get_device_name().lower() +- if device_name.count("arc") > 0: +- return False +- elif device_name.count("data center gpu") > 0: +- return True ++ def fp8_dtype(cls) -> torch.dtype: ++ if envs.VLLM_XPU_FP8_DTYPE == "e4m3": ++ return torch.float8_e4m3fn + else: +- logger.warning("Unknown device name %s, always use float16", +- device_name) +- return False ++ return torch.float8_e5m2 ++ ++ @classmethod ++ def is_data_center_gpu(cls) -> bool: ++ device_name = cls.get_device_name().lower() ++ return device_name.count("data center gpu") > 0 + + @classmethod + def get_device_communicator_cls(cls) -> str: + return "vllm.distributed.device_communicators.xpu_communicator.XpuCommunicator" # noqa ++ ++ @classmethod ++ def supports_v1(cls, model_config: ModelConfig) -> bool: ++ return True +diff --git a/vllm/triton_utils/importing.py b/vllm/triton_utils/importing.py +index a20700248..54cb182d1 100644 +--- a/vllm/triton_utils/importing.py ++++ b/vllm/triton_utils/importing.py +@@ -9,7 +9,7 @@ logger = init_logger(__name__) + + HAS_TRITON = ( + find_spec("triton") is not None +- and not current_platform.is_xpu() # Not compatible ++ or find_spec("pytorch-triton-xpu") is not None # Not compatible + ) + + if not HAS_TRITON: +diff --git a/vllm/utils.py b/vllm/utils.py +index 5f32f8cb6..7158b9270 100644 +--- a/vllm/utils.py ++++ b/vllm/utils.py +@@ -154,9 +154,9 @@ STR_DTYPE_TO_TORCH_DTYPE = { + "half": torch.half, + "bfloat16": torch.bfloat16, + "float": torch.float, +- "fp8": torch.uint8, +- "fp8_e4m3": torch.uint8, +- "fp8_e5m2": torch.uint8, ++ "fp8": torch.float8_e4m3fn, ++ "fp8_e4m3": torch.float8_e4m3fn, ++ "fp8_e5m2": torch.float8_e5m2, + "int8": torch.int8, + } + +@@ -2407,7 +2407,8 @@ def warn_for_unimplemented_methods(cls: type[T]) -> type[T]: + src = inspect.getsource(attr_func) + if "NotImplementedError" in src: + unimplemented_methods.append(attr_name) +- if unimplemented_methods: ++ from vllm.platforms import current_platform ++ if unimplemented_methods and not current_platform.is_xpu(): + method_names = ','.join(unimplemented_methods) + msg = (f"Methods {method_names} not implemented in {self}") + logger.warning(msg) +diff --git a/vllm/v1/attention/backends/ipex_attn.py b/vllm/v1/attention/backends/ipex_attn.py +new file mode 100644 +index 000000000..d252d90eb +--- /dev/null ++++ b/vllm/v1/attention/backends/ipex_attn.py +@@ -0,0 +1,174 @@ ++# SPDX-License-Identifier: Apache-2.0 ++from dataclasses import dataclass ++from typing import Any, Optional ++ ++import torch ++ ++from vllm._ipex_ops import ipex_ops ++from 
vllm.attention.backends.abstract import (AttentionBackend, AttentionImpl, ++ AttentionLayer, ++ AttentionMetadata, AttentionType) ++from vllm.v1.attention.backends.flash_attn import FlashAttentionMetadata ++ ++ ++@dataclass ++class IPEXAttentionMetadata(FlashAttentionMetadata): ++ seq_start_loc: torch.Tensor = torch.tensor([0], dtype=torch.int64) ++ ++ ++class IPEXAttentionBackend(AttentionBackend): ++ ++ accept_output_buffer: bool = True ++ ++ @staticmethod ++ def get_supported_head_sizes() -> list[int]: ++ return [32, 64, 80, 96, 128, 160, 192, 224, 256] ++ ++ @staticmethod ++ def get_name() -> str: ++ return "IPEX_V1" ++ ++ @staticmethod ++ def get_impl_cls() -> type["IPEXAttentionImpl"]: ++ return IPEXAttentionImpl ++ ++ @staticmethod ++ def get_metadata_cls() -> type["AttentionMetadata"]: ++ return IPEXAttentionMetadata ++ ++ @staticmethod ++ def get_kv_cache_shape( ++ num_blocks: int, ++ block_size: int, ++ num_kv_heads: int, ++ head_size: int, ++ ) -> tuple[int, ...]: ++ if block_size % 16 != 0: ++ raise ValueError("Block size must be a multiple of 16.") ++ return (2, num_blocks, block_size, num_kv_heads, head_size) ++ ++ @staticmethod ++ def use_cascade_attention(*args, **kwargs) -> bool: ++ # TODO: support cascade attention ++ return False ++ ++ ++class IPEXAttentionImpl(AttentionImpl): ++ ++ def __init__( ++ self, ++ num_heads: int, ++ head_size: int, ++ scale: float, ++ num_kv_heads: int, ++ alibi_slopes: Optional[list[float]], ++ sliding_window: Optional[int], ++ kv_cache_dtype: str, ++ blocksparse_params: Optional[dict[str, Any]] = None, ++ logits_soft_cap: Optional[float] = None, ++ attn_type: str = AttentionType.DECODER, ++ ) -> None: ++ if blocksparse_params is not None: ++ raise ValueError( ++ "FlashAttention does not support block-sparse attention.") ++ self.num_heads = num_heads ++ self.head_size = head_size ++ self.scale = float(scale) ++ self.num_kv_heads = num_kv_heads ++ if alibi_slopes is not None: ++ alibi_slopes = torch.tensor(alibi_slopes, dtype=torch.float32) ++ self.alibi_slopes = alibi_slopes ++ if sliding_window is None: ++ self.sliding_window = (-1, -1) ++ else: ++ self.sliding_window = (sliding_window - 1, 0) ++ self.kv_cache_dtype = kv_cache_dtype ++ if logits_soft_cap is None: ++ # In flash-attn, setting logits_soft_cap as 0 means no soft cap. ++ logits_soft_cap = 0 ++ self.logits_soft_cap = logits_soft_cap ++ ++ assert self.num_heads % self.num_kv_heads == 0 ++ self.num_queries_per_kv = self.num_heads // self.num_kv_heads ++ ++ support_head_sizes = IPEXAttentionBackend.get_supported_head_sizes() ++ if head_size not in support_head_sizes: ++ raise ValueError( ++ f"Head size {head_size} is not supported by FlashAttention. " ++ f"Supported head sizes are: {support_head_sizes}.") ++ if attn_type != AttentionType.DECODER: ++ raise NotImplementedError("Encoder self-attention and " ++ "encoder/decoder cross-attention " ++ "are not implemented for " ++ "IpexAttnBackendImpl") ++ ++ def forward( ++ self, ++ layer: AttentionLayer, ++ query: torch.Tensor, ++ key: torch.Tensor, ++ value: torch.Tensor, ++ kv_cache: torch.Tensor, ++ attn_metadata: IPEXAttentionBackend, ++ output: Optional[torch.Tensor] = None, ++ ) -> torch.Tensor: ++ """Forward pass with IPEXAttention. ++ Args: ++ query: shape = [num_tokens, num_heads * head_size] ++ key: shape = [num_tokens, num_kv_heads * head_size] ++ value: shape = [num_tokens, num_kv_heads * head_size] ++ kv_cache = [2, num_blocks, block_size, num_kv_heads, head_size] ++ attn_metadata: Metadata for attention. 
++ Returns: ++ shape = [num_tokens, num_heads * head_size] ++ """ ++ assert output is not None, "Output tensor must be provided." ++ if attn_metadata is None: ++ # Profiling run. ++ return output.random_(0, 10) ++ ++ num_actual_tokens = attn_metadata.num_actual_tokens ++ num_heads = self.num_heads ++ head_size = self.head_size ++ num_kv_heads = self.num_kv_heads ++ query = query.view(-1, num_heads, head_size) ++ key = key.view(-1, num_kv_heads, head_size) ++ value = value.view(-1, num_kv_heads, head_size) ++ ++ # Reshape the input keys and values and store them in the cache. ++ key_cache, value_cache = kv_cache.unbind(0) ++ ++ ipex_ops.reshape_and_cache_flash( ++ key[:num_actual_tokens], ++ value[:num_actual_tokens], ++ key_cache, ++ value_cache, ++ attn_metadata.slot_mapping, ++ self.kv_cache_dtype, ++ layer._k_scale_float, ++ layer._v_scale_float, ++ ) ++ ++ ipex_ops.chunked_prefill( ++ query[:num_actual_tokens], ++ key_cache, ++ value_cache, ++ output[:num_actual_tokens], ++ attn_metadata.query_start_loc, ++ attn_metadata.seq_start_loc, ++ None, ++ attn_metadata.block_table, ++ self.alibi_slopes, ++ attn_metadata.max_query_len, ++ attn_metadata.max_seq_len, ++ 0.0, ++ self.scale, ++ False, ++ self.sliding_window[0], ++ self.sliding_window[1], ++ True, ++ False, ++ None, ++ self.kv_cache_dtype, ++ ) ++ return output +diff --git a/vllm/v1/core/kv_cache_utils.py b/vllm/v1/core/kv_cache_utils.py +index 34bc9369b..d11771807 100644 +--- a/vllm/v1/core/kv_cache_utils.py ++++ b/vllm/v1/core/kv_cache_utils.py +@@ -275,7 +275,10 @@ def need_extra_keys(request: Request) -> bool: + + # Multimodal requests need to include the MM hash. + # LoRA requests need to include the LoRA ID. +- return bool(request.mm_positions) or (request.lora_request is not None) ++ # Request with provided cache salt need to include the salt. ++ return bool(request.mm_positions) or (request.lora_request ++ is not None) or (request.cache_salt ++ is not None) + + + def _gen_mm_extra_hash_keys(request: Request, start_token_idx: int, +@@ -381,8 +384,10 @@ def generate_block_hash_extra_keys( + mm_extra_keys, new_start_mm_idx = _gen_mm_extra_hash_keys( + request, start_token_idx, end_token_idx, start_mm_idx) + lora_extra_keys: list[int] = _gen_lora_extra_hash_keys(request) ++ cache_salt_keys: list[str] = [request.cache_salt] if ( ++ start_token_idx == 0 and request.cache_salt) else [] + +- extra_keys: list[Any] = lora_extra_keys + mm_extra_keys ++ extra_keys: list[Any] = lora_extra_keys + mm_extra_keys + cache_salt_keys + + if not extra_keys: + return None, new_start_mm_idx +@@ -601,10 +606,10 @@ def _get_kv_cache_config_uniform_type(vllm_config: VllmConfig, + def unify_hybrid_kv_cache_specs(kv_cache_spec: dict[str, KVCacheSpec]): + """ + Only models with one type of KV cache are supported yet. This function tries +- to convert the KV cache specs to one type if the model is a hybrid model ++ to convert the KV cache specs to one type if the model is a hybrid model + with multiple type of KV cache. It will convert all SlidingWindowSpec to + FullAttentionSpec if both types are present. 
+- ++ + Args: + kv_cache_spec: The kv cache spec of each attention layer in the model + """ +diff --git a/vllm/v1/engine/__init__.py b/vllm/v1/engine/__init__.py +index 0557d0c6c..c7a51030a 100644 +--- a/vllm/v1/engine/__init__.py ++++ b/vllm/v1/engine/__init__.py +@@ -59,6 +59,7 @@ class EngineCoreRequest( + eos_token_id: Optional[int] + arrival_time: float + lora_request: Optional[LoRARequest] ++ cache_salt: Optional[str] + + + class EngineCoreEventType(enum.IntEnum): +diff --git a/vllm/v1/engine/processor.py b/vllm/v1/engine/processor.py +index 0d2892837..dbc0648d7 100644 +--- a/vllm/v1/engine/processor.py ++++ b/vllm/v1/engine/processor.py +@@ -278,6 +278,7 @@ class Processor: + eos_token_id=eos_token_id, + arrival_time=arrival_time, + lora_request=lora_request, ++ cache_salt=decoder_inputs.get("cache_salt"), + ) + + def _validate_model_inputs(self, +diff --git a/vllm/v1/executor/abstract.py b/vllm/v1/executor/abstract.py +index e3a4cd98c..79fd179b6 100644 +--- a/vllm/v1/executor/abstract.py ++++ b/vllm/v1/executor/abstract.py +@@ -35,9 +35,15 @@ class Executor(ExecutorBase): + f"ExecutorBase. Got {distributed_executor_backend}.") + executor_class = distributed_executor_backend + elif distributed_executor_backend == "ray": +- from vllm.v1.executor.ray_distributed_executor import ( # noqa +- RayDistributedExecutor) +- executor_class = RayDistributedExecutor ++ from vllm.platforms import current_platform ++ if current_platform.is_xpu(): ++ from vllm.v1.executor.ray_distributed_executor import ( # noqa ++ XPURayDistributedExecutor) ++ executor_class = XPURayDistributedExecutor ++ else: ++ from vllm.v1.executor.ray_distributed_executor import ( # noqa ++ RayDistributedExecutor) ++ executor_class = RayDistributedExecutor + elif distributed_executor_backend == "mp": + from vllm.v1.executor.multiproc_executor import MultiprocExecutor + executor_class = MultiprocExecutor +diff --git a/vllm/v1/executor/ray_distributed_executor.py b/vllm/v1/executor/ray_distributed_executor.py +index 320ebfd37..a2b199a86 100644 +--- a/vllm/v1/executor/ray_distributed_executor.py ++++ b/vllm/v1/executor/ray_distributed_executor.py +@@ -59,3 +59,22 @@ class RayDistributedExecutor(RayDistributedExecutorV0, Executor): + # When PP is used, we return a FutureWrapper immediately so that + # the scheduler can yield to the next batch. + return FutureWrapper(refs[0]) ++ ++ ++class XPURayDistributedExecutor(RayDistributedExecutor, Executor): ++ """XPU Ray distributed executor without Compiled Graphs.""" ++ ++ def execute_model( ++ self, ++ scheduler_output, ++ ) -> Union[ModelRunnerOutput, Future[ModelRunnerOutput]]: ++ output = self.collective_rpc("execute_model", ++ args=(scheduler_output, )) ++ return output[0] ++ ++ @property ++ def max_concurrent_batches(self) -> int: ++ """Ray distributed executor supports pipeline parallelism, ++ meaning that it allows PP size batches to be executed concurrently. 
++ """ ++ return 1 +diff --git a/vllm/v1/request.py b/vllm/v1/request.py +index 490fe4e83..7fc662b89 100644 +--- a/vllm/v1/request.py ++++ b/vllm/v1/request.py +@@ -31,6 +31,7 @@ class Request: + arrival_time: float, + lora_request: Optional["LoRARequest"] = None, + structured_output_request: Optional["StructuredOutputRequest"] = None, ++ cache_salt: Optional[str] = None, + ) -> None: + self.request_id = request_id + self.sampling_params = sampling_params +@@ -54,6 +55,7 @@ class Request: + self._all_token_ids: list[int] = self.prompt_token_ids.copy() + self.spec_token_ids: list[int] = [] + self.num_computed_tokens = 0 ++ self.cache_salt: Optional[str] = cache_salt + + # Multi-modal related + self.mm_positions = multi_modal_placeholders or [] +@@ -88,6 +90,7 @@ class Request: + lora_request=request.lora_request, + structured_output_request=StructuredOutputRequest( + sampling_params=request.sampling_params), ++ cache_salt=request.cache_salt, + ) + + def append_output_token_ids( +diff --git a/vllm/v1/structured_output/backend_xgrammar.py b/vllm/v1/structured_output/backend_xgrammar.py +index 783a33481..83f2c6436 100644 +--- a/vllm/v1/structured_output/backend_xgrammar.py ++++ b/vllm/v1/structured_output/backend_xgrammar.py +@@ -5,6 +5,7 @@ from typing import TYPE_CHECKING + + import torch + ++import vllm.envs + from vllm.config import VllmConfig + from vllm.logger import init_logger + from vllm.transformers_utils.tokenizer_group import init_tokenizer_from_configs +@@ -76,7 +77,12 @@ class XgrammarBackend(StructuredOutputBackend): + tokenizer, + vocab_size=self.vocab_size, + ) +- self.compiler = xgr.GrammarCompiler(tokenizer_info, max_threads=8) ++ self.compiler = xgr.GrammarCompiler( ++ tokenizer_info, ++ max_threads=8, ++ cache_enabled=True, ++ cache_limit_bytes=vllm.envs.VLLM_XGRAMMAR_CACHE_MB * 1024 * 1024, ++ ) + + def compile_grammar(self, request_type: StructuredOutputOptions, + grammar_spec: str) -> StructuredOutputGrammar: +diff --git a/vllm/v1/worker/gpu_model_runner.py b/vllm/v1/worker/gpu_model_runner.py +index 5133c637f..88887dfe0 100644 +--- a/vllm/v1/worker/gpu_model_runner.py ++++ b/vllm/v1/worker/gpu_model_runner.py +@@ -21,6 +21,7 @@ from vllm.model_executor.layers.rotary_embedding import MRotaryEmbedding + from vllm.model_executor.model_loader import get_model + from vllm.multimodal import MULTIMODAL_REGISTRY, MultiModalKwargs + from vllm.multimodal.utils import group_mm_inputs_by_modality ++from vllm.platforms import current_platform + from vllm.sampling_params import SamplingType + from vllm.sequence import IntermediateTensors + from vllm.utils import (STR_DTYPE_TO_TORCH_DTYPE, DeviceMemoryProfiler, +@@ -52,6 +53,9 @@ if TYPE_CHECKING: + else: + xgr = LazyLoader("xgr", globals(), "xgrammar") + ++if current_platform.is_xpu(): ++ import intel_extension_for_pytorch ++ + logger = init_logger(__name__) + + +@@ -1213,6 +1217,11 @@ class GPUModelRunner(LoRAModelRunnerMixin): + # in the next step. 
+ del draft_probs + ++ if current_platform.is_xpu(): ++ reserved_mem = torch.xpu.memory_reserved() ++ if reserved_mem >= self.vllm_config.cache_config.threshold_mem: ++ torch.xpu.empty_cache() ++ + return ModelRunnerOutput( + req_ids=self.input_batch.req_ids, + req_id_to_index=self.input_batch.req_id_to_index, +diff --git a/vllm/v1/worker/xpu_model_runner.py b/vllm/v1/worker/xpu_model_runner.py +new file mode 100644 +index 000000000..f3f56d9c3 +--- /dev/null ++++ b/vllm/v1/worker/xpu_model_runner.py +@@ -0,0 +1,416 @@ ++# SPDX-License-Identifier: Apache-2.0 ++import gc ++from typing import TYPE_CHECKING ++ ++import numpy as np ++import torch ++ ++from vllm.attention import get_attn_backend ++from vllm.config import CompilationLevel, VllmConfig ++from vllm.distributed.parallel_state import get_pp_group ++from vllm.inputs import INPUT_REGISTRY ++from vllm.logger import init_logger ++from vllm.multimodal import MULTIMODAL_REGISTRY ++from vllm.utils import (STR_DTYPE_TO_TORCH_DTYPE, LayerBlockType, cdiv, ++ check_use_alibi, is_pin_memory_available) ++from vllm.v1.attention.backends.ipex_attn import IPEXAttentionMetadata ++from vllm.v1.core.encoder_cache_manager import compute_encoder_budget ++from vllm.v1.worker.gpu_input_batch import CachedRequestState, InputBatch ++from vllm.v1.sample.rejection_sampler import RejectionSampler ++from vllm.v1.spec_decode.ngram_proposer import NgramProposer ++from vllm.v1.sample.metadata import SamplingMetadata ++from vllm.v1.worker.gpu_model_runner import GPUModelRunner ++ ++if TYPE_CHECKING: ++ from vllm.v1.core.scheduler import SchedulerOutput ++ ++logger = init_logger(__name__) ++ ++ ++class XPUModelRunner(GPUModelRunner): ++ """A model runner for XPU devices.""" ++ ++ def __init__( ++ self, ++ vllm_config: VllmConfig, ++ device: torch.device, ++ ): ++ self.vllm_config = vllm_config ++ self.model_config = vllm_config.model_config ++ self.cache_config = vllm_config.cache_config ++ self.lora_config = vllm_config.lora_config ++ self.load_config = vllm_config.load_config ++ self.parallel_config = vllm_config.parallel_config ++ self.scheduler_config = vllm_config.scheduler_config ++ self.speculative_config = vllm_config.speculative_config ++ self.prompt_adapter_config = vllm_config.prompt_adapter_config ++ self.observability_config = vllm_config.observability_config ++ ++ model_config = self.model_config ++ cache_config = self.cache_config ++ scheduler_config = self.scheduler_config ++ parallel_config = self.parallel_config ++ self.device = device ++ self.pin_memory = is_pin_memory_available() ++ self.dtype = self.model_config.dtype ++ if cache_config.cache_dtype == "auto": ++ self.kv_cache_dtype = self.dtype ++ else: ++ self.kv_cache_dtype = STR_DTYPE_TO_TORCH_DTYPE[ ++ cache_config.cache_dtype] ++ ++ # NOTE(woosuk): sliding_window is None for models with interleaved ++ # attention. Use interleaved_sliding_window instead. 
++ self.sliding_window = model_config.get_sliding_window() ++ self.interleaved_sliding_window = getattr( ++ model_config.hf_text_config, "interleaved_sliding_window", None) ++ self.window_size = (self.sliding_window ++ or self.interleaved_sliding_window) ++ ++ self.is_multimodal_model = model_config.is_multimodal_model ++ self.sliding_window = model_config.get_sliding_window() ++ self.block_size = cache_config.block_size ++ self.max_model_len = model_config.max_model_len ++ self.max_num_blocks_per_req = cdiv(self.max_model_len, self.block_size) ++ self.max_num_tokens = scheduler_config.max_num_batched_tokens ++ self.max_num_reqs = scheduler_config.max_num_seqs ++ ++ # Model-related. ++ self.num_attn_layers = model_config.get_num_layers_by_block_type( ++ parallel_config, LayerBlockType.attention) ++ self.num_query_heads = model_config.get_num_attention_heads( ++ parallel_config) ++ self.num_kv_heads = model_config.get_num_kv_heads(parallel_config) ++ self.head_size = model_config.get_head_size() ++ self.hidden_size = model_config.get_hidden_size() ++ self.attn_backend = get_attn_backend( ++ self.head_size, ++ self.dtype, ++ self.kv_cache_dtype, ++ self.block_size, ++ self.model_config.is_attention_free, ++ use_mla=self.model_config.use_mla, ++ ) ++ if self.attn_backend is None: ++ error_msg = ( ++ f"Error with get_att_backend: {self.head_size=}, " ++ f"{self.dtype=}, {self.kv_cache_dtype=}, {self.block_size=}, " ++ f"{self.model_config.is_attention_free=}, " ++ f"{self.model_config.use_mla=}") ++ logger.error(error_msg) ++ raise NotImplementedError( ++ "Non-Attention backend is not supported by V1 GPUModelRunner.") ++ ++ # Multi-modal data support ++ self.input_registry = INPUT_REGISTRY ++ self.mm_registry = MULTIMODAL_REGISTRY ++ # FIXME: support mrope ++ self.uses_mrope = False ++ ++ encoder_compute_budget, encoder_cache_size = compute_encoder_budget( ++ model_config=model_config, ++ scheduler_config=scheduler_config, ++ mm_registry=self.mm_registry, ++ ) ++ self.max_num_encoder_input_tokens = encoder_compute_budget ++ self.encoder_cache_size = encoder_cache_size ++ ++ # Lazy initialization ++ # self.model: nn.Module # Set after load_model ++ self.kv_caches: list[torch.Tensor] = [] ++ # req_id -> (input_id -> encoder_output) ++ self.encoder_cache: dict[str, dict[int, torch.Tensor]] = {} ++ ++ # Set up speculative decoding. ++ self.use_spec_decode = False ++ if self.speculative_config: ++ self.use_spec_decode = True ++ if get_pp_group().is_last_rank: ++ if self.speculative_config.method == "ngram": ++ self.drafter = NgramProposer(self.vllm_config) ++ elif self.speculative_config.method == "eagle": ++ self.drafter = EagleProposer(self.vllm_config, ++ self.device) # type: ignore ++ else: ++ raise ValueError("Unknown speculative decoding method: " ++ f"{self.speculative_config.method}") ++ self.rejection_sampler = RejectionSampler() ++ ++ # Request states. ++ self.requests: dict[str, CachedRequestState] = {} ++ # Persistent batch. ++ self.input_batch = InputBatch( ++ max_num_reqs=self.max_num_reqs, ++ max_model_len=self.max_model_len, ++ max_num_blocks_per_req=self.max_num_blocks_per_req, ++ device=self.device, ++ pin_memory=self.pin_memory, ++ vocab_size=model_config.get_vocab_size(), ++ ) ++ ++ self.use_cuda_graph = (self.vllm_config.compilation_config.level ++ == CompilationLevel.PIECEWISE ++ and not self.model_config.enforce_eager) ++ # TODO(woosuk): Provide an option to tune the max cudagraph batch size. ++ # The convention is different. 
++ # self.cudagraph_batch_sizes sorts in ascending order. ++ # The batch sizes in the config are in descending order. ++ self.cudagraph_batch_sizes = list( ++ reversed( ++ self.vllm_config.compilation_config.cudagraph_capture_sizes)) ++ ++ # Cache the device properties. ++ #self.device_properties = torch.cuda.get_device_properties(self.device) ++ #!!!hack this value doesn't matter ++ self.num_sms = 0 ++ # Persistent buffers for CUDA graphs. ++ self.input_ids = torch.zeros(self.max_num_tokens, ++ dtype=torch.int32, ++ device=self.device) ++ self.positions = torch.zeros(self.max_num_tokens, ++ dtype=torch.int64, ++ device=self.device) ++ # Only relevant for models using ALiBi (e.g, MPT) ++ self.use_alibi = check_use_alibi(model_config) ++ ++ self.inputs_embeds = torch.zeros( ++ (self.max_num_tokens, self.hidden_size), ++ dtype=self.dtype, ++ device=self.device) ++ ++ # OPTIMIZATION: Cache the tensors rather than creating them every step. ++ self.arange_np = np.arange(max(self.max_num_reqs + 1, ++ self.max_model_len, ++ self.max_num_tokens), ++ dtype=np.int32) ++ # NOTE(woosuk): These tensors are "stateless", i.e., they are literally ++ # a faster version of creating a new tensor every time. Thus, we should ++ # not make any assumptions about the values in these tensors. ++ self.input_ids_cpu = torch.zeros(self.max_num_tokens, ++ dtype=torch.int32, ++ device="cpu", ++ pin_memory=self.pin_memory) ++ self.input_ids_np = self.input_ids_cpu.numpy() ++ self.positions_cpu = torch.zeros(self.max_num_tokens, ++ dtype=torch.int64, ++ device="cpu", ++ pin_memory=self.pin_memory) ++ self.positions_np = self.positions_cpu.numpy() ++ self.slot_mapping_cpu = torch.zeros(self.max_num_tokens, ++ dtype=torch.int32, ++ device="cpu", ++ pin_memory=self.pin_memory) ++ self.slot_mapping_np = self.slot_mapping_cpu.numpy() ++ self.query_start_loc_cpu = torch.zeros(self.max_num_reqs + 1, ++ dtype=torch.int32, ++ device="cpu", ++ pin_memory=self.pin_memory) ++ self.query_start_loc_np = self.query_start_loc_cpu.numpy() ++ self.seq_start_loc_cpu = torch.zeros(self.max_num_reqs + 1, ++ dtype=torch.int32, ++ device="cpu", ++ pin_memory=self.pin_memory) ++ self.seq_start_loc_np = self.seq_start_loc_cpu.numpy() ++ self.seq_lens_cpu = torch.zeros(self.max_num_reqs, ++ dtype=torch.int32, ++ device="cpu", ++ pin_memory=self.pin_memory) ++ self.seq_lens_np = self.seq_lens_cpu.numpy() ++ ++ def _prepare_inputs( ++ self, ++ scheduler_output: "SchedulerOutput", ++ ) -> tuple[IPEXAttentionMetadata, torch.Tensor]: ++ total_num_scheduled_tokens = scheduler_output.total_num_scheduled_tokens ++ assert total_num_scheduled_tokens > 0 ++ num_reqs = self.input_batch.num_reqs ++ assert num_reqs > 0 ++ ++ # Some attention backends (namely MLA) may want to separate requests ++ # based on if the attention computation will be compute-bound or ++ # memory-bound. This gives them a hook to do that. ++ # modified_batch = self.attn_metadata_builder.reorder_batch( ++ # self.input_batch, scheduler_output) ++ # if modified_batch: ++ # self.input_batch.refresh_sampling_metadata() ++ ++ # OPTIMIZATION: Start copying the block table first. ++ # This way, we can overlap the copy with the following CPU operations. ++ self.input_batch.block_table.commit(num_reqs) ++ ++ # Get the number of scheduled tokens for each request. 
++ req_ids = self.input_batch.req_ids ++ tokens = [scheduler_output.num_scheduled_tokens[i] for i in req_ids] ++ num_scheduled_tokens = np.array(tokens, dtype=np.int32) ++ max_num_scheduled_tokens = max(tokens) ++ ++ # Get request indices. ++ # E.g., [2, 5, 3] -> [0, 0, 1, 1, 1, 1, 1, 2, 2, 2] ++ req_indices = np.repeat(self.arange_np[:num_reqs], ++ num_scheduled_tokens) ++ ++ # Get batched arange. ++ # E.g., [2, 5, 3] -> [0, 1, 0, 1, 2, 3, 4, 0, 1, 2] ++ # Equivalent to but faster than: ++ # np.concatenate([np.arange(n) for n in num_scheduled_tokens]) ++ # Step 1. [2, 5, 3] -> [2, 7, 10] ++ cu_num_tokens = np.cumsum(num_scheduled_tokens) ++ # Step 2. [2, 7, 10] -> [0, 0, 2, 2, 2, 2, 2, 7, 7, 7] ++ cumsums_offsets = np.repeat(cu_num_tokens - num_scheduled_tokens, ++ num_scheduled_tokens) ++ # Step 3. [0, 1, 0, 1, 2, 3, 4, 0, 1, 2] ++ arange = self.arange_np[:total_num_scheduled_tokens] - cumsums_offsets ++ ++ # Get positions. ++ positions_np = self.positions_np[:total_num_scheduled_tokens] ++ np.add(self.input_batch.num_computed_tokens_cpu[req_indices], ++ arange, ++ out=positions_np) ++ ++ # Calculate M-RoPE positions. ++ # Only relevant for models using M-RoPE (e.g, Qwen2-VL) ++ if self.uses_mrope: ++ self._calc_mrope_positions(scheduler_output) ++ ++ # Get token indices. ++ # E.g., [0, 1, 0, 1, 2, 3, 4, 0, 1, 2] ++ # -> [0, 1, M, M + 1, M + 2, M + 3, M + 4, 2 * M, 2 * M + 1, 2 * M + 2] ++ # where M is the max_model_len. ++ token_indices = (positions_np + ++ req_indices * self.input_batch.token_ids_cpu.shape[1]) ++ ++ # NOTE(woosuk): We use torch.index_select instead of np.take here ++ # because torch.index_select is much faster than np.take for large ++ # tensors. ++ torch.index_select(self.input_batch.token_ids_cpu_tensor.flatten(), ++ 0, ++ torch.from_numpy(token_indices), ++ out=self.input_ids_cpu[:total_num_scheduled_tokens]) ++ ++ # Calculate the slot mapping. ++ # E.g., [0, 1, 0, 1, 2, 3, 4, 0, 1, 2] ++ # -> [0, 0, K, K, K + 1, K + 1, K + 2, 2 * K, 2 * K, 2 * K + 1] ++ # where K is the max_num_blocks_per_req and the block size is 2. ++ # NOTE(woosuk): We can't simply use `token_indices // block_size` here ++ # because M (max_model_len) is not necessarily divisible by block_size. ++ block_table_indices = (req_indices * self.max_num_blocks_per_req + ++ positions_np // self.block_size) ++ # NOTE(woosuk): We use torch.index_select instead of np.take here ++ # because torch.index_select is much faster than np.take for large ++ # tensors. ++ block_table_cpu = self.input_batch.block_table.get_cpu_tensor() ++ block_numbers = block_table_cpu.flatten()[block_table_indices].numpy() ++ block_offsets = positions_np % self.block_size ++ np.add(block_numbers * self.block_size, ++ block_offsets, ++ out=self.slot_mapping_np[:total_num_scheduled_tokens]) ++ ++ # Prepare the attention metadata. ++ self.query_start_loc_np[0] = 0 ++ self.query_start_loc_np[1:num_reqs + 1] = cu_num_tokens ++ ++ self.seq_lens_np[:num_reqs] = ( ++ self.input_batch.num_computed_tokens_cpu[:num_reqs] + ++ num_scheduled_tokens) ++ ++ # Copy the tensors to the GPU. 
++ self.input_ids[:total_num_scheduled_tokens].copy_( ++ self.input_ids_cpu[:total_num_scheduled_tokens], non_blocking=True) ++ if self.uses_mrope: ++ # Only relevant for models using M-RoPE (e.g, Qwen2-VL) ++ self.mrope_positions[:, :total_num_scheduled_tokens].copy_( ++ self.mrope_positions_cpu[:, :total_num_scheduled_tokens], ++ non_blocking=True) ++ else: ++ # Common case (1D positions) ++ self.positions[:total_num_scheduled_tokens].copy_( ++ self.positions_cpu[:total_num_scheduled_tokens], ++ non_blocking=True) ++ ++ # Prepare for cascade attention if needed. ++ common_prefix_len = self._compute_cascade_attn_prefix_len( ++ num_scheduled_tokens, ++ scheduler_output.num_common_prefix_blocks, ++ ) ++ max_seq_len = self.seq_lens_np[:num_reqs].max() ++ query_start_loc = self.query_start_loc_cpu[:num_reqs + 1].to( ++ self.device, non_blocking=True) ++ seq_lens = self.seq_lens_cpu[:num_reqs].to(self.device, ++ non_blocking=True) ++ self.seq_start_loc_np[0] = 0 ++ np.cumsum(self.seq_lens_cpu[:num_reqs], out=self.seq_start_loc_np[1:num_reqs + 1]) ++ seq_start_loc = self.seq_start_loc_cpu[:num_reqs + 1].to( ++ self.device, non_blocking=True) ++ block_table = ( ++ self.input_batch.block_table.get_device_tensor()[:num_reqs]) ++ slot_mapping = self.slot_mapping_cpu[:total_num_scheduled_tokens].to( ++ self.device, non_blocking=True).long() ++ use_cascade = common_prefix_len > 0 ++ if use_cascade: ++ # TODO: Optimize. ++ cu_prefix_query_lens = torch.tensor([0, total_num_scheduled_tokens], ++ dtype=torch.int32, ++ device=self.device) ++ prefix_kv_lens = torch.tensor([common_prefix_len], ++ dtype=torch.int32, ++ device=self.device) ++ suffix_kv_lens = (self.seq_lens_np[:num_reqs] - ++ common_prefix_len) ++ suffix_kv_lens = torch.from_numpy(suffix_kv_lens).to( ++ self.device) ++ else: ++ cu_prefix_query_lens = None ++ prefix_kv_lens = None ++ suffix_kv_lens = None ++ attn_metadata = IPEXAttentionMetadata( ++ num_actual_tokens=total_num_scheduled_tokens, ++ max_query_len=max_num_scheduled_tokens, ++ query_start_loc=query_start_loc, ++ max_seq_len=max_seq_len, ++ seq_start_loc=seq_start_loc, ++ seq_lens=torch.empty(0, dtype=torch.int32, device=self.device), ++ block_table=block_table, ++ slot_mapping=slot_mapping, ++ use_cascade=use_cascade, ++ common_prefix_len=common_prefix_len, ++ cu_prefix_query_lens=cu_prefix_query_lens, ++ prefix_kv_lens=prefix_kv_lens, ++ suffix_kv_lens=suffix_kv_lens, ++ ) ++ use_spec_decode = len( ++ scheduler_output.scheduled_spec_decode_tokens) > 0 ++ if not use_spec_decode: ++ # NOTE(woosuk): Due to chunked prefills, the batch may contain ++ # partial requests. While we should not sample any token ++ # from these partial requests, we do so for simplicity. ++ # We will ignore the sampled tokens from the partial requests. ++ # TODO: Support prompt logprobs. ++ logits_indices = attn_metadata.query_start_loc[1:] - 1 ++ spec_decode_metadata = None ++ else: ++ # Get the number of draft tokens for each request. ++ # Iterate over the dictionary rather than all requests since not all ++ # requests have draft tokens. 
++ num_draft_tokens = np.zeros(num_reqs, dtype=np.int32) ++ for req_id, draft_token_ids in ( ++ scheduler_output.scheduled_spec_decode_tokens.items()): ++ req_idx = self.input_batch.req_id_to_index[req_id] ++ num_draft_tokens[req_idx] = len(draft_token_ids) ++ ++ spec_decode_metadata = self._calc_spec_decode_metadata( ++ num_draft_tokens, cu_num_tokens) ++ logits_indices = spec_decode_metadata.logits_indices ++ ++ # Hot-Swap lora model ++ if self.lora_config: ++ self.set_active_loras(self.input_batch, num_scheduled_tokens) ++ ++ return attn_metadata, logits_indices, spec_decode_metadata ++ ++ def profile_run(self) -> None: ++ # Trigger compilation for general shape. ++ hidden_states = self._dummy_run(self.max_num_tokens) ++ logits = self.model.compute_logits(hidden_states, None) ++ logits = logits[:self.max_num_tokens] ++ torch.xpu.synchronize() ++ gc.collect() +diff --git a/vllm/v1/worker/xpu_worker.py b/vllm/v1/worker/xpu_worker.py +new file mode 100644 +index 000000000..cc6d002eb +--- /dev/null ++++ b/vllm/v1/worker/xpu_worker.py +@@ -0,0 +1,178 @@ ++# SPDX-License-Identifier: Apache-2.0 ++import os ++from typing import Optional ++ ++import torch ++import torch.distributed ++ ++import vllm.envs as envs ++from vllm.config import ParallelConfig, VllmConfig ++from vllm.distributed import (ensure_model_parallel_initialized, ++ init_distributed_environment) ++from vllm.logger import init_logger ++from vllm.model_executor import set_random_seed ++from vllm.platforms import current_platform ++from vllm.v1.worker.gpu_worker import Worker ++from vllm.v1.worker.xpu_model_runner import XPUModelRunner ++ ++logger = init_logger(__name__) ++ ++ ++class XPUWorker(Worker): ++ """A XPU worker class.""" ++ ++ def __init__( ++ self, ++ vllm_config: VllmConfig, ++ local_rank: int, ++ rank: int, ++ distributed_init_method: str, ++ is_driver_worker: bool = False, ++ ): ++ super().__init__(vllm_config, local_rank, rank, ++ distributed_init_method, is_driver_worker) ++ device_config = self.device_config ++ assert device_config.device_type == "xpu" ++ assert current_platform.is_xpu() ++ ++ # Torch profiler. Enabled and configured through env vars: ++ # VLLM_TORCH_PROFILER_DIR=/path/to/save/trace ++ if envs.VLLM_TORCH_PROFILER_DIR: ++ torch_profiler_trace_dir = envs.VLLM_TORCH_PROFILER_DIR ++ logger.info("Profiling enabled. Traces will be saved to: %s", ++ torch_profiler_trace_dir) ++ self.profiler = torch.profiler.profile( ++ activities=[ ++ torch.profiler.ProfilerActivity.CPU, ++ torch.profiler.ProfilerActivity.XPU, ++ ], ++ with_stack=True, ++ on_trace_ready=torch.profiler.tensorboard_trace_handler( ++ torch_profiler_trace_dir, use_gzip=True)) ++ else: ++ self.profiler = None ++ ++ def compile_or_warm_up_model(self) -> None: ++ pass ++ ++ # we provide this function due to `torch.xpu.mem_get_info()` doesn't ++ # return correct free_gpu_memory on intel client GPU. We need to ++ # calculate/estiamte it. ++ def xpu_get_mem_info(self): ++ if current_platform.is_data_center_gpu(): ++ return torch.xpu.mem_get_info() ++ else: ++ _, total_gpu_memory = torch.xpu.mem_get_info() ++ # FIXME: memory_allocated() doesn't count non-torch allocations, ++ # and we don't have any API to get it. so we mark it as 128MB. 
++ used_memory = torch.xpu.memory_allocated() ++ non_torch_allocations = 128 * 1024 * 1024 ++ free_gpu_memory = total_gpu_memory - (used_memory + ++ non_torch_allocations) ++ return free_gpu_memory, total_gpu_memory ++ ++ @torch.inference_mode() ++ def determine_available_memory(self) -> int: ++ """Profiles the peak memory usage of the model to determine how many ++ KV blocks may be allocated without OOMs. ++ ++ The engine will first conduct a profiling of the existing memory usage. ++ Then, it calculate the maximum possible number of GPU and CPU blocks ++ that can be allocated with the remaining free memory. ++ ++ .. tip:: ++ You may limit the usage of GPU memory ++ by adjusting the `gpu_memory_utilization` parameter. ++ """ ++ # Profile the memory usage of the model and get the maximum number of ++ # cache blocks that can be allocated with the remaining free memory. ++ torch.xpu.empty_cache() ++ ++ # Execute a forward pass with dummy inputs to profile the memory usage ++ # of the model. ++ self.model_runner.profile_run() ++ ++ # Calculate the number of blocks that can be allocated with the ++ # profiled peak memory. ++ torch.xpu.synchronize() ++ used_memory = torch.xpu.memory_allocated() ++ reserved_memory = torch.xpu.memory_reserved() ++ total_gpu_memory = torch.xpu.get_device_properties( ++ self.local_rank).total_memory ++ free_gpu_memory = total_gpu_memory - used_memory ++ ++ # NOTE(woosuk): Here we assume that the other processes using the same ++ # GPU did not change their memory usage during the profiling. ++ peak_memory = self.init_gpu_memory - free_gpu_memory ++ assert peak_memory > 0, ( ++ "Error in memory profiling. " ++ f"Initial free memory {self.init_gpu_memory}, current free memory" ++ f" {free_gpu_memory}. This happens when the GPU memory was " ++ "not properly cleaned up before initializing the vLLM instance.") ++ ++ torch.xpu.empty_cache() ++ ++ available_kv_cache_memory = ( ++ total_gpu_memory * self.cache_config.gpu_memory_utilization - ++ peak_memory) ++ ++ self.cache_config.threshold_mem = reserved_memory + available_kv_cache_memory ++ return int(available_kv_cache_memory) ++ ++ def init_device(self): ++ if self.device_config.device.type == "xpu" and current_platform.is_xpu( ++ ): ++ self.device = torch.device(f"xpu:{self.local_rank}") ++ torch.xpu.set_device(self.device) ++ torch.xpu.empty_cache() ++ self.init_gpu_memory = torch.xpu.get_device_properties( ++ self.local_rank).total_memory ++ else: ++ raise RuntimeError( ++ f"Not support device type: {self.device_config.device}") ++ init_worker_distributed_environment(self.parallel_config, self.rank, ++ self.distributed_init_method, ++ self.local_rank) ++ # Set random seed. ++ set_random_seed(self.model_config.seed) ++ self.model_runner = XPUModelRunner( # type: ignore ++ self.vllm_config, self.device) ++ ++ ++def init_worker_distributed_environment( ++ parallel_config: ParallelConfig, ++ rank: int, ++ distributed_init_method: Optional[str] = None, ++ local_rank: int = -1, ++) -> None: ++ """Initialize the distributed environment.""" ++ ++ if torch.distributed.is_initialized(): ++ torch_world_size = torch.distributed.get_world_size() ++ if torch_world_size != parallel_config.world_size: ++ raise RuntimeError( ++ "torch.distributed is already initialized but the torch " ++ "world size does not match parallel_config.world_size " ++ f"({torch_world_size} vs. 
{parallel_config.world_size}).") ++ elif not distributed_init_method: ++ raise ValueError( ++ "distributed_init_method must be set if torch.distributed " ++ "is not already initialized") ++ else: ++ ENV_CCL_ATL_TRANSPORT = os.getenv("CCL_ATL_TRANSPORT", "ofi") ++ ENV_LOCAL_WORLD_SIZE = os.getenv("LOCAL_WORLD_SIZE", ++ str(parallel_config.world_size)) ++ os.environ["CCL_ATL_TRANSPORT"] = ENV_CCL_ATL_TRANSPORT ++ os.environ["LOCAL_WORLD_SIZE"] = ENV_LOCAL_WORLD_SIZE ++ os.environ["LOCAL_RANK"] = str(local_rank) ++ init_distributed_environment( ++ world_size=parallel_config.world_size, ++ rank=rank, ++ distributed_init_method=distributed_init_method, ++ local_rank=local_rank, ++ backend="ccl") ++ ++ ensure_model_parallel_initialized(parallel_config.tensor_parallel_size, ++ parallel_config.pipeline_parallel_size) ++ # global all_reduce needed for overall oneccl warm up ++ torch.distributed.all_reduce(torch.zeros(1).xpu()) +diff --git a/vllm/worker/xpu_model_runner.py b/vllm/worker/xpu_model_runner.py +index 9d49b4385..443fae0cf 100644 +--- a/vllm/worker/xpu_model_runner.py ++++ b/vllm/worker/xpu_model_runner.py +@@ -5,7 +5,7 @@ import time + import weakref + from collections import defaultdict + from dataclasses import dataclass +-from typing import (TYPE_CHECKING, Any, Callable, Dict, List, Optional, Tuple, ++from typing import (TYPE_CHECKING, Any, Callable, Dict, List, Optional, Set, Tuple, + Type, TypeVar) + + import torch +@@ -17,15 +17,23 @@ from vllm.distributed import get_pp_group + from vllm.forward_context import set_forward_context + from vllm.inputs import INPUT_REGISTRY, InputRegistry + from vllm.logger import init_logger ++from vllm.lora.layers import LoRAMapping ++from vllm.lora.request import LoRARequest ++from vllm.lora.worker_manager import LRUCacheWorkerLoRAManager + from vllm.model_executor import SamplingMetadataCache ++from vllm.model_executor.layers.rotary_embedding import MRotaryEmbedding + from vllm.model_executor.layers.sampler import SamplerOutput ++from vllm.model_executor.models import supports_lora, supports_multimodal + from vllm.model_executor.model_loader import get_model + from vllm.multimodal import (MULTIMODAL_REGISTRY, BatchedTensorInputs, + MultiModalKwargs, MultiModalPlaceholderMap, + MultiModalRegistry) ++from vllm.prompt_adapter.request import PromptAdapterRequest + from vllm.sampling_params import SamplingParams + from vllm.sequence import IntermediateTensors, SequenceGroupMetadata +-from vllm.utils import DeviceMemoryProfiler, GiB_bytes, make_tensor_with_pad ++from vllm.utils import ( ++ DeviceMemoryProfiler, GiB_bytes, make_tensor_with_pad, PyObjectCache, ++ is_pin_memory_available, flatten_2d_lists) + from vllm.worker.model_runner import AttentionMetadata, SamplingMetadata + from vllm.worker.model_runner_base import ( + ModelRunnerBase, ModelRunnerInputBase, ModelRunnerInputBuilderBase, +@@ -37,6 +45,8 @@ from vllm.worker.model_runner_base import ( + if TYPE_CHECKING: + from vllm.attention.backends.abstract import AttentionBackend + ++LORA_WARMUP_RANK = 8 ++ + logger = init_logger(__name__) + + _PAD_SLOT_ID = -1 +@@ -56,12 +66,16 @@ class ModelInputForXPU(ModelRunnerInputBase): + virtual_engine: Optional[int] = None + seq_lens: Optional[List[int]] = None + query_lens: Optional[List[int]] = None ++ lora_mapping: Optional["LoRAMapping"] = None ++ lora_requests: Optional[Set[LoRARequest]] = None + async_callback: Optional[Callable] = None + + def as_broadcastable_tensor_dict(self) -> Dict[str, Any]: + tensor_dict = { + "input_tokens": self.input_tokens, 
+ "input_positions": self.input_positions, ++ "lora_requests": self.lora_requests, ++ "lora_mapping": self.lora_mapping, + } + _add_attn_metadata_broadcastable_dict(tensor_dict, self.attn_metadata) + +@@ -85,11 +99,14 @@ class ModelInputForXPUWithSamplingMetadata(ModelInputForXPU): + Used by the ModelRunner. + """ + sampling_metadata: Optional["SamplingMetadata"] = None ++ is_prompt: Optional[bool] = None + + def as_broadcastable_tensor_dict(self) -> Dict[str, Any]: + tensor_dict = { + "input_tokens": self.input_tokens, + "input_positions": self.input_positions, ++ "lora_requests": self.lora_requests, ++ "lora_mapping": self.lora_mapping, + } + _add_attn_metadata_broadcastable_dict(tensor_dict, self.attn_metadata) + _add_sampling_metadata_broadcastable_dict(tensor_dict, +@@ -110,6 +127,245 @@ class ModelInputForXPUWithSamplingMetadata(ModelInputForXPU): + + + class ModelInputForXPUBuilder(ModelRunnerInputBuilderBase[ModelInputForXPU]): ++ class InterDataForSeqGroup: ++ """Intermediate data for the current sequence group.""" ++ ++ def simple_reinit(self): ++ self.input_tokens[0].clear() # type: ignore ++ self.input_positions[0].clear() # type: ignore ++ self.token_types[0].clear() # type: ignore ++ self.mrope_input_positions = None # type: ignore ++ self.seq_lens[0] = 0 # type: ignore ++ self.orig_seq_lens[0] = 0 # type: ignore ++ self.query_lens[0] = 0 # type: ignore ++ self.context_lens[0] = 0 # type: ignore ++ self.curr_sliding_window_blocks[0] = 0 # type: ignore ++ self.lora_index_mapping.clear() # type: ignore ++ self.lora_prompt_mapping.clear() # type: ignore ++ self.lora_requests.clear() # type: ignore ++ self.prompt_adapter_index_mapping.clear() # type: ignore ++ self.prompt_adapter_prompt_mapping.clear() # type: ignore ++ ++ def __init__( ++ self, ++ *, ++ # From sequence group metadata. ++ request_id: str, ++ seq_ids: List[int], ++ is_prompt: bool, ++ block_tables: Optional[Dict[int, List[int]]], ++ computed_block_nums: List[int], ++ n_seqs: int = 0, ++ ++ # Input tokens and positions. ++ input_tokens: Optional[List[List[int]]] = None, ++ input_positions: Optional[List[List[int]]] = None, ++ token_types: Optional[List[List[int]]] = None, ++ mrope_input_positions: Optional[List[List[List[int]]]] = None, ++ ++ # The sequence length (may be capped to the sliding window). ++ seq_lens: Optional[List[int]] = None, ++ # The original sequence length (before applying sliding window). ++ # This is used to compute slot mapping. ++ orig_seq_lens: Optional[List[int]] = None, ++ # The query length. ++ query_lens: Optional[List[int]] = None, ++ # The number of tokens that are already computed. ++ context_lens: Optional[List[int]] = None, ++ # The current sliding window block. ++ curr_sliding_window_blocks: Optional[List[int]] = None, ++ ++ # LoRA inputs. ++ lora_index_mapping: Optional[List[List[int]]] = None, ++ lora_prompt_mapping: Optional[List[List[int]]] = None, ++ lora_requests: Optional[Set[LoRARequest]] = None, ++ ++ # Prompt adapter inputs. ++ prompt_adapter_index_mapping: Optional[List[int]] = None, ++ prompt_adapter_prompt_mapping: Optional[List[int]] = None, ++ prompt_adapter_request: Optional[PromptAdapterRequest] = None, ++ ++ # Multi-modal inputs. ++ multi_modal_kwargs: Optional[MultiModalKwargs] = None, ++ multi_modal_placeholder_maps: Optional[Dict[ ++ str, MultiModalPlaceholderMap]] = None, ++ ++ # Whether the prefix cache is hit (prefill only). 
++ prefix_cache_hit: bool = False, ++ reinit: bool = False, ++ reinit_use_defaults: bool = False, ++ encoder_seq_len: int = 0, ++ ): ++ if reinit: ++ assert len(self.seq_ids) == len(seq_ids) # type: ignore ++ for i, seq_id in enumerate(seq_ids): ++ self.seq_ids[i] = seq_id # type: ignore ++ else: ++ self.seq_ids = seq_ids ++ ++ self.request_id = request_id ++ self.is_prompt = is_prompt ++ self.block_tables = block_tables ++ self.computed_block_nums = computed_block_nums ++ self.n_seqs = n_seqs ++ self.encoder_seq_len = encoder_seq_len ++ ++ if reinit: ++ if len(self.seq_ids) == 1 and reinit_use_defaults: ++ self.simple_reinit() ++ else: ++ if input_tokens: ++ self.input_tokens = input_tokens ++ else: ++ for seq_id in range(len(self.seq_ids)): ++ self.input_tokens[seq_id].clear() ++ ++ if input_positions: ++ self.input_positions = input_positions ++ else: ++ for seq_id in range(len(self.seq_ids)): ++ self.input_positions[seq_id].clear() ++ ++ if token_types: ++ self.token_types = token_types ++ else: ++ for seq_id in range(len(self.seq_ids)): ++ self.token_types[seq_id].clear() ++ ++ self.mrope_input_positions = None ++ ++ if seq_lens: ++ self.seq_lens = seq_lens ++ else: ++ for seq_id in range(len(self.seq_ids)): ++ self.seq_lens[seq_id] = 0 ++ ++ if orig_seq_lens: ++ self.orig_seq_lens = orig_seq_lens ++ else: ++ for seq_id in range(len(self.seq_ids)): ++ self.orig_seq_lens[seq_id] = 0 ++ ++ if query_lens: ++ self.query_lens = query_lens ++ else: ++ for seq_id in range(len(self.seq_ids)): ++ self.query_lens[seq_id] = 0 ++ ++ if context_lens: ++ self.context_lens = context_lens ++ else: ++ for seq_id in range(len(self.seq_ids)): ++ self.context_lens[seq_id] = 0 ++ ++ if curr_sliding_window_blocks: ++ self.curr_sliding_window_blocks = \ ++ curr_sliding_window_blocks ++ else: ++ for seq_id in range(len(self.seq_ids)): ++ self.curr_sliding_window_blocks[seq_id] = 0 ++ ++ if lora_index_mapping: ++ self.lora_index_mapping = lora_index_mapping ++ else: ++ self.lora_index_mapping.clear() ++ ++ if lora_prompt_mapping: ++ self.lora_prompt_mapping = lora_prompt_mapping ++ else: ++ self.lora_prompt_mapping.clear() ++ ++ if lora_requests: ++ self.lora_requests = lora_requests ++ else: ++ self.lora_requests.clear() ++ ++ if prompt_adapter_index_mapping: ++ self.prompt_adapter_index_mapping = \ ++ prompt_adapter_index_mapping ++ else: ++ self.prompt_adapter_index_mapping.clear() ++ ++ if prompt_adapter_prompt_mapping: ++ self.prompt_adapter_prompt_mapping = \ ++ prompt_adapter_prompt_mapping ++ else: ++ self.prompt_adapter_prompt_mapping.clear() ++ ++ else: ++ self.input_tokens = input_tokens or [] ++ self.input_positions = input_positions or [] ++ self.token_types = token_types or [] ++ self.mrope_input_positions = mrope_input_positions or None ++ self.seq_lens = seq_lens or [] ++ self.orig_seq_lens = orig_seq_lens or [] ++ self.query_lens = query_lens or [] ++ self.context_lens = context_lens or [] ++ self.curr_sliding_window_blocks = \ ++ curr_sliding_window_blocks or [] ++ ++ self.lora_index_mapping = lora_index_mapping or [] ++ self.lora_prompt_mapping = lora_prompt_mapping or [] ++ self.lora_requests = lora_requests or set() ++ ++ self.prompt_adapter_index_mapping = ( ++ prompt_adapter_index_mapping or []) ++ self.prompt_adapter_prompt_mapping = ( ++ prompt_adapter_prompt_mapping or []) ++ ++ self.prompt_adapter_request = prompt_adapter_request ++ self.multi_modal_kwargs = multi_modal_kwargs ++ self.multi_modal_placeholder_maps = multi_modal_placeholder_maps ++ self.prefix_cache_hit = 
prefix_cache_hit ++ ++ self.n_seqs = len(self.seq_ids) ++ ++ if not reinit: ++ self.__post_init__() ++ ++ def __post_init__(self): ++ self.n_seqs = len(self.seq_ids) ++ ++ self.input_tokens = [[] for _ in range(self.n_seqs)] ++ self.input_positions = [[] for _ in range(self.n_seqs)] ++ self.token_types = [[] for _ in range(self.n_seqs)] ++ self.mrope_input_positions = None ++ self.seq_lens = [0] * self.n_seqs ++ self.orig_seq_lens = [0] * self.n_seqs ++ self.query_lens = [0] * self.n_seqs ++ self.context_lens = [0] * self.n_seqs ++ self.curr_sliding_window_blocks = [0] * self.n_seqs ++ ++ self.lora_index_mapping = [] ++ self.lora_prompt_mapping = [] ++ ++ def gen_inter_data_builder(self, num_seqs: int): ++ return lambda: ModelInputForXPUBuilder.InterDataForSeqGroup( ++ request_id="", ++ seq_ids=[0] * num_seqs, ++ is_prompt=True, ++ block_tables=None, ++ computed_block_nums=[]) ++ ++ def init_cached_inter_data(self, *args, **kwargs): ++ assert len(args) == 0 ++ assert "seq_ids" in kwargs ++ seq_ids = kwargs["seq_ids"] ++ num_seqs = len(seq_ids) ++ ++ # The inter-data cache is per model_runner ++ inter_data_cache = self.runner.inter_data_cache ++ if num_seqs not in inter_data_cache: ++ inter_data_cache[num_seqs] = PyObjectCache( ++ self.gen_inter_data_builder(num_seqs)) ++ ++ obj = inter_data_cache[num_seqs].get_object() ++ obj.__init__(*args, **kwargs) ++ return obj ++ ++ def reset_cached_inter_data(self): ++ for cache in self.runner.inter_data_cache.values(): ++ cache.reset() + + def __init__(self, + runner: "XPUModelRunner", +@@ -121,15 +377,139 @@ class ModelInputForXPUBuilder(ModelRunnerInputBuilderBase[ModelInputForXPU]): + self.sliding_window = self.runner.sliding_window + self.block_size = self.runner.block_size + self.device = self.runner.device ++ self.enable_lora = self.runner.lora_config is not None ++ self.scheduler_config = self.runner.scheduler_config ++ self.chunked_prefill_enabled = ( ++ self.scheduler_config is not None ++ and self.scheduler_config.chunked_prefill_enabled) + + def prepare(self, + finished_requests_ids: Optional[List[str]] = None) -> None: + self.seq_group_metadata_list: List[SequenceGroupMetadata] = [] ++ self.decode_only = True ++ # Intermediate data (data in CPU before going to GPU) for ++ # the current sequence group. ++ self.inter_data_list: List[ ++ ModelInputForXPUBuilder.InterDataForSeqGroup] = [] ++ ++ def _compute_lens(self, inter_data: InterDataForSeqGroup, seq_idx: int, ++ seq_group_metadata: SequenceGroupMetadata): ++ """Compute context length, sequence length and tokens ++ for the given sequence data. ++ """ ++ seq_data = seq_group_metadata.seq_data[inter_data.seq_ids[seq_idx]] ++ token_chunk_size = seq_group_metadata.token_chunk_size ++ ++ # Compute context length (the number of tokens that are ++ # already computed) and sequence length (total number of tokens). ++ ++ seq_len = seq_data.get_len() ++ if inter_data.is_prompt: ++ context_len = seq_data.get_num_computed_tokens() ++ seq_len = min(seq_len, context_len + token_chunk_size) ++ elif self.runner.scheduler_config.is_multi_step or \ ++ self.runner.model_config.is_encoder_decoder: ++ context_len = seq_len - 1 ++ else: ++ context_len = seq_data.get_num_computed_tokens() ++ ++ # Compute tokens. 
++ tokens = seq_data.get_token_ids()[context_len:seq_len] ++ token_types = seq_group_metadata.token_type_ids ++ ++ inter_data.seq_lens[seq_idx] = seq_len ++ inter_data.orig_seq_lens[seq_idx] = seq_len ++ inter_data.context_lens[seq_idx] = context_len ++ inter_data.input_tokens[seq_idx].extend(tokens) ++ inter_data.input_positions[seq_idx].extend(range(context_len, seq_len)) ++ inter_data.token_types[seq_idx].extend( ++ token_types if token_types else []) ++ inter_data.query_lens[seq_idx] = seq_len - context_len ++ ++ if seq_data.mrope_position_delta is not None: ++ if inter_data.mrope_input_positions is None: ++ inter_data.mrope_input_positions = [None] * inter_data.n_seqs ++ ++ inter_data.mrope_input_positions[ ++ seq_idx] = MRotaryEmbedding.get_next_input_positions( ++ seq_data.mrope_position_delta, ++ context_len, ++ seq_len, ++ ) ++ ++ def _compute_lora_input(self, inter_data: InterDataForSeqGroup, ++ seq_idx: int, ++ seq_group_metadata: SequenceGroupMetadata): ++ """If LoRA is enabled, compute LoRA index and prompt mapping.""" ++ if not self.enable_lora: ++ return ++ ++ lora_id = seq_group_metadata.lora_int_id ++ if lora_id > 0: ++ inter_data.lora_requests.add(seq_group_metadata.lora_request) ++ query_len = inter_data.query_lens[seq_idx] ++ inter_data.lora_index_mapping.append([lora_id] * query_len) ++ sampling_params = seq_group_metadata.sampling_params ++ if sampling_params and sampling_params.prompt_logprobs is not None: ++ inter_data.lora_prompt_mapping.append([lora_id] * query_len) ++ elif not self.chunked_prefill_enabled or seq_group_metadata.do_sample: ++ inter_data.lora_prompt_mapping.append([lora_id]) ++ else: ++ inter_data.lora_prompt_mapping.append([]) + + def add_seq_group(self, seq_group_metadata: SequenceGroupMetadata): ++ seq_ids = seq_group_metadata.seq_data.keys() ++ n_seqs = len(seq_ids) ++ is_prompt = seq_group_metadata.is_prompt ++ ++ if is_prompt: ++ assert n_seqs == 1 ++ self.decode_only = False ++ ++ encoder_seq_len = 0 ++ ++ if self.runner.model_config.is_encoder_decoder: ++ encoder_seq_len = seq_group_metadata.encoder_seq_data.get_len() ++ ++ inter_data = self.init_cached_inter_data( ++ request_id=seq_group_metadata.request_id, ++ seq_ids=seq_ids, ++ is_prompt=is_prompt, ++ block_tables=seq_group_metadata.block_tables, ++ computed_block_nums=seq_group_metadata.computed_block_nums, ++ reinit=True, ++ reinit_use_defaults=True, ++ encoder_seq_len=encoder_seq_len) ++ ++ self.inter_data_list.append(inter_data) ++ ++ for seq_idx in range(n_seqs): ++ self._compute_lens(inter_data, seq_idx, seq_group_metadata) ++ self._compute_lora_input(inter_data, seq_idx, seq_group_metadata) ++ + self.seq_group_metadata_list.append(seq_group_metadata) + + def build(self) -> ModelInputForXPU: ++ # LoRA data. ++ lora_requests = set() ++ lora_mapping = None ++ if self.enable_lora: ++ lora_requests = set(r for data in self.inter_data_list ++ for r in data.lora_requests) ++ lora_index_mapping = flatten_2d_lists([ ++ flatten_2d_lists(inter_data.lora_index_mapping) ++ for inter_data in self.inter_data_list ++ ]) ++ lora_prompt_mapping = flatten_2d_lists([ ++ flatten_2d_lists(inter_data.lora_prompt_mapping) ++ for inter_data in self.inter_data_list ++ ]) ++ ++ lora_mapping = LoRAMapping( ++ **dict(index_mapping=lora_index_mapping, ++ prompt_mapping=lora_prompt_mapping, ++ is_prefill=not self.decode_only)) ++ + is_prompt = self.seq_group_metadata_list[0].is_prompt + # Prepare input tensors. 
+ if is_prompt: +@@ -150,6 +530,8 @@ class ModelInputForXPUBuilder(ModelRunnerInputBuilderBase[ModelInputForXPU]): + multi_modal_kwargs=multi_modal_kwargs, + seq_lens=seq_lens, + query_lens=seq_lens, ++ lora_mapping=lora_mapping, ++ lora_requests=lora_requests, + ) + + def _prepare_prompt( +@@ -388,6 +770,7 @@ class XPUModelRunner(ModelRunnerBase[ModelInputForXPUWithSamplingMetadata]): + self.return_hidden_states = return_hidden_states + + self.device = self.device_config.device ++ self.pin_memory = is_pin_memory_available() + + self.kv_cache_dtype = kv_cache_dtype + self.sliding_window = model_config.get_sliding_window() +@@ -410,6 +793,11 @@ class XPUModelRunner(ModelRunnerBase[ModelInputForXPUWithSamplingMetadata]): + + # Lazy initialization. + self.model: nn.Module # Set after init_Model ++ # Set after load_model. ++ self.lora_manager: Optional[LRUCacheWorkerLoRAManager] = None ++ ++ # Used to cache python objects ++ self.inter_data_cache: Dict[int, PyObjectCache] = {} + + self.sampling_metadata_cache: SamplingMetadataCache = \ + SamplingMetadataCache() \ +@@ -425,6 +813,25 @@ class XPUModelRunner(ModelRunnerBase[ModelInputForXPUWithSamplingMetadata]): + logger.info("Loading model weights took %.4f GiB", + self.model_memory_usage / GiB_bytes) + ++ if self.lora_config: ++ assert supports_lora(self.model), "Model does not support LoRA" ++ assert not supports_multimodal( ++ self.model ++ ), "To be tested: Multi-modal model with LoRA settings." ++ ++ self.lora_manager = LRUCacheWorkerLoRAManager( ++ self.scheduler_config.max_num_seqs, ++ self.scheduler_config.max_num_batched_tokens, ++ self.vocab_size, ++ self.lora_config, ++ self.device, ++ self.model.embedding_modules, ++ self.model.embedding_padding_modules, ++ max_position_embeddings=self.model.config. ++ max_position_embeddings, ++ ) ++ self.model = self.lora_manager.create_lora_manager(self.model) ++ + def get_model(self) -> nn.Module: + return self.model + +@@ -439,6 +846,30 @@ class XPUModelRunner(ModelRunnerBase[ModelInputForXPUWithSamplingMetadata]): + max_num_batched_tokens = self.scheduler_config.max_num_batched_tokens + max_num_seqs = self.scheduler_config.max_num_seqs + ++ # This represents the maximum number of different requests ++ # that will have unique loras, an therefore the max amount of memory ++ # consumption create dummy lora request copies from the lora request ++ # passed in, which contains a lora from the lora warmup path. ++ dummy_lora_requests: List[LoRARequest] = [] ++ dummy_lora_requests_per_seq: List[LoRARequest] = [] ++ if self.lora_config: ++ assert self.lora_manager is not None ++ with self.lora_manager.dummy_lora_cache(): ++ for idx in range(self.lora_config.max_loras): ++ lora_id = idx + 1 ++ dummy_lora_request = LoRARequest( ++ lora_name=f"warmup_{lora_id}", ++ lora_int_id=lora_id, ++ lora_path="/not/a/real/path", ++ ) ++ self.lora_manager.add_dummy_lora(dummy_lora_request, ++ rank=LORA_WARMUP_RANK) ++ dummy_lora_requests.append(dummy_lora_request) ++ dummy_lora_requests_per_seq = [ ++ dummy_lora_requests[idx % len(dummy_lora_requests)] ++ for idx in range(max_num_seqs) ++ ] ++ + # Profile memory usage with max_num_sequences sequences and the total + # number of tokens equal to max_num_batched_tokens. 
+ seqs: List[SequenceGroupMetadata] = [] +@@ -479,7 +910,8 @@ class XPUModelRunner(ModelRunnerBase[ModelInputForXPUWithSamplingMetadata]): + seq_data={group_id: dummy_data.seq_data}, + sampling_params=sampling_params, + block_tables=None, +- lora_request=None, ++ lora_request=dummy_lora_requests_per_seq[group_id] ++ if dummy_lora_requests_per_seq else None, + multi_modal_data=dummy_data.multi_modal_data, + multi_modal_placeholders=dummy_data.multi_modal_placeholders) + seqs.append(seq) +@@ -522,6 +954,8 @@ class XPUModelRunner(ModelRunnerBase[ModelInputForXPUWithSamplingMetadata]): + for seq_group_metadata in seq_group_metadata_list: + builder.add_seq_group(seq_group_metadata) + ++ self.builder.reset_cached_inter_data() ++ + return builder.build() # type: ignore + + def prepare_model_input( +@@ -538,17 +972,20 @@ class XPUModelRunner(ModelRunnerBase[ModelInputForXPUWithSamplingMetadata]): + seq_group_metadata_list, finished_requests_ids) + # Sampling metadata is only required for the final pp group + generators = self.get_generators(finished_requests_ids) +- sampling_metadata = SamplingMetadata.prepare( +- seq_group_metadata_list, +- model_input.seq_lens, +- model_input.query_lens, +- self.device, +- pin_memory=False, +- generators=generators, +- cache=self.sampling_metadata_cache) +- ++ if get_pp_group().is_last_rank: ++ # Sampling metadata is only required for the final pp group ++ generators = self.get_generators(finished_requests_ids) ++ sampling_metadata = SamplingMetadata.prepare( ++ seq_group_metadata_list, model_input.seq_lens, ++ model_input.query_lens, self.device, self.pin_memory, ++ generators, self.sampling_metadata_cache) ++ else: ++ sampling_metadata = None ++ is_prompt = (seq_group_metadata_list[0].is_prompt ++ if seq_group_metadata_list else None) + return dataclasses.replace(model_input, + sampling_metadata=sampling_metadata, ++ is_prompt=is_prompt, + virtual_engine=virtual_engine) + + @torch.inference_mode() +@@ -563,6 +1000,12 @@ class XPUModelRunner(ModelRunnerBase[ModelInputForXPUWithSamplingMetadata]): + raise ValueError( + "XPUModelRunner does not support multi-step execution.") + ++ if self.lora_config: ++ assert model_input.lora_requests is not None ++ assert model_input.lora_mapping is not None ++ self.set_active_loras(model_input.lora_requests, ++ model_input.lora_mapping) ++ + model_executable = self.model + if (self.observability_config is not None + and self.observability_config.collect_model_forward_time): +@@ -612,3 +1055,14 @@ class XPUModelRunner(ModelRunnerBase[ModelInputForXPUWithSamplingMetadata]): + output.model_forward_time = model_forward_time + + return [output] ++ ++ def set_active_loras(self, lora_requests: Set[LoRARequest], ++ lora_mapping: LoRAMapping) -> None: ++ if not self.lora_manager: ++ raise RuntimeError("LoRA is not enabled.") ++ self.lora_manager.set_active_adapters(lora_requests, lora_mapping) ++ ++ def add_lora(self, lora_request: LoRARequest) -> bool: ++ if not self.lora_manager: ++ raise RuntimeError("LoRA is not enabled.") ++ return self.lora_manager.add_adapter(lora_request) +diff --git a/vllm/worker/xpu_worker.py b/vllm/worker/xpu_worker.py +index 3aea0d741..203c7cd01 100644 +--- a/vllm/worker/xpu_worker.py ++++ b/vllm/worker/xpu_worker.py +@@ -18,13 +18,13 @@ from vllm.model_executor import set_random_seed + from vllm.platforms import current_platform + from vllm.worker.cache_engine import CacheEngine + from vllm.worker.worker import Worker +-from vllm.worker.worker_base import LoRANotSupportedWorkerBase, WorkerBase ++from 
vllm.worker.worker_base import WorkerBase + from vllm.worker.xpu_model_runner import XPUModelRunner + + logger = init_logger(__name__) + + +-class XPUWorker(LoRANotSupportedWorkerBase, Worker): ++class XPUWorker(Worker): + """A worker class that executes (a partition of) the model on a GPU. + + Each worker is associated with a single XPU device. The worker is diff --git a/vllm/tools/download_tool.sh b/vllm/tools/download_tool.sh new file mode 100644 index 0000000..9ea3fa4 --- /dev/null +++ b/vllm/tools/download_tool.sh @@ -0,0 +1,5 @@ +#!/bin/bash + +git clone https://github.com/oneapi-src/oneAPI-samples.git +cd oneAPI-samples +git apply ../../patch/oneapi-samples-enable-correctness-check.patch diff --git a/vllm/tools/native_bkc_setup.sh b/vllm/tools/native_bkc_setup.sh new file mode 100644 index 0000000..5869e8c --- /dev/null +++ b/vllm/tools/native_bkc_setup.sh @@ -0,0 +1,149 @@ +#!/bin/bash + +# Environment Setup Script for Intel GPU Multi-Arch Development +# Run this script as root to ensure consistent environment + +if [ "$(id -u)" -ne 0 ]; then + echo "[ERROR] This script must be run as root. Exiting." + exit 1 +fi + +# Enable strict mode +set -euo pipefail +trap 'echo "[ERROR] Script failed at line $LINENO."' ERR + +# Output both to terminal and log file +exec > >(tee -i /var/log/multi_arc_setup_env.log) 2>&1 + +echo -e "\n[INFO] Starting environment setup..." + +WORK_DIR=~/multi-arc +mkdir -p "$WORK_DIR" +cd "$WORK_DIR" + +# Proxy settings (optional, adjust as needed) +export https_proxy=http://child-prc.intel.com:913 +export http_proxy=http://child-prc.intel.com:913 +export no_proxy=127.0.0.1,*.intel.com + +# Internet access check +echo "[INFO] Testing internet access..." +if ! curl -s --connect-timeout 10 https://www.google.com >/dev/null; then + echo "[WARNING] Internet access through proxy may be unavailable." +fi + +echo -e "\n[INFO] Installing base libraries..." +apt update +apt install -y vim clinfo build-essential hwinfo net-tools openssh-server curl pkg-config flex bison libelf-dev libssl-dev libncurses-dev git libboost1.83-all-dev cmake libpng-dev docker.io docker-compose-v2 + +echo -e "\n[INFO] Adding Intel repository and graphics-testing PPA..." +if [ ! -f /usr/share/keyrings/oneapi-archive-keyring.gpg ]; then + wget -O- https://apt.repos.intel.com/intel-gpg-keys/GPG-PUB-KEY-INTEL-SW-PRODUCTS.PUB | gpg --dearmor | tee /usr/share/keyrings/oneapi-archive-keyring.gpg > /dev/null +fi + +echo "deb [signed-by=/usr/share/keyrings/oneapi-archive-keyring.gpg] https://apt.repos.intel.com/oneapi all main" > /etc/apt/sources.list.d/oneAPI.list +add-apt-repository -y ppa:kobuk-team/intel-graphics-testing +apt update + +echo -e "\n[INFO] Downloading and installing GPU firmware..." +FIRMWARE_DIR=$WORK_DIR/firmware +mkdir -p "$FIRMWARE_DIR" +cd "$FIRMWARE_DIR" +rm -rf ./* +wget https://gitlab.com/kernel-firmware/linux-firmware/-/raw/main/xe/bmg_guc_70.bin +wget https://gitlab.com/kernel-firmware/linux-firmware/-/raw/main/xe/bmg_huc.bin +zstd -1 bmg_guc_70.bin -o bmg_guc_70.bin.zst +zstd -1 bmg_huc.bin -o bmg_huc.bin.zst + +if [ -d /lib/firmware/xe ]; then + cp *.zst /lib/firmware/xe +else + echo "[ERROR] /lib/firmware/xe does not exist. Ensure your system supports Xe firmware." + exit 1 +fi + +echo -e "\n[INFO] Installing GPU base libraries..." +apt install -y libigdgmm12=22.7.2-0ubuntu1~25.04~ppa1 libigc2=2.11.9-1144~25.04 + +echo -e "\n[INFO] Installing Compute libraries..." 
+apt install -y libze1 libze-dev libze-intel-gpu1 intel-opencl-icd libze-intel-gpu-raytracing
+
+echo -e "\n[INFO] Installing Thread Building Blocks (TBB)..."
+apt install -y libtbb12=2022.0.0-2 libtbbmalloc2=2022.0.0-2
+
+echo -e "\n[INFO] Installing Media SDK and related drivers..."
+apt install -y intel-media-va-driver-non-free=25.2.2-0ubuntu1~25.04~ppa1
+apt install -y vainfo=2.22.0+ds1-2
+apt install -y libvpl2=1:2.15.0-0ubuntu1~25.04~ppa2
+apt install -y libvpl-tools=1.4.0-0ubuntu1~25.04~ppa1
+apt install -y libmfx-gen1=25.2.2-0ubuntu1~25.04~ppa1
+apt install -y libmfx-gen-dev=25.2.2-0ubuntu1~25.04~ppa1
+apt install -y va-driver-all=2.22.0-3ubuntu2
+
+echo -e "\n[INFO] Installing XPU manager libraries..."
+apt install -y libmetee4=4.3.0-0ubuntu1~25.04~ppa1
+apt install -y intel-gsc=0.9.5-0ubuntu1~25.04~ppa1
+apt install -y intel-metrics-discovery=1.14.180-0ubuntu1~25.04~ppa1
+apt install -y intel-metrics-library=1.0.196-0ubuntu1~25.04~ppa1
+
+echo -e "\n[INFO] Installing Mesa graphics libraries..."
+apt install -y libegl-mesa0=25.0.3-1ubuntu2
+apt install -y libegl1-mesa-dev=25.0.3-1ubuntu2
+apt install -y libgl1-mesa-dri=25.0.3-1ubuntu2
+apt install -y libgles2-mesa-dev=25.0.3-1ubuntu2
+apt install -y libglx-mesa0=25.0.3-1ubuntu2
+apt install -y libxatracker2=25.0.3-1ubuntu2
+apt install -y mesa-va-drivers=25.0.3-1ubuntu2
+apt install -y mesa-vdpau-drivers=25.0.3-1ubuntu2
+apt install -y mesa-vulkan-drivers=25.0.3-1ubuntu2
+
+echo -e "\n[INFO] Installing Intel OneAPI base toolkit..."
+apt install -y intel-oneapi-base-toolkit=2025.1.3-6
+
+echo -e "\n[INFO] Cloning and building level-zero-tests..."
+cd $WORK_DIR
+rm -rf level-zero-tests
+git clone https://github.com/oneapi-src/level-zero-tests.git
+cd level-zero-tests
+git checkout 6f4258713c57ed1668671e5c016633624602184d
+mkdir build && cd build
+cmake ..
+make -j$(nproc)
+
+echo -e "\n[INFO] Creating setup_perf.sh..."
+cd $WORK_DIR
+cat << 'EOF' > setup_perf.sh
+#!/bin/bash
+gpu_num=$(sudo xpu-smi discovery | grep card | wc -l)
+for((i=0; i<$gpu_num; i++)); do
+    echo "Set GPU $i freq to 2400MHz"
+    sudo xpu-smi config -d $i -t 0 --frequencyrange 2400,2400
+done
+
+echo "Set CPU to performance mode"
+echo "performance" | sudo tee /sys/devices/system/cpu/cpu*/cpufreq/scaling_governor
+echo 0 | sudo tee /sys/devices/system/cpu/cpu*/power/energy_perf_bias
+EOF
+
+chmod +x setup_perf.sh
+
+echo -e "\n[INFO] Configuring user groups and Wayland settings..."
+# Add the invoking (non-root) user to the render group; fall back to $USER when not run via sudo.
+gpasswd -a "${SUDO_USER:-$USER}" render || true
+if [ -f /etc/gdm3/custom.conf ]; then
+    sed -i "s/^#*WaylandEnable=.*/WaylandEnable=true/" /etc/gdm3/custom.conf
+fi
+update-initramfs -u
+
+echo -e "\n[INFO] Disabling intel_iommu..."
+GRUB_FILE="/etc/default/grub"
+if [ -f "$GRUB_FILE" ]; then
+    cp "$GRUB_FILE" "${GRUB_FILE}.bak"
+    sed -i 's/^GRUB_CMDLINE_LINUX_DEFAULT=.*/GRUB_CMDLINE_LINUX_DEFAULT="quiet splash intel_iommu=off"/' "$GRUB_FILE"
+    update-grub
+else
+    echo "[ERROR] Could not find $GRUB_FILE"
+    exit 1
+fi
+
+echo -e "\nTools and scripts are located at /root/multi-arc."
+echo -e "\n✅ [DONE] Environment setup complete. Please reboot your system to apply changes."
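
Post-reboot check: once the system is back up, the GPU stack configured above can be sanity-checked with tooling referenced by this script. The commands below are a minimal sketch; they assume clinfo and the oneAPI base toolkit installed above are present and that xpu-smi (invoked by setup_perf.sh) is on PATH, which may require installing the XPU Manager package separately. Output and device indices will vary by system.

    sudo xpu-smi discovery                  # enumerate GPUs (same tool setup_perf.sh relies on)
    clinfo -l                               # list OpenCL platforms and devices
    source /opt/intel/oneapi/setvars.sh
    sycl-ls                                 # list SYCL / Level Zero devices visible to oneAPI

The binaries produced by the level-zero-tests build above can provide a deeper functional check; the exact output path depends on the build layout.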