Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
107 changes: 107 additions & 0 deletions vllm/docker/Dockerfile
Original file line number Diff line number Diff line change
@@ -0,0 +1,107 @@
# Copyright (C) 2025 Intel Corporation
# SPDX-License-Identifier: Apache-2.0

# ======== Base Stage ========
# Builds vLLM v0.8.3 (with Intel multi-Arc patch) for Intel XPU devices on top
# of the oneAPI deep-learning-essentials devel image, plus a patched oneCCL
# build for multi-GPU collectives.
FROM intel/deep-learning-essentials:2025.0.2-0-devel-ubuntu24.04 AS vllm-base

# Optional proxy settings, forwarded via `docker build --build-arg ...`.
ARG https_proxy
ARG http_proxy

# Add the Intel oneAPI APT repo and the intel-graphics-testing PPA for GPU
# runtime packages.
# NOTE(review): wget and add-apt-repository are used here but only installed
# in the next RUN — presumably both ship with the devel base image; verify.
RUN wget -O- https://apt.repos.intel.com/intel-gpg-keys/GPG-PUB-KEY-INTEL-SW-PRODUCTS.PUB | gpg --dearmor | tee /usr/share/keyrings/oneapi-archive-keyring.gpg > /dev/null && \
echo "deb [signed-by=/usr/share/keyrings/oneapi-archive-keyring.gpg] https://apt.repos.intel.com/oneapi all main" | tee /etc/apt/sources.list.d/oneAPI.list && \
add-apt-repository -y ppa:kobuk-team/intel-graphics-testing

# Install system dependencies and Python 3.10 (from the deadsnakes PPA, since
# Ubuntu 24.04 ships python3.12 by default).
RUN apt-get update -y && \
apt-get install -y software-properties-common && \
add-apt-repository ppa:deadsnakes/ppa && \
apt-get update -y && \
apt-get install -y python3.10 python3.10-distutils && \
# Bootstrap pip for python3.10 (deadsnakes builds do not bundle pip).
curl -sS https://bootstrap.pypa.io/get-pip.py | python3.10 && \
apt-get install -y --no-install-recommends --fix-missing \
curl \
ffmpeg \
git \
libsndfile1 \
libsm6 \
libxext6 \
libgl1 \
lsb-release \
numactl \
wget \
vim \
linux-libc-dev && \
# Make python3.10 the default `python3` for the rest of the build and at runtime.
update-alternatives --install /usr/bin/python3 python3 /usr/bin/python3.10 1 && \
# Install Intel GPU runtime packages (Level Zero, OpenCL ICD, ray tracing).
apt-get update -y && \
apt-get install -y libze1 libze-dev libze-intel-gpu1 intel-opencl-icd libze-intel-gpu-raytracing && \
apt-get clean && rm -rf /var/lib/apt/lists/*

WORKDIR /llm
# Patches consumed by the vLLM and oneCCL builds below; removed near the end
# of this stage.
COPY ./patches/vllm_for_multi_arc.patch /tmp/
COPY ./patches/0001-oneccl-align-global-V0.1.1.patch /tmp/

# Set environment variables early so both the build steps and the final image
# see them.
ENV LD_LIBRARY_PATH="$LD_LIBRARY_PATH:/usr/local/lib/"
ENV VLLM_TARGET_DEVICE=xpu
ENV VLLM_WORKER_MULTIPROC_METHOD=spawn

# Clone vLLM v0.8.3, apply the multi-Arc patch, and install from source
# targeting XPU (VLLM_TARGET_DEVICE is set above).
RUN git clone -b v0.8.3 https://github.com/vllm-project/vllm.git && \
cd vllm && \
git apply /tmp/vllm_for_multi_arc.patch && \
pip install --no-cache-dir -r requirements/xpu.txt && \
python3 setup.py install

# ======= oneCCL build prerequisites =======
RUN apt-get update && apt-get install -y \
cmake \
g++ \
&& rm -rf /var/lib/apt/lists/*

# Build oneCCL at a pinned commit with the align-global patch, then install it
# into the oneAPI tree as version 2021.15.3 and point `latest` at it.
RUN git clone https://github.com/oneapi-src/oneCCL.git && \
cd oneCCL && \
git checkout def870543749186b6f38cdc865b44d52174c7492 && \
git apply /tmp/0001-oneccl-align-global-V0.1.1.patch && \
mkdir build && cd build && \
# Workaround required by the patched kernels (see patch commit message).
export IGC_VISAOptions=-activeThreadsOnlyBarrier && \
/usr/bin/cmake .. \
-DCMAKE_INSTALL_PREFIX=_install \
-DCMAKE_C_COMPILER=icx \
-DCMAKE_CXX_COMPILER=icpx \
-DCOMPUTE_BACKEND=dpcpp \
-DCCL_ENABLE_ARCB=1 && \
make -j && make install && \
mv _install /opt/intel/oneapi/ccl/2021.15.3 && \
cd /opt/intel/oneapi/ccl/ && \
ln -snf 2021.15.3 latest && \
# NOTE(review): environment changes made by `source setvars.sh` do not
# persist past this RUN layer; containers must source it at runtime (or the
# base image's entrypoint must do so) for the oneAPI env to take effect.
# `source` also requires bash — confirm the base image sets SHELL to bash.
source /opt/intel/oneapi/setvars.sh --force

WORKDIR /llm/vllm

# Clean up the patch files copied into /tmp earlier in this stage.
RUN rm -rf /tmp/*

CMD ["/bin/bash"]



# ======== OpenAI Serving Stage ========
# Thin layer over vllm-base that launches the OpenAI-compatible API server.
FROM vllm-base AS vllm-openai

ARG http_proxy
ARG https_proxy

# Install additional dependencies for the OpenAI API server; the pip cache is
# kept in a BuildKit cache mount so it never lands in the image.
RUN --mount=type=cache,target=/root/.cache/pip \
pip install accelerate hf_transfer 'modelscope!=1.15.0'

# Set additional environment for production usage.
ENV VLLM_USAGE_SOURCE=production-docker-image
ENV TRITON_XPU_PROFILE=1

# Install development dependencies (for testing); resolved relative to
# WORKDIR /llm/vllm inherited from the base stage.
RUN python3 -m pip install -e tests/vllm_test_utils
ENTRYPOINT ["python3", "-m", "vllm.entrypoints.openai.api_server"]
125 changes: 125 additions & 0 deletions vllm/patches/0001-oneccl-align-global-V0.1.1.patch
Original file line number Diff line number Diff line change
@@ -0,0 +1,125 @@
From 7f7a3d65541828d9889bfdec799bc23339e8e520 Mon Sep 17 00:00:00 2001
From: YongZhuIntel <yong.zhu@intel.com>
Date: Wed, 21 May 2025 09:37:06 +0800
Subject: [PATCH] oneccl align global V0.1.1

Based on public branch release/ccl_2021.15.3-arc (def870543749186b6f38cdc865b44d52174c7492)

Build:
1. mkdir build; cd build
2. source /opt/intel/oneapi/setvars.sh
3. export IGC_VISAOptions=-activeThreadsOnlyBarrier
4. cmake .. -DCMAKE_INSTALL_PREFIX=_install -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx -DCOMPUTE_BACKEND=dpcpp -DCCL_ENABLE_ARCB=1 && make -j && make install

Print algorithm bandwidth (algbw) and bus bandwidth (busbw) columns in the benchmark output and CSV.
---
examples/benchmark/include/benchmark.hpp | 40 +++++++++++++++++++++---
examples/benchmark/src/benchmark.cpp | 7 +++--
2 files changed, 41 insertions(+), 6 deletions(-)

diff --git a/examples/benchmark/include/benchmark.hpp b/examples/benchmark/include/benchmark.hpp
index 08a3625..bff6275 100644
--- a/examples/benchmark/include/benchmark.hpp
+++ b/examples/benchmark/include/benchmark.hpp
@@ -377,7 +377,9 @@ void store_to_csv(const user_options_t& options,
double max_time,
double avg_time,
double stddev,
- double wait_avg_time) {
+ double wait_avg_time,
+ double algbw,
+ double busbw) {
std::ofstream csvf;
csvf.open(options.csv_filepath, std::ofstream::out | std::ofstream::app);

@@ -396,7 +398,7 @@ void store_to_csv(const user_options_t& options,
<< "," << ccl::get_datatype_size(dtype) << "," << elem_count << ","
<< ccl::get_datatype_size(dtype) * elem_count << "," << buf_count << ","
<< iter_count << "," << min_time << "," << max_time << "," << avg_time << ","
- << stddev << "," << wait_avg_time << std::endl;
+ << stddev << "," << wait_avg_time << "," << algbw << "," << busbw << std::endl;
}
csvf.close();
}
@@ -472,13 +474,41 @@ void print_timings(const ccl::communicator& comm,
max_time /= iter_count;

size_t bytes = elem_count * ccl::get_datatype_size(dtype) * buf_count;
+
+ double algbw = bytes*1000/total_avg_time/1024/1024;
+
+ if (ncolls == 1) {
+ if (options.coll_names.front() == "allgather" ||
+ options.coll_names.front() == "allgatherv" ||
+ options.coll_names.front() == "reducescatter" ||
+ options.coll_names.front() == "alltoall" ||
+ options.coll_names.front() == "alltoallv") {
+ algbw = algbw * nranks;
+ }
+ }
+
+ double busbw = algbw;
+ if (ncolls == 1) {
+ if (options.coll_names.front() == "allreduce") {
+ busbw = algbw * 2 * (nranks -1) / nranks;
+ } else if (options.coll_names.front() == "allgather" ||
+ options.coll_names.front() == "allgatherv" ||
+ options.coll_names.front() == "reducescatter" ||
+ options.coll_names.front() == "alltoall" ||
+ options.coll_names.front() == "alltoallv") {
+ busbw = algbw * (nranks -1) / nranks;
+ }
+ }
+
std::stringstream ss;
ss << std::right << std::fixed << std::setw(COL_WIDTH) << bytes << std::setw(COL_WIDTH)
<< elem_count * buf_count << std::setw(COL_WIDTH) << iter_count << std::setw(COL_WIDTH)
<< std::setprecision(COL_PRECISION) << min_time << std::setw(COL_WIDTH)
<< std::setprecision(COL_PRECISION) << max_time << std::setw(COL_WIDTH)
<< std::setprecision(COL_PRECISION) << total_avg_time << std::setw(COL_WIDTH - 3)
- << std::setprecision(COL_PRECISION) << stddev << std::setw(COL_WIDTH + 3);
+ << std::setprecision(COL_PRECISION) << stddev << std::setw(COL_WIDTH)
+ << std::setprecision(COL_PRECISION) << algbw << std::setw(COL_WIDTH)
+ << std::setprecision(COL_PRECISION) << busbw << std::setw(COL_WIDTH + 3);

if (show_extened_info(options.show_additional_info)) {
ss << std::right << std::fixed << std::setprecision(COL_PRECISION) << wait_avg_time;
@@ -497,7 +527,9 @@ void print_timings(const ccl::communicator& comm,
max_time,
total_avg_time,
stddev,
- wait_avg_time);
+ wait_avg_time,
+ algbw,
+ busbw);
}
}

diff --git a/examples/benchmark/src/benchmark.cpp b/examples/benchmark/src/benchmark.cpp
index d90fb9b..78957f2 100644
--- a/examples/benchmark/src/benchmark.cpp
+++ b/examples/benchmark/src/benchmark.cpp
@@ -105,7 +105,8 @@ void run(ccl::communicator& service_comm,
<< "#elem_count" << std::setw(COL_WIDTH) << "#repetitions"
<< std::setw(COL_WIDTH) << "t_min[usec]" << std::setw(COL_WIDTH) << "t_max[usec]"
<< std::setw(COL_WIDTH) << "t_avg[usec]" << std::setw(COL_WIDTH - 3)
- << "stddev[%]";
+ << "stddev[%]" << std::setw(COL_WIDTH) << "algbw[GB/s]" << std::setw(COL_WIDTH)
+ << "busbw[GB/s]";

if (show_extened_info(options.show_additional_info)) {
ss << std::right << std::setw(COL_WIDTH + 3) << "wait_t_avg[usec]";
@@ -435,7 +436,9 @@ int main(int argc, char* argv[]) {
<< "t_max[usec],"
<< "t_avg[usec],"
<< "stddev[%],"
- << "wait_t_avg[usec]" << std::endl;
+ << "wait_t_avg[usec],"
+ << "algbw[GB/s],"
+ << "busbw[GB/s]" << std::endl;
csvf.close();
}

--
2.25.1

118 changes: 118 additions & 0 deletions vllm/patches/oneapi-samples-enable-correctness-check.patch
Original file line number Diff line number Diff line change
@@ -0,0 +1,118 @@
From 592d83da1e8d00b4436b51ccf17e7d0bc9564e24 Mon Sep 17 00:00:00 2001
From: jilongW <109333127+jilongW@users.noreply.github.com>
Date: Fri, 6 Jun 2025 14:27:43 +0800
Subject: [PATCH] add compare (#1)

* add comparison

* update compare

* remove extra file
---
.../oneMKL/matrix_mul_mkl/matrix_mul_mkl.cpp | 66 ++++++++++++++++---
1 file changed, 56 insertions(+), 10 deletions(-)

diff --git a/Libraries/oneMKL/matrix_mul_mkl/matrix_mul_mkl.cpp b/Libraries/oneMKL/matrix_mul_mkl/matrix_mul_mkl.cpp
index 427505b7..989c00ad 100644
--- a/Libraries/oneMKL/matrix_mul_mkl/matrix_mul_mkl.cpp
+++ b/Libraries/oneMKL/matrix_mul_mkl/matrix_mul_mkl.cpp
@@ -41,17 +41,55 @@ bool test(queue &Q, int M, int N, int K)
constexpr int rd_size = 1048576;
std::vector<T> host_vector(rd_size);
auto host_data = host_vector.data();
-
+ std::vector<T> correct_host_vector(rd_size);
+ auto correct_host_data = correct_host_vector.data();
/* Measure time for a given number of GEMM calls */
- auto time_gemms = [=, &Q](int runs) -> double {
+ bool verify = false;
+ auto time_gemms = [=, &Q, &host_data](int runs, bool verify=false) -> std::tuple<double, int> {
using namespace oneapi::mkl;
using namespace std::chrono;
auto start = steady_clock::now();
- for (int i = 0; i < runs; i++)
+ int ok = 0;
+ if (verify == false){
+ for (int i = 0; i < runs; i++)
+ blas::gemm(Q, transpose::N, transpose::N, M, N, K, 1, A, lda, B, ldb, 0, C, ldc);
+ Q.wait_and_throw();
+ auto end = steady_clock::now();
+ return std::make_tuple(duration<double>(end - start).count(), ok);
+ }
+ else{
+ size_t elems = std::min(ldc * N, rd_size);
+
blas::gemm(Q, transpose::N, transpose::N, M, N, K, 1, A, lda, B, ldb, 0, C, ldc);
- Q.wait_and_throw();
- auto end = steady_clock::now();
- return duration<double>(end - start).count();
+ Q.wait_and_throw();
+ Q.copy(C, correct_host_data, elems).wait();
+ auto end = steady_clock::now();
+ auto used_time = duration<double>(end - start).count();
+
+ // correct_host_data[0] += 1.0;
+ for (int i = 1; i < runs; i++){
+ start = steady_clock::now();
+ blas::gemm(Q, transpose::N, transpose::N, M, N, K, 1, A, lda, B, ldb, 0, C, ldc);
+ Q.wait_and_throw();
+ end = steady_clock::now();
+ used_time += duration<double>(end - start).count();
+ Q.copy(C, host_data, elems).wait();
+ int linear_id = 0;
+ for (size_t j = 0; j < N; j++) {
+ for (size_t k = 0; k < M; k++) {
+ linear_id = j*ldc + k;
+ if (linear_id >= elems) break;
+ if (host_data[linear_id] != correct_host_data[linear_id]) {
+ ok = i;
+ return std::make_tuple(duration<double>(end - start).count(), ok);
+ }
+ }
+ if (linear_id >= elems) break;
+ }
+
+ }
+ return std::make_tuple(used_time, ok);
+ }
};

/* Fill A/B with all ones to verify correctness */
@@ -91,13 +129,15 @@ bool test(queue &Q, int M, int N, int K)

/* Time one GEMM call, and estimate how many calls will be required to keep the
* GPU busy for 1s. */
- auto tare = time_gemms(1);
+ auto [tare, _] = time_gemms(1, true);
int ncalls = std::max(4, std::min(1000, int(1. / tare)));

/* Time that many GEMMs, subtracting the first call time to remove host overhead.
* This gives a better idea of device performance. */
std::cout << " -> Timing...\n";
- auto time = time_gemms(ncalls + 1) - tare;
+ auto [time, result] = time_gemms(ncalls + 1, true);
+ time -= tare;
+
auto avg = time / ncalls;

/* Calculate and display performance */
@@ -114,8 +154,14 @@ bool test(queue &Q, int M, int N, int K)
flops *= 1e-3;
unit = 'P';
}
-
- std::cout << "\nAverage performance: " << flops << unit << 'F' << "\n";
+ if (result != 0){
+ std::cout << "gemm FAILS" << " for type: " << type_string<T>() << " on " << result <<" times run!"<< "\n";
+ }
+ else{
+ std::cout << "gemm Passes" << " for type: " << type_string<T>() << "!\n";
+ std::cout << "\nAverage performance: " << flops << unit << 'F' << "\n";
+ }
+

/* Free data */
free(C, Q);
--
2.34.1

Loading