152 changes: 89 additions & 63 deletions docker/llm/serving/xpu/docker/Dockerfile
@@ -1,4 +1,62 @@
FROM intel/oneapi-basekit:2024.1.1-devel-ubuntu22.04
# First stage: build oneccl
FROM intel/oneapi-basekit:2025.0.1-0-devel-ubuntu22.04 AS build

ARG http_proxy
ARG https_proxy

ENV TZ=Asia/Shanghai
ENV PYTHONUNBUFFERED=1

ARG PIP_NO_CACHE_DIR=false

ADD ./ccl_torch.patch /tmp/

RUN apt-get update && \
apt-get install -y --no-install-recommends curl wget git libunwind8-dev vim less && \
ln -snf /usr/share/zoneinfo/$TZ /etc/localtime && echo $TZ > /etc/timezone && \
env DEBIAN_FRONTEND=noninteractive apt-get update && \
# add-apt-repository requires gnupg, gpg-agent, software-properties-common
apt-get install -y --no-install-recommends gnupg gpg-agent software-properties-common && \
# Add Python 3.11 PPA repository
add-apt-repository ppa:deadsnakes/ppa -y && \
apt-get install -y --no-install-recommends python3.11 git curl wget && \
rm /usr/bin/python3 && \
ln -s /usr/bin/python3.11 /usr/bin/python3 && \
ln -s /usr/bin/python3 /usr/bin/python && \
apt-get install -y --no-install-recommends python3-pip python3.11-dev python3-wheel python3.11-distutils && \
wget https://bootstrap.pypa.io/get-pip.py -O get-pip.py && \
# Installing FastChat from source requires PEP 660 support
python3 get-pip.py && \
rm get-pip.py && \
pip install --upgrade requests argparse urllib3 && \
apt-get install -y --no-install-recommends libfabric-dev wrk libaio-dev numactl && \
# Without installing this compute runtime, the build will fail later
mkdir -p /tmp/neo && \
cd /tmp/neo && \
wget https://github.com/intel/intel-graphics-compiler/releases/download/v2.5.6/intel-igc-core-2_2.5.6+18417_amd64.deb && \
wget https://github.com/intel/intel-graphics-compiler/releases/download/v2.5.6/intel-igc-opencl-2_2.5.6+18417_amd64.deb && \
wget https://github.com/intel/compute-runtime/releases/download/24.52.32224.5/intel-level-zero-gpu-dbgsym_1.6.32224.5_amd64.ddeb && \
wget https://github.com/intel/compute-runtime/releases/download/24.52.32224.5/intel-level-zero-gpu_1.6.32224.5_amd64.deb && \
wget https://github.com/intel/compute-runtime/releases/download/24.52.32224.5/intel-opencl-icd-dbgsym_24.52.32224.5_amd64.ddeb && \
wget https://github.com/intel/compute-runtime/releases/download/24.52.32224.5/intel-opencl-icd_24.52.32224.5_amd64.deb && \
wget https://github.com/intel/compute-runtime/releases/download/24.52.32224.5/libigdgmm12_22.5.5_amd64.deb && \
dpkg -i *.deb && \
pip install --pre --upgrade ipex-llm[xpu_2.6] --extra-index-url https://download.pytorch.org/whl/test/xpu && \
mkdir /build && \
cd /build && \
git clone https://github.com/intel/torch-ccl.git && \
cd torch-ccl && \
git checkout ccl_torch2.5.0+xpu && \
git submodule sync && \
git submodule update --init --recursive && \
# This patch enables building torch-ccl in a PyTorch 2.6 environment
git apply /tmp/ccl_torch.patch && \
USE_SYSTEM_ONECCL=ON COMPUTE_BACKEND=dpcpp python setup.py bdist_wheel
# File path: /build/torch-ccl/dist/oneccl_bind_pt-2.5.0+xpu-cp311-cp311-linux_x86_64.whl

FROM intel/oneapi-basekit:2025.0.1-0-devel-ubuntu22.04

COPY --from=build /build/torch-ccl/dist/oneccl_bind_pt-2.5.0+xpu-cp311-cp311-linux_x86_64.whl /opt/oneccl_bind_pt-2.5.0+xpu-cp311-cp311-linux_x86_64.whl

ARG http_proxy
ARG https_proxy
@@ -11,22 +69,12 @@ ENV VLLM_RPC_TIMEOUT=100000

# Disable pip's cache behavior
ARG PIP_NO_CACHE_DIR=false
ADD ./gradio_web_server.patch /tmp/gradio_web_server.patch
ADD ./oneccl-binding.patch /tmp/oneccl-binding.patch

RUN wget -O- https://apt.repos.intel.com/intel-gpg-keys/GPG-PUB-KEY-INTEL-SW-PRODUCTS.PUB | gpg --dearmor | tee /usr/share/keyrings/intel-oneapi-archive-keyring.gpg > /dev/null && \
echo "deb [signed-by=/usr/share/keyrings/intel-oneapi-archive-keyring.gpg] https://apt.repos.intel.com/oneapi all main " | tee /etc/apt/sources.list.d/oneAPI.list && \
chmod 644 /usr/share/keyrings/intel-oneapi-archive-keyring.gpg && \
rm /etc/apt/sources.list.d/intel-graphics.list && \
wget -O- https://repositories.intel.com/graphics/intel-graphics.key | gpg --dearmor | tee /usr/share/keyrings/intel-graphics.gpg > /dev/null && \
echo "deb [arch=amd64,i386 signed-by=/usr/share/keyrings/intel-graphics.gpg] https://repositories.intel.com/graphics/ubuntu jammy arc" | tee /etc/apt/sources.list.d/intel.gpu.jammy.list && \
chmod 644 /usr/share/keyrings/intel-graphics.gpg && \
apt-get update && \
RUN apt-get update && \
apt-get install -y --no-install-recommends curl wget git libunwind8-dev vim less && \
ln -snf /usr/share/zoneinfo/$TZ /etc/localtime && echo $TZ > /etc/timezone && \
env DEBIAN_FRONTEND=noninteractive apt-get update && \
# add-apt-repository requires gnupg, gpg-agent, software-properties-common
apt-get install -y --no-install-recommends gnupg gpg-agent software-properties-common && \
apt-get install -y --no-install-recommends gnupg gpg-agent software-properties-common kmod && \
# Add Python 3.11 PPA repository
add-apt-repository ppa:deadsnakes/ppa -y && \
apt-get install -y --no-install-recommends python3.11 git curl wget && \
@@ -35,81 +83,59 @@ RUN wget -O- https://apt.repos.intel.com/intel-gpg-keys/GPG-PUB-KEY-INTEL-SW-PRO
ln -s /usr/bin/python3 /usr/bin/python && \
apt-get install -y --no-install-recommends python3-pip python3.11-dev python3-wheel python3.11-distutils && \
wget https://bootstrap.pypa.io/get-pip.py -O get-pip.py && \
# Installing FastChat from source requires PEP 660 support
python3 get-pip.py && \
rm get-pip.py && \
pip install --upgrade requests argparse urllib3 && \
pip install --pre --upgrade ipex-llm[xpu,serving] --extra-index-url https://pytorch-extension.intel.com/release-whl/stable/xpu/cn/ && \
pip install --pre --upgrade ipex-llm[xpu_2.6] --extra-index-url https://download.pytorch.org/whl/test/xpu && \
pip install transformers_stream_generator einops tiktoken && \
pip install --upgrade colorama && \
# Download all-in-one benchmark and examples
git clone https://github.com/intel-analytics/ipex-llm && \
# The following commented-out segment is used when building from source...
# cd ipex-llm && \
# git fetch origin pull/12338/head:local_pr && \
# git checkout local_pr && \
# pip uninstall -y ipex-llm && \
# cd python/llm && \
# python setup.py install && \
# cd ../../../ && \
git clone https://github.com/intel/ipex-llm.git && \
cp -r ./ipex-llm/python/llm/dev/benchmark/ ./benchmark && \
cp -r ./ipex-llm/python/llm/example/GPU/HuggingFace/LLM ./examples && \
cp -r ./ipex-llm/python/llm/example/GPU/vLLM-Serving/ ./vLLM-Serving && \
rm -rf ./ipex-llm && \
# Install vllm dependencies
pip install --upgrade fastapi && \
pip install --upgrade "uvicorn[standard]" && \
# Download vLLM-Serving
cp -r ./ipex-llm/python/llm/example/GPU/vLLM-Serving/ ./vLLM-Serving && \
rm -rf ./ipex-llm && \
# Install torch-ccl
cd /tmp/ && \
pip install torch==2.1.0.post2 torchvision==0.16.0.post2 torchaudio==2.1.0.post2 intel-extension-for-pytorch==2.1.30.post0 --extra-index-url https://pytorch-extension.intel.com/release-whl/stable/xpu/cn/ && \
# Internal oneccl
wget https://sourceforge.net/projects/oneccl-wks/files/2024.0.0.6.5-release/oneccl_wks_installer_2024.0.0.6.5.sh && \
bash oneccl_wks_installer_2024.0.0.6.5.sh && \
git clone https://github.com/intel/torch-ccl -b v2.1.300+xpu && \
cd torch-ccl && \
patch -p1 < /tmp/oneccl-binding.patch && \
USE_SYSTEM_ONECCL=ON COMPUTE_BACKEND=dpcpp python setup.py install && \
pip install /opt/oneccl_bind_pt-2.5.0+xpu-cp311-cp311-linux_x86_64.whl && \
# Install internal oneCCL
cd /opt && \
wget https://sourceforge.net/projects/oneccl-wks/files/2025.0.0.6.6-release/oneccl_wks_installer_2025.0.0.6.6.sh && \
bash oneccl_wks_installer_2025.0.0.6.6.sh && \
apt-get update && \
apt-get install -y --no-install-recommends libfabric-dev wrk libaio-dev numactl && \
# apt-get install -y intel-opencl-icd intel-level-zero-gpu=1.3.26241.33-647~22.04 level-zero level-zero-dev --allow-downgrades && \
# Install compute runtime
mkdir -p /tmp/neo && \
cd /tmp/neo && \
wget https://github.com/oneapi-src/level-zero/releases/download/v1.18.5/level-zero_1.18.5+u22.04_amd64.deb && \
wget https://github.com/intel/intel-graphics-compiler/releases/download/igc-1.0.17791.9/intel-igc-core_1.0.17791.9_amd64.deb && \
wget https://github.com/intel/intel-graphics-compiler/releases/download/igc-1.0.17791.9/intel-igc-opencl_1.0.17791.9_amd64.deb && \
wget https://github.com/intel/compute-runtime/releases/download/24.39.31294.12/intel-level-zero-gpu_1.6.31294.12_amd64.deb && \
wget https://github.com/intel/compute-runtime/releases/download/24.39.31294.12/intel-opencl-icd_24.39.31294.12_amd64.deb && \
wget https://github.com/intel/compute-runtime/releases/download/24.39.31294.12/libigdgmm12_22.5.2_amd64.deb && \
wget https://github.com/intel/intel-graphics-compiler/releases/download/v2.5.6/intel-igc-core-2_2.5.6+18417_amd64.deb && \
wget https://github.com/intel/intel-graphics-compiler/releases/download/v2.5.6/intel-igc-opencl-2_2.5.6+18417_amd64.deb && \
wget https://github.com/intel/compute-runtime/releases/download/24.52.32224.5/intel-level-zero-gpu-dbgsym_1.6.32224.5_amd64.ddeb && \
wget https://github.com/intel/compute-runtime/releases/download/24.52.32224.5/intel-level-zero-gpu_1.6.32224.5_amd64.deb && \
wget https://github.com/intel/compute-runtime/releases/download/24.52.32224.5/intel-opencl-icd-dbgsym_24.52.32224.5_amd64.ddeb && \
wget https://github.com/intel/compute-runtime/releases/download/24.52.32224.5/intel-opencl-icd_24.52.32224.5_amd64.deb && \
wget https://github.com/intel/compute-runtime/releases/download/24.52.32224.5/libigdgmm12_22.5.5_amd64.deb && \
dpkg -i *.deb && \
rm -rf /tmp/neo && \
mkdir -p /llm && \
cd /llm && \
git clone -b 0.6.2 https://github.com/analytics-zoo/vllm.git /llm/vllm && \
rm -rf /tmp/neo && \
# Install vllm
git clone -b 0.6.6-pre https://github.com/analytics-zoo/vllm.git /llm/vllm && \
cd /llm/vllm && \
pip install setuptools-scm && \
pip install --upgrade cmake && \
VLLM_TARGET_DEVICE=xpu pip install --no-build-isolation -v /llm/vllm && \
# pip install -r /llm/vllm/requirements-xpu.txt && \
# VLLM_TARGET_DEVICE=xpu python setup.py install && \
pip install mpi4py fastapi uvicorn openai && \
pip install gradio==4.43.0 && \
# pip install transformers==4.44.2 && \
# patch /usr/local/lib/python3.11/dist-packages/fastchat/serve/gradio_web_server.py < /tmp/gradio_web_server.patch && \
pip install ray && \
patch /usr/local/lib/python3.11/dist-packages/fastchat/serve/gradio_web_server.py < /tmp/gradio_web_server.patch
pip install ray

COPY ./vllm_online_benchmark.py /llm/
COPY ./vllm_offline_inference.py /llm/
COPY ./vllm_online_benchmark.py /llm/
COPY ./vllm_offline_inference.py /llm/
COPY ./vllm_offline_inference_vision_language.py /llm/
COPY ./payload-1024.lua /llm/
COPY ./start-vllm-service.sh /llm/
COPY ./benchmark_vllm_throughput.py /llm/
COPY ./benchmark_vllm_latency.py /llm/
COPY ./start-fastchat-service.sh /llm/
COPY ./start-pp_serving-service.sh /llm/
COPY ./start-lightweight_serving-service.sh /llm/

ENV LD_LIBRARY_PATH /usr/local/lib/python3.11/dist-packages/intel_extension_for_pytorch/lib/:/opt/intel/oneapi/tbb/2021.12/env/../lib/intel64/gcc4.8:/opt/intel/oneapi/mpi/2021.12/opt/mpi/libfabric/lib:/opt/intel/oneapi/mpi/2021.12/lib:/opt/intel/oneapi/mkl/2024.1/lib:/opt/intel/oneapi/ippcp/2021.11/lib/:/opt/intel/oneapi/ipp/2021.11/lib:/opt/intel/oneapi/dpl/2022.5/lib:/opt/intel/oneapi/dnnl/2024.1/lib:/opt/intel/oneapi/debugger/2024.1/opt/debugger/lib:/opt/intel/oneapi/dal/2024.2/lib:/opt/intel/oneapi/compiler/2024.1/opt/oclfpga/host/linux64/lib:/opt/intel/oneapi/compiler/2024.1/opt/compiler/lib:/opt/intel/oneapi/compiler/2024.1/lib:/opt/intel/oneapi/ccl/2021.12/lib/
COPY ./payload-1024.lua /llm/
COPY ./start-vllm-service.sh /llm/
COPY ./benchmark_vllm_throughput.py /llm/
COPY ./benchmark_vllm_latency.py /llm/
COPY ./start-pp_serving-service.sh /llm/

WORKDIR /llm/
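
For reference, a minimal sketch of how this two-stage image might be built from `docker/llm/serving/xpu/docker` — the tag is illustrative, and the proxy build args (declared as `ARG` in the Dockerfile) are only needed behind a proxy:

```bash
cd docker/llm/serving/xpu/docker
docker build \
  --build-arg http_proxy=$HTTP_PROXY \
  --build-arg https_proxy=$HTTPS_PROXY \
  -t intelanalytics/ipex-llm-serving-xpu:latest \
  -f Dockerfile .
```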
82 changes: 1 addition & 81 deletions docker/llm/serving/xpu/docker/README.md
@@ -18,7 +18,7 @@ To map the `xpu` into the container, you need to specify `--device=/dev/dri` whe
An example could be:
```bash
#!/bin/bash
export DOCKER_IMAGE=intelanalytics/ipex-llm-serving-xpu:2.2.0-SNAPSHOT
export DOCKER_IMAGE=intelanalytics/ipex-llm-serving-xpu:latest

sudo docker run -itd \
--net=host \
@@ -59,86 +59,6 @@ To run Pipeline parallel serving using `IPEX-LLM` as backend, you can refer to t
For convenience, we have included a file `/llm/start-pp_serving-service.sh` in the image.
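
A minimal sketch of invoking it from inside a running container — any required environment variables are described in the referenced guide:

```bash
# Inside the container; see the pipeline-parallel guide for configurable variables.
bash /llm/start-pp_serving-service.sh
```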


#### FastChat serving engine

To set up model serving with FastChat using `IPEX-LLM` as the backend, you can refer to this [quickstart](https://ipex-llm.readthedocs.io/en/latest/doc/LLM/Quickstart/fastchat_quickstart.html#) or follow these quick steps to deploy a demo.

##### Quick Setup for FastChat with IPEX-LLM

1. **Start the Docker Container**

Run the following command to launch a Docker container with device access:

```bash
#!/bin/bash
export DOCKER_IMAGE=intelanalytics/ipex-llm-serving-xpu:latest

# Map the host model directory into the container with -v;
# the proxy variables are only needed when running behind a proxy.
sudo docker run -itd \
        --net=host \
        --device=/dev/dri \
        --name=demo-container \
        -v /LLM_MODELS/:/llm/models/ \
        --shm-size="16g" \
        -e http_proxy=... \
        -e https_proxy=... \
        -e no_proxy="127.0.0.1,localhost" \
        $DOCKER_IMAGE
```
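
   Then attach a shell to the running container — a minimal sketch, assuming the container name used above:

   ```bash
   sudo docker exec -it demo-container bash
   ```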

2. **Start the FastChat Service**

Enter the container and start the FastChat service:
```bash
#!/bin/bash

# This script assumes that the host model directory has been mapped into
# the container at /llm/models/.
# We use Yi-1.5-34B as an example; replace it with your own model.

ps -ef | grep "fastchat" | awk '{print $2}' | xargs kill -9
pip install -U gradio==4.43.0

# start controller
python -m fastchat.serve.controller &

export USE_XETLA=OFF
export SYCL_PI_LEVEL_ZERO_USE_IMMEDIATE_COMMANDLISTS=2

export TORCH_LLM_ALLREDUCE=0
export CCL_DG2_ALLREDUCE=1
# Environment variables needed by CCL
export CCL_WORKER_COUNT=4
# pin ccl worker to cores
# export CCL_WORKER_AFFINITY=32,33,34,35
export FI_PROVIDER=shm
export CCL_ATL_TRANSPORT=ofi
export CCL_ZE_IPC_EXCHANGE=sockets
export CCL_ATL_SHM=1

source /opt/intel/1ccl-wks/setvars.sh

python -m ipex_llm.serving.fastchat.vllm_worker \
--model-path /llm/models/Yi-1.5-34B \
--device xpu \
--enforce-eager \
--disable-async-output-proc \
--distributed-executor-backend ray \
--dtype float16 \
--load-in-low-bit fp8 \
--tensor-parallel-size 4 \
--gpu-memory-utilization 0.9 \
--max-model-len 4096 \
--max-num-batched-tokens 8000 &

sleep 120

python -m fastchat.serve.gradio_web_server &
```
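
To verify the demo is up, you can probe the web UI — a minimal sketch, assuming `gradio_web_server` listens on Gradio's default port 7860 (an assumption; adjust if you pass a different port):

```bash
# Assumes the default Gradio port 7860; prints the HTTP status code.
curl -s -o /dev/null -w "%{http_code}\n" http://localhost:7860
```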

This quick setup allows you to deploy FastChat with IPEX-LLM efficiently.

#### vLLM serving engine

To run the vLLM engine using `IPEX-LLM` as the backend, you can refer to this [document](https://github.com/intel-analytics/ipex-llm/blob/main/docs/mddocs/DockerGuides/vllm_docker_quickstart.md).
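
The image also ships `/llm/start-vllm-service.sh` (copied in the Dockerfile above); a minimal sketch of launching it inside the container — consult the linked quickstart for the model path and other variables the script expects:

```bash
# Inside the container; see the vLLM quickstart for required variables.
bash /llm/start-vllm-service.sh
```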