Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
37 changes: 37 additions & 0 deletions visual-ai/Wan2.2/README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,37 @@
# Docker setup

Build docker image:

```bash
bash build.sh
```

Run docker image:

```bash
export DOCKER_IMAGE=llm-scaler-visualai:latest-wan2.2
export CONTAINER_NAME=wan-2.2
export MODEL_DIR=<your_model_dir>
sudo docker run -itd \
--privileged \
--net=host \
--device=/dev/dri \
-e no_proxy=localhost,127.0.0.1 \
--name=$CONTAINER_NAME \
-v $MODEL_DIR:/llm/models/ \
--shm-size="16g" \
--entrypoint=/bin/bash \
$DOCKER_IMAGE

docker exec -it $CONTAINER_NAME bash
```

Run the Wan 2.2 demo on a single B60 GPU:
```bash
python3 generate.py --task ti2v-5B --size 1280*704 --ckpt_dir /llm/models/Wan2.2-TI2V-5B/ --offload_model True --t5_cpu --prompt "Two anthropomorphic cats in comfy boxing gear and bright gloves fight intensely on a spotlighted stage." --convert_model_dtype --frame_num 101 --sample_steps 50
```

Run Wan 2.2 demo on 2 * B60 GPUs:
```bash
torchrun --nproc_per_node=2 generate.py --task ti2v-5B --size 1280*704 --ckpt_dir /llm/models/Wan2.2-TI2V-5B/ --ulysses_size 2 --prompt "Two anthropomorphic cats in comfy boxing gear and bright gloves fight intensely on a spotlighted stage." --offload_model True --t5_cpu --convert_model_dtype --frame_num 101 --sample_steps 50
```
6 changes: 6 additions & 0 deletions visual-ai/Wan2.2/build.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@
#!/bin/bash
# Build the Wan2.2 visual-AI Docker image.
# Usage: edit the proxy placeholders below (or pre-set them in your
# environment), then run: bash build.sh
set -x

# Replace with your proxy endpoints if building behind a corporate proxy;
# they are forwarded into the image build via --build-arg.
export HTTP_PROXY=<your_http_proxy>
export HTTPS_PROXY=<your_https_proxy>

# Tag must match the image name the README tells users to run
# (llm-scaler-visualai:latest-wan2.2). The previous "latest-wan2.1" tag
# was a copy/paste leftover and broke the documented run command.
docker build -f ./docker/Dockerfile . -t llm-scaler-visualai:latest-wan2.2 --build-arg https_proxy=$HTTPS_PROXY --build-arg http_proxy=$HTTP_PROXY
109 changes: 109 additions & 0 deletions visual-ai/Wan2.2/docker/Dockerfile
Original file line number Diff line number Diff line change
@@ -0,0 +1,109 @@
# Copyright (C) 2025 Intel Corporation
# SPDX-License-Identifier: Apache-2.0

# ======== Base Stage ========
# oneAPI 2025.0.2 devel image on Ubuntu 24.04. The stage alias "vllm-base"
# appears to be kept for naming consistency with sibling images; nothing in
# this file derives FROM it.
FROM intel/deep-learning-essentials:2025.0.2-0-devel-ubuntu24.04 AS vllm-base

# Proxy settings forwarded from `docker build --build-arg ...` (see build.sh).
ARG https_proxy
ARG http_proxy

# Add Intel oneAPI repo and PPA for GPU support
# The oneAPI apt repo supplies intel-oneapi-* packages; the kobuk-team
# "intel-graphics-testing" PPA supplies the Level Zero / OpenCL GPU stack.
RUN wget -O- https://apt.repos.intel.com/intel-gpg-keys/GPG-PUB-KEY-INTEL-SW-PRODUCTS.PUB | gpg --dearmor | tee /usr/share/keyrings/oneapi-archive-keyring.gpg > /dev/null && \
    echo "deb [signed-by=/usr/share/keyrings/oneapi-archive-keyring.gpg] https://apt.repos.intel.com/oneapi all main" | tee /etc/apt/sources.list.d/oneAPI.list && \
    add-apt-repository -y ppa:kobuk-team/intel-graphics-testing

# Install dependencies and Python 3.10
# Ubuntu 24.04 ships Python 3.12 by default; the deadsnakes PPA provides
# 3.10, which is then made the system "python3" via update-alternatives and
# given pip through the upstream get-pip bootstrap.
RUN apt-get update -y && \
    apt-get install -y software-properties-common && \
    add-apt-repository ppa:deadsnakes/ppa && \
    apt-get update -y && \
    apt-get install -y python3.10 python3.10-distutils python3.10-dev && \
    curl -sS https://bootstrap.pypa.io/get-pip.py | python3.10 && \
    apt-get install -y --no-install-recommends --fix-missing \
    curl \
    ffmpeg \
    git \
    libsndfile1 \
    libsm6 \
    libxext6 \
    libgl1 \
    lsb-release \
    numactl \
    wget \
    vim \
    linux-libc-dev && \
    update-alternatives --install /usr/bin/python3 python3 /usr/bin/python3.10 1 && \
    # Install Intel GPU runtime packages
    apt-get update -y && \
    apt-get install -y libze1 libze-dev libze-intel-gpu1 intel-opencl-icd libze-intel-gpu-raytracing && \
    apt-get install -y intel-oneapi-dpcpp-ct=2025.0.1-17 && \
    apt-get clean && rm -rf /var/lib/apt/lists/*

# pin compute runtime version
# Overwrites the PPA-installed runtime with exact IGC 2.12.5 and compute
# runtime 25.22.33944.8 builds from GitHub releases; `dpkg -i *.deb` installs
# everything downloaded above (the .ddeb debug-symbol files are fetched but
# not installed by that glob).
RUN mkdir /tmp/neo && \
    cd /tmp/neo && \
    wget https://github.com/intel/intel-graphics-compiler/releases/download/v2.12.5/intel-igc-core-2_2.12.5+19302_amd64.deb && \
    wget https://github.com/intel/intel-graphics-compiler/releases/download/v2.12.5/intel-igc-opencl-2_2.12.5+19302_amd64.deb && \
    wget https://github.com/intel/compute-runtime/releases/download/25.22.33944.8/intel-ocloc-dbgsym_25.22.33944.8-0_amd64.ddeb && \
    wget https://github.com/intel/compute-runtime/releases/download/25.22.33944.8/intel-ocloc_25.22.33944.8-0_amd64.deb && \
    wget https://github.com/intel/compute-runtime/releases/download/25.22.33944.8/intel-opencl-icd-dbgsym_25.22.33944.8-0_amd64.ddeb && \
    wget https://github.com/intel/compute-runtime/releases/download/25.22.33944.8/intel-opencl-icd_25.22.33944.8-0_amd64.deb && \
    wget https://github.com/intel/compute-runtime/releases/download/25.22.33944.8/libigdgmm12_22.7.0_amd64.deb && \
    wget https://github.com/intel/compute-runtime/releases/download/25.22.33944.8/libze-intel-gpu1-dbgsym_25.22.33944.8-0_amd64.ddeb && \
    wget https://github.com/intel/compute-runtime/releases/download/25.22.33944.8/libze-intel-gpu1_25.22.33944.8-0_amd64.deb && \
    dpkg -i *.deb

WORKDIR /llm
# Patches consumed later in this file: wan22_for_multi_arc.patch is applied
# to the Wan2.2 checkout, 0001-oneccl-align-global-V0.1.1.patch to oneCCL.
COPY ./patches/wan22_for_multi_arc.patch /tmp/
COPY ./patches/0001-oneccl-align-global-V0.1.1.patch /tmp/

# Set environment variables early
ENV LD_LIBRARY_PATH="$LD_LIBRARY_PATH:/usr/local/lib/"

# ======= Add oneCCL build =======
RUN apt-get update && apt-get install -y \
    cmake \
    g++ \
    && rm -rf /var/lib/apt/lists/*

# Build 1ccl
# Builds oneCCL at a pinned commit with the bundled patch applied, then
# installs the result as /opt/intel/oneapi/ccl/2021.15.3 and repoints the
# "latest" symlink so setvars.sh picks up this custom build instead of the
# image's stock oneCCL. CMAKE_INSTALL_PREFIX=_install stages the install
# tree inside the build dir before the mv.
RUN git clone https://github.com/oneapi-src/oneCCL.git && \
    cd oneCCL && \
    git checkout def870543749186b6f38cdc865b44d52174c7492 && \
    git apply /tmp/0001-oneccl-align-global-V0.1.1.patch && \
    mkdir build && cd build && \
    export IGC_VISAOptions=-activeThreadsOnlyBarrier && \
    /usr/bin/cmake .. \
    -DCMAKE_INSTALL_PREFIX=_install \
    -DCMAKE_C_COMPILER=icx \
    -DCMAKE_CXX_COMPILER=icpx \
    -DCOMPUTE_BACKEND=dpcpp \
    -DCCL_ENABLE_ARCB=1 && \
    make -j && make install && \
    mv _install /opt/intel/oneapi/ccl/2021.15.3 && \
    cd /opt/intel/oneapi/ccl/ && \
    ln -snf 2021.15.3 latest

# Configure environment to source oneAPI
# Interactive shells get the full oneAPI environment (compilers, oneCCL).
RUN echo "source /opt/intel/oneapi/setvars.sh --force" >> /root/.bashrc

SHELL ["bash", "-c"]
# Default command re-sources .bashrc so the oneAPI env is active, then
# hands over to an interactive bash.
CMD ["bash", "-c", "source /root/.bashrc && exec bash"]

ENV LD_LIBRARY_PATH="/usr/local/lib:/usr/local/lib/python3.10/dist-packages/torch/lib:$LD_LIBRARY_PATH"

# PyTorch 2.7.0 XPU wheels plus matching IPEX and oneCCL bindings.
# python3-blinker is removed because the distro package conflicts with the
# pip-installed copy pulled in by the stack above — TODO confirm.
RUN pip install torch==2.7.0 torchvision==0.22.0 torchaudio==2.7.0 --index-url https://download.pytorch.org/whl/xpu && \
    pip install intel-extension-for-pytorch==2.7.10+xpu oneccl_bind_pt==2.7.0+xpu --extra-index-url https://pytorch-extension.intel.com/release-whl/stable/xpu/cn/ && \
    pip install bigdl-core-xe-all==2.6.0 --extra-index-url https://download.pytorch.org/whl/xpu && \
    apt remove python3-blinker -y

# Wan2.2 at a pinned commit, with the multi-Arc patch applied so it runs on
# B60 GPUs (see README for the generate.py invocations).
RUN cd /llm && \
    git clone https://github.com/Wan-Video/Wan2.2.git && \
    cd ./Wan2.2 && \
    git checkout 031a9be56cec91e86d140d3d3a74280fb05a9b1c && \
    git apply /tmp/wan22_for_multi_arc.patch && \
    pip install -r requirements.txt && \
    pip install einops && \
    pip install cffi

WORKDIR /llm/Wan2.2
125 changes: 125 additions & 0 deletions visual-ai/Wan2.2/patches/0001-oneccl-align-global-V0.1.1.patch
Original file line number Diff line number Diff line change
@@ -0,0 +1,125 @@
From 7f7a3d65541828d9889bfdec799bc23339e8e520 Mon Sep 17 00:00:00 2001
From: YongZhuIntel <yong.zhu@intel.com>
Date: Wed, 21 May 2025 09:37:06 +0800
Subject: [PATCH] oneccl align global V0.1.1

base on public branch release/ccl_2021.15.3-arc(def870543749186b6f38cdc865b44d52174c7492)

Build:
1. mkdir build; cd build
2. source /opt/intel/oneapi/setvars.sh
3. export IGC_VISAOptions=-activeThreadsOnlyBarrier
4. cmake .. -DCMAKE_INSTALL_PREFIX=_install -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx -DCOMPUTE_BACKEND=dpcpp -DCCL_ENABLE_ARCB=1 && make -j && make install

print bandwidth in benchmark
---
examples/benchmark/include/benchmark.hpp | 40 +++++++++++++++++++++---
examples/benchmark/src/benchmark.cpp | 7 +++--
2 files changed, 41 insertions(+), 6 deletions(-)

diff --git a/examples/benchmark/include/benchmark.hpp b/examples/benchmark/include/benchmark.hpp
index 08a3625..bff6275 100644
--- a/examples/benchmark/include/benchmark.hpp
+++ b/examples/benchmark/include/benchmark.hpp
@@ -377,7 +377,9 @@ void store_to_csv(const user_options_t& options,
double max_time,
double avg_time,
double stddev,
- double wait_avg_time) {
+ double wait_avg_time,
+ double algbw,
+ double busbw) {
std::ofstream csvf;
csvf.open(options.csv_filepath, std::ofstream::out | std::ofstream::app);

@@ -396,7 +398,7 @@ void store_to_csv(const user_options_t& options,
<< "," << ccl::get_datatype_size(dtype) << "," << elem_count << ","
<< ccl::get_datatype_size(dtype) * elem_count << "," << buf_count << ","
<< iter_count << "," << min_time << "," << max_time << "," << avg_time << ","
- << stddev << "," << wait_avg_time << std::endl;
+ << stddev << "," << wait_avg_time << "," << algbw << "," << busbw << std::endl;
}
csvf.close();
}
@@ -472,13 +474,41 @@ void print_timings(const ccl::communicator& comm,
max_time /= iter_count;

size_t bytes = elem_count * ccl::get_datatype_size(dtype) * buf_count;
+
+ double algbw = bytes*1000/total_avg_time/1024/1024;
+
+ if (ncolls == 1) {
+ if (options.coll_names.front() == "allgather" ||
+ options.coll_names.front() == "allgatherv" ||
+ options.coll_names.front() == "reducescatter" ||
+ options.coll_names.front() == "alltoall" ||
+ options.coll_names.front() == "alltoallv") {
+ algbw = algbw * nranks;
+ }
+ }
+
+ double busbw = algbw;
+ if (ncolls == 1) {
+ if (options.coll_names.front() == "allreduce") {
+ busbw = algbw * 2 * (nranks -1) / nranks;
+ } else if (options.coll_names.front() == "allgather" ||
+ options.coll_names.front() == "allgatherv" ||
+ options.coll_names.front() == "reducescatter" ||
+ options.coll_names.front() == "alltoall" ||
+ options.coll_names.front() == "alltoallv") {
+ busbw = algbw * (nranks -1) / nranks;
+ }
+ }
+
std::stringstream ss;
ss << std::right << std::fixed << std::setw(COL_WIDTH) << bytes << std::setw(COL_WIDTH)
<< elem_count * buf_count << std::setw(COL_WIDTH) << iter_count << std::setw(COL_WIDTH)
<< std::setprecision(COL_PRECISION) << min_time << std::setw(COL_WIDTH)
<< std::setprecision(COL_PRECISION) << max_time << std::setw(COL_WIDTH)
<< std::setprecision(COL_PRECISION) << total_avg_time << std::setw(COL_WIDTH - 3)
- << std::setprecision(COL_PRECISION) << stddev << std::setw(COL_WIDTH + 3);
+ << std::setprecision(COL_PRECISION) << stddev << std::setw(COL_WIDTH)
+ << std::setprecision(COL_PRECISION) << algbw << std::setw(COL_WIDTH)
+ << std::setprecision(COL_PRECISION) << busbw << std::setw(COL_WIDTH + 3);

if (show_extened_info(options.show_additional_info)) {
ss << std::right << std::fixed << std::setprecision(COL_PRECISION) << wait_avg_time;
@@ -497,7 +527,9 @@ void print_timings(const ccl::communicator& comm,
max_time,
total_avg_time,
stddev,
- wait_avg_time);
+ wait_avg_time,
+ algbw,
+ busbw);
}
}

diff --git a/examples/benchmark/src/benchmark.cpp b/examples/benchmark/src/benchmark.cpp
index d90fb9b..78957f2 100644
--- a/examples/benchmark/src/benchmark.cpp
+++ b/examples/benchmark/src/benchmark.cpp
@@ -105,7 +105,8 @@ void run(ccl::communicator& service_comm,
<< "#elem_count" << std::setw(COL_WIDTH) << "#repetitions"
<< std::setw(COL_WIDTH) << "t_min[usec]" << std::setw(COL_WIDTH) << "t_max[usec]"
<< std::setw(COL_WIDTH) << "t_avg[usec]" << std::setw(COL_WIDTH - 3)
- << "stddev[%]";
+ << "stddev[%]" << std::setw(COL_WIDTH) << "algbw[GB/s]" << std::setw(COL_WIDTH)
+ << "busbw[GB/s]";

if (show_extened_info(options.show_additional_info)) {
ss << std::right << std::setw(COL_WIDTH + 3) << "wait_t_avg[usec]";
@@ -435,7 +436,9 @@ int main(int argc, char* argv[]) {
<< "t_max[usec],"
<< "t_avg[usec],"
<< "stddev[%],"
- << "wait_t_avg[usec]" << std::endl;
+ << "wait_t_avg[usec],"
+ << "algbw[GB/s],"
+ << "busbw[GB/s]" << std::endl;
csvf.close();
}

--
2.25.1

Loading