Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
37 changes: 37 additions & 0 deletions visual-ai/Wan2.2/README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,37 @@
# Docker setup

Build docker image:

```bash
bash build.sh
```

Run docker image:

```bash
export DOCKER_IMAGE=llm-scaler-visualai:latest-wan2.2
export CONTAINER_NAME=wan-2.2
export MODEL_DIR=<your_model_dir>
sudo docker run -itd \
--privileged \
--net=host \
--device=/dev/dri \
-e no_proxy=localhost,127.0.0.1 \
--name=$CONTAINER_NAME \
-v $MODEL_DIR:/llm/models/ \
--shm-size="16g" \
--entrypoint=/bin/bash \
$DOCKER_IMAGE

docker exec -it $CONTAINER_NAME bash
```

Run the Wan 2.2 demo on a single B60 GPU:
```bash
python3 generate.py --task ti2v-5B --size 1280*704 --ckpt_dir /llm/models/Wan2.2-TI2V-5B/ --offload_model True --t5_cpu --prompt "Two anthropomorphic cats in comfy boxing gear and bright gloves fight intensely on a spotlighted stage." --convert_model_dtype --frame_num 101 --sample_steps 50
```

Run Wan 2.2 demo on 2 * B60 GPUs:
```bash
torchrun --nproc_per_node=2 generate.py --task ti2v-5B --size 1280*704 --ckpt_dir /llm/models/Wan2.2-TI2V-5B/ --ulysses_size 2 --prompt "Two anthropomorphic cats in comfy boxing gear and bright gloves fight intensely on a spotlighted stage." --offload_model True --t5_cpu --convert_model_dtype --frame_num 101 --sample_steps 50
```
6 changes: 6 additions & 0 deletions visual-ai/Wan2.2/build.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@
#!/bin/bash
# Build the Wan2.2 visual-AI Docker image.
# Usage: edit the proxy placeholders below (or pre-set them in your
# environment), then run: bash build.sh
set -x

# Replace with your proxy endpoints if building behind a corporate proxy;
# they are forwarded into the image build via --build-arg.
export HTTP_PROXY=<your_http_proxy>
export HTTPS_PROXY=<your_https_proxy>

# Tag must match the image name the README tells users to run
# (llm-scaler-visualai:latest-wan2.2). The previous "latest-wan2.1" tag
# was a copy/paste leftover and broke the documented run command.
docker build -f ./docker/Dockerfile . -t llm-scaler-visualai:latest-wan2.2 --build-arg https_proxy=$HTTPS_PROXY --build-arg http_proxy=$HTTP_PROXY
109 changes: 109 additions & 0 deletions visual-ai/Wan2.2/docker/Dockerfile
Original file line number Diff line number Diff line change
@@ -0,0 +1,109 @@
# Copyright (C) 2025 Intel Corporation
# SPDX-License-Identifier: Apache-2.0

# ======== Base Stage ========
# oneAPI 2025.0.2 devel image on Ubuntu 24.04. The stage alias "vllm-base"
# appears to be kept for naming consistency with sibling images; nothing in
# this file derives FROM it.
FROM intel/deep-learning-essentials:2025.0.2-0-devel-ubuntu24.04 AS vllm-base

# Proxy settings forwarded from `docker build --build-arg ...` (see build.sh).
ARG https_proxy
ARG http_proxy

# Add Intel oneAPI repo and PPA for GPU support
# The oneAPI apt repo supplies intel-oneapi-* packages; the kobuk-team
# "intel-graphics-testing" PPA supplies the Level Zero / OpenCL GPU stack.
RUN wget -O- https://apt.repos.intel.com/intel-gpg-keys/GPG-PUB-KEY-INTEL-SW-PRODUCTS.PUB | gpg --dearmor | tee /usr/share/keyrings/oneapi-archive-keyring.gpg > /dev/null && \
    echo "deb [signed-by=/usr/share/keyrings/oneapi-archive-keyring.gpg] https://apt.repos.intel.com/oneapi all main" | tee /etc/apt/sources.list.d/oneAPI.list && \
    add-apt-repository -y ppa:kobuk-team/intel-graphics-testing

# Install dependencies and Python 3.10
# Ubuntu 24.04 ships Python 3.12 by default; the deadsnakes PPA provides
# 3.10, which is then made the system "python3" via update-alternatives and
# given pip through the upstream get-pip bootstrap.
RUN apt-get update -y && \
    apt-get install -y software-properties-common && \
    add-apt-repository ppa:deadsnakes/ppa && \
    apt-get update -y && \
    apt-get install -y python3.10 python3.10-distutils python3.10-dev && \
    curl -sS https://bootstrap.pypa.io/get-pip.py | python3.10 && \
    apt-get install -y --no-install-recommends --fix-missing \
    curl \
    ffmpeg \
    git \
    libsndfile1 \
    libsm6 \
    libxext6 \
    libgl1 \
    lsb-release \
    numactl \
    wget \
    vim \
    linux-libc-dev && \
    update-alternatives --install /usr/bin/python3 python3 /usr/bin/python3.10 1 && \
    # Install Intel GPU runtime packages
    apt-get update -y && \
    apt-get install -y libze1 libze-dev libze-intel-gpu1 intel-opencl-icd libze-intel-gpu-raytracing && \
    apt-get install -y intel-oneapi-dpcpp-ct=2025.0.1-17 && \
    apt-get clean && rm -rf /var/lib/apt/lists/*

# pin compute runtime version
# Overwrites the PPA-installed runtime with exact IGC 2.12.5 and compute
# runtime 25.22.33944.8 builds from GitHub releases; `dpkg -i *.deb` installs
# everything downloaded above (the .ddeb debug-symbol files are fetched but
# not installed by that glob).
RUN mkdir /tmp/neo && \
    cd /tmp/neo && \
    wget https://github.com/intel/intel-graphics-compiler/releases/download/v2.12.5/intel-igc-core-2_2.12.5+19302_amd64.deb && \
    wget https://github.com/intel/intel-graphics-compiler/releases/download/v2.12.5/intel-igc-opencl-2_2.12.5+19302_amd64.deb && \
    wget https://github.com/intel/compute-runtime/releases/download/25.22.33944.8/intel-ocloc-dbgsym_25.22.33944.8-0_amd64.ddeb && \
    wget https://github.com/intel/compute-runtime/releases/download/25.22.33944.8/intel-ocloc_25.22.33944.8-0_amd64.deb && \
    wget https://github.com/intel/compute-runtime/releases/download/25.22.33944.8/intel-opencl-icd-dbgsym_25.22.33944.8-0_amd64.ddeb && \
    wget https://github.com/intel/compute-runtime/releases/download/25.22.33944.8/intel-opencl-icd_25.22.33944.8-0_amd64.deb && \
    wget https://github.com/intel/compute-runtime/releases/download/25.22.33944.8/libigdgmm12_22.7.0_amd64.deb && \
    wget https://github.com/intel/compute-runtime/releases/download/25.22.33944.8/libze-intel-gpu1-dbgsym_25.22.33944.8-0_amd64.ddeb && \
    wget https://github.com/intel/compute-runtime/releases/download/25.22.33944.8/libze-intel-gpu1_25.22.33944.8-0_amd64.deb && \
    dpkg -i *.deb

WORKDIR /llm
# Patches consumed later in this file: wan22_for_multi_arc.patch is applied
# to the Wan2.2 checkout, 0001-oneccl-align-global-V0.1.1.patch to oneCCL.
COPY ./patches/wan22_for_multi_arc.patch /tmp/
COPY ./patches/0001-oneccl-align-global-V0.1.1.patch /tmp/

# Set environment variables early
ENV LD_LIBRARY_PATH="$LD_LIBRARY_PATH:/usr/local/lib/"

# ======= Add oneCCL build =======
RUN apt-get update && apt-get install -y \
    cmake \
    g++ \
    && rm -rf /var/lib/apt/lists/*

# Build 1ccl
# Builds oneCCL at a pinned commit with the bundled patch applied, then
# installs the result as /opt/intel/oneapi/ccl/2021.15.3 and repoints the
# "latest" symlink so setvars.sh picks up this custom build instead of the
# image's stock oneCCL. CMAKE_INSTALL_PREFIX=_install stages the install
# tree inside the build dir before the mv.
RUN git clone https://github.com/oneapi-src/oneCCL.git && \
    cd oneCCL && \
    git checkout def870543749186b6f38cdc865b44d52174c7492 && \
    git apply /tmp/0001-oneccl-align-global-V0.1.1.patch && \
    mkdir build && cd build && \
    export IGC_VISAOptions=-activeThreadsOnlyBarrier && \
    /usr/bin/cmake .. \
    -DCMAKE_INSTALL_PREFIX=_install \
    -DCMAKE_C_COMPILER=icx \
    -DCMAKE_CXX_COMPILER=icpx \
    -DCOMPUTE_BACKEND=dpcpp \
    -DCCL_ENABLE_ARCB=1 && \
    make -j && make install && \
    mv _install /opt/intel/oneapi/ccl/2021.15.3 && \
    cd /opt/intel/oneapi/ccl/ && \
    ln -snf 2021.15.3 latest

# Configure environment to source oneAPI
# Interactive shells get the full oneAPI environment (compilers, oneCCL).
RUN echo "source /opt/intel/oneapi/setvars.sh --force" >> /root/.bashrc

SHELL ["bash", "-c"]
# Default command re-sources .bashrc so the oneAPI env is active, then
# hands over to an interactive bash.
CMD ["bash", "-c", "source /root/.bashrc && exec bash"]

ENV LD_LIBRARY_PATH="/usr/local/lib:/usr/local/lib/python3.10/dist-packages/torch/lib:$LD_LIBRARY_PATH"

# PyTorch 2.7.0 XPU wheels plus matching IPEX and oneCCL bindings.
# python3-blinker is removed because the distro package conflicts with the
# pip-installed copy pulled in by the stack above — TODO confirm.
RUN pip install torch==2.7.0 torchvision==0.22.0 torchaudio==2.7.0 --index-url https://download.pytorch.org/whl/xpu && \
    pip install intel-extension-for-pytorch==2.7.10+xpu oneccl_bind_pt==2.7.0+xpu --extra-index-url https://pytorch-extension.intel.com/release-whl/stable/xpu/cn/ && \
    pip install bigdl-core-xe-all==2.6.0 --extra-index-url https://download.pytorch.org/whl/xpu && \
    apt remove python3-blinker -y

# Wan2.2 at a pinned commit, with the multi-Arc patch applied so it runs on
# B60 GPUs (see README for the generate.py invocations).
RUN cd /llm && \
    git clone https://github.com/Wan-Video/Wan2.2.git && \
    cd ./Wan2.2 && \
    git checkout 031a9be56cec91e86d140d3d3a74280fb05a9b1c && \
    git apply /tmp/wan22_for_multi_arc.patch && \
    pip install -r requirements.txt && \
    pip install einops && \
    pip install cffi

WORKDIR /llm/Wan2.2
125 changes: 125 additions & 0 deletions visual-ai/Wan2.2/patches/0001-oneccl-align-global-V0.1.1.patch
Original file line number Diff line number Diff line change
@@ -0,0 +1,125 @@
From 7f7a3d65541828d9889bfdec799bc23339e8e520 Mon Sep 17 00:00:00 2001
From: YongZhuIntel <yong.zhu@intel.com>
Date: Wed, 21 May 2025 09:37:06 +0800
Subject: [PATCH] oneccl align global V0.1.1

base on public branch release/ccl_2021.15.3-arc(def870543749186b6f38cdc865b44d52174c7492)

Build:
1. mkdir build; cd build
2. source /opt/intel/oneapi/setvars.sh
3. export IGC_VISAOptions=-activeThreadsOnlyBarrier
4. cmake .. -DCMAKE_INSTALL_PREFIX=_install -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx -DCOMPUTE_BACKEND=dpcpp -DCCL_ENABLE_ARCB=1 && make -j && make install

print bandwidth in benchmark
---
examples/benchmark/include/benchmark.hpp | 40 +++++++++++++++++++++---
examples/benchmark/src/benchmark.cpp | 7 +++--
2 files changed, 41 insertions(+), 6 deletions(-)

diff --git a/examples/benchmark/include/benchmark.hpp b/examples/benchmark/include/benchmark.hpp
index 08a3625..bff6275 100644
--- a/examples/benchmark/include/benchmark.hpp
+++ b/examples/benchmark/include/benchmark.hpp
@@ -377,7 +377,9 @@ void store_to_csv(const user_options_t& options,
double max_time,
double avg_time,
double stddev,
- double wait_avg_time) {
+ double wait_avg_time,
+ double algbw,
+ double busbw) {
std::ofstream csvf;
csvf.open(options.csv_filepath, std::ofstream::out | std::ofstream::app);

@@ -396,7 +398,7 @@ void store_to_csv(const user_options_t& options,
<< "," << ccl::get_datatype_size(dtype) << "," << elem_count << ","
<< ccl::get_datatype_size(dtype) * elem_count << "," << buf_count << ","
<< iter_count << "," << min_time << "," << max_time << "," << avg_time << ","
- << stddev << "," << wait_avg_time << std::endl;
+ << stddev << "," << wait_avg_time << "," << algbw << "," << busbw << std::endl;
}
csvf.close();
}
@@ -472,13 +474,41 @@ void print_timings(const ccl::communicator& comm,
max_time /= iter_count;

size_t bytes = elem_count * ccl::get_datatype_size(dtype) * buf_count;
+
+ double algbw = bytes*1000/total_avg_time/1024/1024;
+
+ if (ncolls == 1) {
+ if (options.coll_names.front() == "allgather" ||
+ options.coll_names.front() == "allgatherv" ||
+ options.coll_names.front() == "reducescatter" ||
+ options.coll_names.front() == "alltoall" ||
+ options.coll_names.front() == "alltoallv") {
+ algbw = algbw * nranks;
+ }
+ }
+
+ double busbw = algbw;
+ if (ncolls == 1) {
+ if (options.coll_names.front() == "allreduce") {
+ busbw = algbw * 2 * (nranks -1) / nranks;
+ } else if (options.coll_names.front() == "allgather" ||
+ options.coll_names.front() == "allgatherv" ||
+ options.coll_names.front() == "reducescatter" ||
+ options.coll_names.front() == "alltoall" ||
+ options.coll_names.front() == "alltoallv") {
+ busbw = algbw * (nranks -1) / nranks;
+ }
+ }
+
std::stringstream ss;
ss << std::right << std::fixed << std::setw(COL_WIDTH) << bytes << std::setw(COL_WIDTH)
<< elem_count * buf_count << std::setw(COL_WIDTH) << iter_count << std::setw(COL_WIDTH)
<< std::setprecision(COL_PRECISION) << min_time << std::setw(COL_WIDTH)
<< std::setprecision(COL_PRECISION) << max_time << std::setw(COL_WIDTH)
<< std::setprecision(COL_PRECISION) << total_avg_time << std::setw(COL_WIDTH - 3)
- << std::setprecision(COL_PRECISION) << stddev << std::setw(COL_WIDTH + 3);
+ << std::setprecision(COL_PRECISION) << stddev << std::setw(COL_WIDTH)
+ << std::setprecision(COL_PRECISION) << algbw << std::setw(COL_WIDTH)
+ << std::setprecision(COL_PRECISION) << busbw << std::setw(COL_WIDTH + 3);

if (show_extened_info(options.show_additional_info)) {
ss << std::right << std::fixed << std::setprecision(COL_PRECISION) << wait_avg_time;
@@ -497,7 +527,9 @@ void print_timings(const ccl::communicator& comm,
max_time,
total_avg_time,
stddev,
- wait_avg_time);
+ wait_avg_time,
+ algbw,
+ busbw);
}
}

diff --git a/examples/benchmark/src/benchmark.cpp b/examples/benchmark/src/benchmark.cpp
index d90fb9b..78957f2 100644
--- a/examples/benchmark/src/benchmark.cpp
+++ b/examples/benchmark/src/benchmark.cpp
@@ -105,7 +105,8 @@ void run(ccl::communicator& service_comm,
<< "#elem_count" << std::setw(COL_WIDTH) << "#repetitions"
<< std::setw(COL_WIDTH) << "t_min[usec]" << std::setw(COL_WIDTH) << "t_max[usec]"
<< std::setw(COL_WIDTH) << "t_avg[usec]" << std::setw(COL_WIDTH - 3)
- << "stddev[%]";
+ << "stddev[%]" << std::setw(COL_WIDTH) << "algbw[GB/s]" << std::setw(COL_WIDTH)
+ << "busbw[GB/s]";

if (show_extened_info(options.show_additional_info)) {
ss << std::right << std::setw(COL_WIDTH + 3) << "wait_t_avg[usec]";
@@ -435,7 +436,9 @@ int main(int argc, char* argv[]) {
<< "t_max[usec],"
<< "t_avg[usec],"
<< "stddev[%],"
- << "wait_t_avg[usec]" << std::endl;
+ << "wait_t_avg[usec],"
+ << "algbw[GB/s],"
+ << "busbw[GB/s]" << std::endl;
csvf.close();
}

--
2.25.1

Loading