From 23539b1c297bbc3816b78bb80af4ef1d10d1cf21 Mon Sep 17 00:00:00 2001 From: jindonghe1 Date: Tue, 2 Sep 2025 14:45:45 +0800 Subject: [PATCH] feat: update the a2 base operator package to RC2 and support qwen2.5-vl/qwen3-moe on a3. --- CMakeLists.txt | 10 ++++---- README.md | 12 +++++----- README_zh.md | 12 +++++----- cibuild/build.sh | 2 +- docs/en/getting_started/compile.md | 6 ++--- docs/zh/getting_started/compile.md | 6 ++--- .../distributed_runtime/worker_server.cpp | 4 ---- xllm/core/layers/npu/CMakeLists.txt | 8 +++---- xllm/core/layers/npu/atb_head_impl.cpp | 7 ------ .../layers/npu/deepseek_v2_decoder_layer.cpp | 8 +++++-- xllm/models/CMakeLists.txt | 2 +- xllm/models/clip_text_model.h | 2 -- xllm/models/models.h | 24 +++++++++---------- 13 files changed, 46 insertions(+), 57 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 9e0628c64..bb588f839 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -28,25 +28,25 @@ if(USE_NPU) if(DEVICE_TYPE STREQUAL "USE_A3") message("downloading a3 arm xllm kernels") file(DOWNLOAD - "https://9n-das-tools.s3.cn-north-1.jdcloud-oss.com/xllm-ai/xllm_kernels/0.6.0/xllm_kernels-1.0.0-Linux.a3.arm.rpm" + "https://9n-das-tools.s3.cn-north-1.jdcloud-oss.com/xllm-ai/xllm_kernels/0.6.0/xllm_kernels-1.2.1-Linux.a3.arm.rpm" "${CMAKE_BINARY_DIR}/xllm_kernels.rpm" ) else() - message("downloading a2 arm xllm_kernels") if(DEVICE_ARCH STREQUAL "ARM") + message("downloading a2 arm xllm_kernels") file(DOWNLOAD - "https://9n-das-tools.s3.cn-north-1.jdcloud-oss.com/xllm-ai/xllm_kernels/0.6.0/xllm_kernels-1.2.0-Linux.a2.arm.rpm" + "https://9n-das-tools.s3.cn-north-1.jdcloud-oss.com/xllm-ai/xllm_kernels/0.6.0/xllm_kernels-1.2.1-Linux.a2.arm.rpm" "${CMAKE_BINARY_DIR}/xllm_kernels.rpm" ) else() message("downloading a3 x86 xllm_kernels") file(DOWNLOAD - "https://9n-das-tools.s3.cn-north-1.jdcloud-oss.com/xllm-ai/xllm_kernels/0.6.0/xllm_kernels-1.2.0-Linux.a2.x86.rpm" + 
"https://9n-das-tools.s3.cn-north-1.jdcloud-oss.com/xllm-ai/xllm_kernels/0.6.0/xllm_kernels-1.2.1-Linux.a2.x86.rpm" "${CMAKE_BINARY_DIR}/xllm_kernels.rpm" ) endif() endif() - execute_process(COMMAND rpm -ivh "${CMAKE_BINARY_DIR}/xllm_kernels.rpm") + execute_process(COMMAND rpm -ivh --replacepkgs --replacefiles "${CMAKE_BINARY_DIR}/xllm_kernels.rpm") file(WRITE "${CMAKE_BINARY_DIR}/.xllm_installed" "") endif() diff --git a/README.md b/README.md index 928bffe68..c0c82a206 100755 --- a/README.md +++ b/README.md @@ -112,18 +112,18 @@ Supported models list: First, download the image we provide: ```bash # A2 x86 -docker pull xllm/xllm-ai:xllm-0.6.0-dev-hb-py3.11-oe24.03-lts +docker pull xllm/xllm-ai:xllm-0.6.0-dev-hb-rc2-py3.11-oe24.03-lts # A2 arm -docker pull xllm/xllm-ai:xllm-0.6.0-dev-hb-py3.11-oe24.03-lts-aarch64 +docker pull xllm/xllm-ai:xllm-0.6.0-dev-hb-rc2-py3.11-oe24.03-lts-aarch64 # A3 arm -docker pull xllm/xllm-ai:xllm-0.6.0-dev-hc-py3.11-oe24.03-lts-aarch64 +docker pull xllm/xllm-ai:xllm-0.6.0-dev-hc-rc2-py3.11-oe24.03-lts-aarch64 # or # A2 x86 -docker pull quay.io/jd_xllm/xllm-ai:xllm-0.6.0-dev-hb-py3.11-oe24.03-lts +docker pull quay.io/jd_xllm/xllm-ai:xllm-0.6.0-dev-hb-rc2-py3.11-oe24.03-lts # A2 arm -docker pull quay.io/jd_xllm/xllm-ai:xllm-0.6.0-dev-hb-py3.11-oe24.03-lts-aarch64 +docker pull quay.io/jd_xllm/xllm-ai:xllm-0.6.0-dev-hb-rc2-py3.11-oe24.03-lts-aarch64 # A3 arm -docker pull quay.io/jd_xllm/xllm-ai:xllm-0.6.0-dev-hc-py3.11-oe24.03-lts-aarch64 +docker pull quay.io/jd_xllm/xllm-ai:xllm-0.6.0-dev-hc-rc2-py3.11-oe24.03-lts-aarch64 ``` Then create the corresponding container: ```bash diff --git a/README_zh.md b/README_zh.md index caa13afb8..f805f15f9 100755 --- a/README_zh.md +++ b/README_zh.md @@ -112,18 +112,18 @@ xLLM 提供了强大的智能计算能力,通过硬件系统的算力优化与 首先下载我们提供的镜像: ```bash # A2 x86 -docker pull xllm/xllm-ai:xllm-0.6.0-dev-hb-py3.11-oe24.03-lts +docker pull xllm/xllm-ai:xllm-0.6.0-dev-hb-rc2-py3.11-oe24.03-lts # A2 arm -docker pull 
xllm/xllm-ai:xllm-0.6.0-dev-hb-py3.11-oe24.03-lts-aarch64 +docker pull xllm/xllm-ai:xllm-0.6.0-dev-hb-rc2-py3.11-oe24.03-lts-aarch64 # A3 arm -docker pull xllm/xllm-ai:xllm-0.6.0-dev-hc-py3.11-oe24.03-lts-aarch64 +docker pull xllm/xllm-ai:xllm-0.6.0-dev-hc-rc2-py3.11-oe24.03-lts-aarch64 # 或者 # A2 x86 -docker pull quay.io/jd_xllm/xllm-ai:xllm-0.6.0-dev-hb-py3.11-oe24.03-lts +docker pull quay.io/jd_xllm/xllm-ai:xllm-0.6.0-dev-hb-rc2-py3.11-oe24.03-lts # A2 arm -docker pull quay.io/jd_xllm/xllm-ai:xllm-0.6.0-dev-hb-py3.11-oe24.03-lts-aarch64 +docker pull quay.io/jd_xllm/xllm-ai:xllm-0.6.0-dev-hb-rc2-py3.11-oe24.03-lts-aarch64 # A3 arm -docker pull quay.io/jd_xllm/xllm-ai:xllm-0.6.0-dev-hc-py3.11-oe24.03-lts-aarch64 +docker pull quay.io/jd_xllm/xllm-ai:xllm-0.6.0-dev-hc-rc2-py3.11-oe24.03-lts-aarch64 ``` 然后创建对应的容器 ```bash diff --git a/cibuild/build.sh b/cibuild/build.sh index ad4763e51..034fe99c6 100644 --- a/cibuild/build.sh +++ b/cibuild/build.sh @@ -6,7 +6,7 @@ function error() { exit 1 } -IMAGE="9d0b6f5a80f6" +IMAGE="quay.io/jd_xllm/xllm-ai:xllm-0.6.0-dev-hb-rc2-py3.11-oe24.03-lts" RUN_OPTS=( --rm diff --git a/docs/en/getting_started/compile.md b/docs/en/getting_started/compile.md index 5f0077958..b82a8ac86 100644 --- a/docs/en/getting_started/compile.md +++ b/docs/en/getting_started/compile.md @@ -14,9 +14,9 @@ sudo docker run -it --ipc=host -u 0 --privileged --name mydocker --network=host | Device | Arch | Images | |:---------:|:-----------:|:-------------:| -| A2 | x86 | xllm/xllm-ai:xllm-0.6.0-dev-hb-py3.11-oe24.03-lts | -| A2 | arm | xllm/xllm-ai:xllm-0.6.0-dev-hb-py3.11-oe24.03-lts-aarch64 | -| A3 | arm | xllm/xllm-ai:xllm-0.6.0-dev-hc-py3.11-oe24.03-lts-aarch64 | +| A2 | x86 | xllm/xllm-ai:xllm-0.6.0-dev-hb-rc2-py3.11-oe24.03-lts | +| A2 | arm | xllm/xllm-ai:xllm-0.6.0-dev-hb-rc2-py3.11-oe24.03-lts-aarch64 | +| A3 | arm | xllm/xllm-ai:xllm-0.6.0-dev-hc-rc2-py3.11-oe24.03-lts-aarch64 | ## Installation diff --git a/docs/zh/getting_started/compile.md 
b/docs/zh/getting_started/compile.md index 81c2f42ff..50133c2ba 100644 --- a/docs/zh/getting_started/compile.md +++ b/docs/zh/getting_started/compile.md @@ -15,9 +15,9 @@ sudo docker run -it --ipc=host -u 0 --privileged --name mydocker --network=host | Device | Arch | Images | |:---------:|:-----------:|:-------------:| -| A2 | x86 | xllm/xllm-ai:xllm-0.6.0-dev-hb-py3.11-oe24.03-lts | -| A2 | arm | xllm/xllm-ai:xllm-0.6.0-dev-hb-py3.11-oe24.03-lts-aarch64 | -| A3 | arm | xllm/xllm-ai:xllm-0.6.0-dev-hc-py3.11-oe24.03-lts-aarch64 | +| A2 | x86 | xllm/xllm-ai:xllm-0.6.0-dev-hb-rc2-py3.11-oe24.03-lts | +| A2 | arm | xllm/xllm-ai:xllm-0.6.0-dev-hb-rc2-py3.11-oe24.03-lts-aarch64 | +| A3 | arm | xllm/xllm-ai:xllm-0.6.0-dev-hc-rc2-py3.11-oe24.03-lts-aarch64 | ## 安装 diff --git a/xllm/core/distributed_runtime/worker_server.cpp b/xllm/core/distributed_runtime/worker_server.cpp index e6be556cb..8806b1fbd 100644 --- a/xllm/core/distributed_runtime/worker_server.cpp +++ b/xllm/core/distributed_runtime/worker_server.cpp @@ -131,11 +131,7 @@ void WorkerServer::create_server(const runtime::Options& options, #if defined(USE_NPU) atb_speed::base::Mapping mapping; mapping.ParseParam(mapping_data); -#if defined(USE_A3) mapping.InitGlobalCommDomain(FLAGS_communication_backend); -#else - mapping.InitCommDomain(FLAGS_communication_backend); -#endif auto moeEpParallelInfo = mapping.Get(atb_speed::base::MOE_EP); auto dispatchAndCombinecommDomain = atb_speed::GetSingleton().GetCommDomain( diff --git a/xllm/core/layers/npu/CMakeLists.txt b/xllm/core/layers/npu/CMakeLists.txt index da068f1a7..66b16eed2 100755 --- a/xllm/core/layers/npu/CMakeLists.txt +++ b/xllm/core/layers/npu/CMakeLists.txt @@ -20,8 +20,8 @@ cc_library( atb_head_impl.h siglip_encoder_layer.h multi_head_attention.h - $<$:qwen2_5_vision_encoder_layer.h> - $<$:qwen3_moe_decoder_layer.h> + qwen2_5_vision_encoder_layer.h + qwen3_moe_decoder_layer.h atb_linear.h atb_parallel_linear.h buffer/atb_buffer.h @@ -30,8 +30,8 @@ 
cc_library( llama_decoder_layer.cpp qwen2_decoder_layer.cpp qwen3_decoder_layer.cpp - $<$:qwen2_5_vision_encoder_layer.cpp> - $<$:qwen3_moe_decoder_layer.cpp> + qwen2_5_vision_encoder_layer.cpp + qwen3_moe_decoder_layer.cpp attn_mask.cpp atb_base.cpp word_embedding.cpp diff --git a/xllm/core/layers/npu/atb_head_impl.cpp b/xllm/core/layers/npu/atb_head_impl.cpp index a4d3740be..00a684708 100644 --- a/xllm/core/layers/npu/atb_head_impl.cpp +++ b/xllm/core/layers/npu/atb_head_impl.cpp @@ -60,16 +60,9 @@ void AtbLmHeadImpl::param_from_args(atb_speed::common::LmHeadParam& param, parallelInfo.rankIds.size(); param.linearParallelParam.tensorParallelInfo.backend = FLAGS_communication_backend; -#if defined(USE_A3) parallelInfo.InitCommDomain( param.linearParallelParam.tensorParallelInfo.hcommInfo, param.linearParallelParam.tensorParallelInfo.commDomain); -#else - param.linearParallelParam.tensorParallelInfo.hcommInfo = - parallelInfo.hcclComm; - param.linearParallelParam.tensorParallelInfo.commDomain = - parallelInfo.commDomain; -#endif } } } diff --git a/xllm/core/layers/npu/deepseek_v2_decoder_layer.cpp b/xllm/core/layers/npu/deepseek_v2_decoder_layer.cpp index 313d9138b..d71b567ff 100644 --- a/xllm/core/layers/npu/deepseek_v2_decoder_layer.cpp +++ b/xllm/core/layers/npu/deepseek_v2_decoder_layer.cpp @@ -437,9 +437,9 @@ void DeepseekV2DecoderImpl::initialize_basic_parameters( } param.maskfree = true; // TODO param.enableSwiGLUQuantForSharedExperts = false; // TODO -#if defined(USE_A3) param.scaledTopk = -1; param.enableATBGateMatmul = 1; +#if defined(USE_A3) param.enableLcocAll2All = 1; #endif num_key_value_heads_ = static_cast(args.n_kv_heads().value()); @@ -526,7 +526,11 @@ void DeepseekV2DecoderImpl::initialize_mlp_parameters( param.dispatchAndCombineHcclComm = parallel_args.dispatchAndCombineHcclComm(); param.dispatchAndCombinecommDomain = parallel_args.dispatchAndCombinecommDomain(); - +#if defined(USE_A3) + param.enableIndexGmm = false; +#else + param.enableIndexGmm 
= true; +#endif if (layer_id_ >= param.firstKDenseReplace) { param.enableQkvdownDp = false; param.enableSharedExpertDp = false; diff --git a/xllm/models/CMakeLists.txt b/xllm/models/CMakeLists.txt index b2e912d4e..2f3778df0 100755 --- a/xllm/models/CMakeLists.txt +++ b/xllm/models/CMakeLists.txt @@ -33,7 +33,7 @@ cc_library( $<$:llama.h> $<$:minicpmv.h> $<$:qwen2.h> - $<$:qwen2_5_vl.h> + $<$:qwen2_5_vl.h> $<$:qwen3.h> $<$:qwen3_moe.h> $<$:qwen_base.h> diff --git a/xllm/models/clip_text_model.h b/xllm/models/clip_text_model.h index 806c5a53d..78055595c 100755 --- a/xllm/models/clip_text_model.h +++ b/xllm/models/clip_text_model.h @@ -17,9 +17,7 @@ #include "processors/input_processor.h" #include "processors/pywarpper_image_processor.h" #include "processors/qwen2_vl_image_processor.h" -#if defined(USE_A2) #include "qwen2_5_vl.h" -#endif #include "xllm_kernels/core/include/atb_speed/log.h" namespace xllm::hf { diff --git a/xllm/models/models.h b/xllm/models/models.h index d339dd638..22c6723bd 100755 --- a/xllm/models/models.h +++ b/xllm/models/models.h @@ -16,20 +16,18 @@ limitations under the License. 
#pragma once #if defined(USE_NPU) -#include "deepseek_v2.h" // IWYU pragma: keep -#include "deepseek_v2_mtp.h" // IWYU pragma: keep -#include "deepseek_v3.h" // IWYU pragma: keep -#include "kimi_k2.h" // IWYU pragma: keep -#include "llama.h" // IWYU pragma: keep -#include "llama3.h" // IWYU pragma: keep -#include "minicpmv.h" // IWYU pragma: keep -#include "qwen2.h" // IWYU pragma: keep -#include "qwen3.h" // IWYU pragma: keep -#include "qwen3_embedding.h" // IWYU pragma: keep -#include "qwen_base.h" // IWYU pragma: keep -#if defined(USE_A2) +#include "deepseek_v2.h" // IWYU pragma: keep +#include "deepseek_v2_mtp.h" // IWYU pragma: keep +#include "deepseek_v3.h" // IWYU pragma: keep #include "flux/pipeline_flux.h" // IWYU pragma: keep +#include "kimi_k2.h" // IWYU pragma: keep +#include "llama.h" // IWYU pragma: keep +#include "llama3.h" // IWYU pragma: keep +#include "minicpmv.h" // IWYU pragma: keep +#include "qwen2.h" // IWYU pragma: keep #include "qwen2_5_vl.h" // IWYU pragma: keep +#include "qwen3.h" // IWYU pragma: keep +#include "qwen3_embedding.h" // IWYU pragma: keep #include "qwen3_moe.h" // IWYU pragma: keep +#include "qwen_base.h" // IWYU pragma: keep #endif -#endif \ No newline at end of file