From 23539b1c297bbc3816b78bb80af4ef1d10d1cf21 Mon Sep 17 00:00:00 2001
From: jindonghe1 <jindonghe1@jd.com>
Date: Tue, 2 Sep 2025 14:45:45 +0800
Subject: [PATCH] feat: update the a2 base operator package to RC2 and support
 qwen2.5-vl/qwen3-moe on a3.

---
 CMakeLists.txt                                | 10 ++++----
 README.md                                     | 12 +++++-----
 README_zh.md                                  | 12 +++++-----
 cibuild/build.sh                              |  2 +-
 docs/en/getting_started/compile.md            |  6 ++---
 docs/zh/getting_started/compile.md            |  6 ++---
 .../distributed_runtime/worker_server.cpp     |  4 ----
 xllm/core/layers/npu/CMakeLists.txt           |  8 +++----
 xllm/core/layers/npu/atb_head_impl.cpp        |  7 ------
 .../layers/npu/deepseek_v2_decoder_layer.cpp  |  8 +++++--
 xllm/models/CMakeLists.txt                    |  2 +-
 xllm/models/clip_text_model.h                 |  2 --
 xllm/models/models.h                          | 24 +++++++++----------
 13 files changed, 46 insertions(+), 57 deletions(-)

diff --git a/CMakeLists.txt b/CMakeLists.txt
index 9e0628c64..bb588f839 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -28,25 +28,25 @@ if(USE_NPU)
     if(DEVICE_TYPE STREQUAL "USE_A3")
         message("downloading a3 arm xllm kernels")
         file(DOWNLOAD 
-            "https://9n-das-tools.s3.cn-north-1.jdcloud-oss.com/xllm-ai/xllm_kernels/0.6.0/xllm_kernels-1.0.0-Linux.a3.arm.rpm"
+            "https://9n-das-tools.s3.cn-north-1.jdcloud-oss.com/xllm-ai/xllm_kernels/0.6.0/xllm_kernels-1.2.1-Linux.a3.arm.rpm"
             "${CMAKE_BINARY_DIR}/xllm_kernels.rpm"
         )
     else()  
-       message("downloading a2 arm xllm_kernels")
       if(DEVICE_ARCH STREQUAL "ARM")
+          message("downloading a2 arm xllm_kernels")
           file(DOWNLOAD 
-              "https://9n-das-tools.s3.cn-north-1.jdcloud-oss.com/xllm-ai/xllm_kernels/0.6.0/xllm_kernels-1.2.0-Linux.a2.arm.rpm"
+              "https://9n-das-tools.s3.cn-north-1.jdcloud-oss.com/xllm-ai/xllm_kernels/0.6.0/xllm_kernels-1.2.1-Linux.a2.arm.rpm"
               "${CMAKE_BINARY_DIR}/xllm_kernels.rpm"
           )
       else()
           message("downloading a3 x86 xllm_kernels")
           file(DOWNLOAD 
-              "https://9n-das-tools.s3.cn-north-1.jdcloud-oss.com/xllm-ai/xllm_kernels/0.6.0/xllm_kernels-1.2.0-Linux.a2.x86.rpm"
+              "https://9n-das-tools.s3.cn-north-1.jdcloud-oss.com/xllm-ai/xllm_kernels/0.6.0/xllm_kernels-1.2.1-Linux.a2.x86.rpm"
               "${CMAKE_BINARY_DIR}/xllm_kernels.rpm"
           )
       endif()
     endif()
-      execute_process(COMMAND rpm -ivh "${CMAKE_BINARY_DIR}/xllm_kernels.rpm")
+      execute_process(COMMAND rpm -ivh  --replacepkgs --replacefiles "${CMAKE_BINARY_DIR}/xllm_kernels.rpm")
       file(WRITE "${CMAKE_BINARY_DIR}/.xllm_installed" "")
   endif()
 
diff --git a/README.md b/README.md
index 928bffe68..c0c82a206 100755
--- a/README.md
+++ b/README.md
@@ -112,18 +112,18 @@ Supported models list:
 First, download the image we provide:
 ```bash
 # A2 x86
-docker pull xllm/xllm-ai:xllm-0.6.0-dev-hb-py3.11-oe24.03-lts
+docker pull xllm/xllm-ai:xllm-0.6.0-dev-hb-rc2-py3.11-oe24.03-lts
 # A2 arm
-docker pull xllm/xllm-ai:xllm-0.6.0-dev-hb-py3.11-oe24.03-lts-aarch64
+docker pull xllm/xllm-ai:xllm-0.6.0-dev-hb-rc2-py3.11-oe24.03-lts-aarch64
 # A3 arm
-docker pull xllm/xllm-ai:xllm-0.6.0-dev-hc-py3.11-oe24.03-lts-aarch64
+docker pull xllm/xllm-ai:xllm-0.6.0-dev-hc-rc2-py3.11-oe24.03-lts-aarch64
 # or
 # A2 x86
-docker pull quay.io/jd_xllm/xllm-ai:xllm-0.6.0-dev-hb-py3.11-oe24.03-lts
+docker pull quay.io/jd_xllm/xllm-ai:xllm-0.6.0-dev-hb-rc2-py3.11-oe24.03-lts
 # A2 arm
-docker pull quay.io/jd_xllm/xllm-ai:xllm-0.6.0-dev-hb-py3.11-oe24.03-lts-aarch64
+docker pull quay.io/jd_xllm/xllm-ai:xllm-0.6.0-dev-hb-rc2-py3.11-oe24.03-lts-aarch64
 # A3 arm
-docker pull quay.io/jd_xllm/xllm-ai:xllm-0.6.0-dev-hc-py3.11-oe24.03-lts-aarch64
+docker pull quay.io/jd_xllm/xllm-ai:xllm-0.6.0-dev-hc-rc2-py3.11-oe24.03-lts-aarch64
 ```
 Then create the corresponding container:
 ```bash
diff --git a/README_zh.md b/README_zh.md
index caa13afb8..f805f15f9 100755
--- a/README_zh.md
+++ b/README_zh.md
@@ -112,18 +112,18 @@ xLLM 提供了强大的智能计算能力，通过硬件系统的算力优化与
 首先下载我们提供的镜像：
 ```bash
 # A2 x86
-docker pull xllm/xllm-ai:xllm-0.6.0-dev-hb-py3.11-oe24.03-lts
+docker pull xllm/xllm-ai:xllm-0.6.0-dev-hb-rc2-py3.11-oe24.03-lts
 # A2 arm
-docker pull xllm/xllm-ai:xllm-0.6.0-dev-hb-py3.11-oe24.03-lts-aarch64
+docker pull xllm/xllm-ai:xllm-0.6.0-dev-hb-rc2-py3.11-oe24.03-lts-aarch64
 # A3 arm
-docker pull xllm/xllm-ai:xllm-0.6.0-dev-hc-py3.11-oe24.03-lts-aarch64
+docker pull xllm/xllm-ai:xllm-0.6.0-dev-hc-rc2-py3.11-oe24.03-lts-aarch64
 # 或者
 # A2 x86
-docker pull quay.io/jd_xllm/xllm-ai:xllm-0.6.0-dev-hb-py3.11-oe24.03-lts
+docker pull quay.io/jd_xllm/xllm-ai:xllm-0.6.0-dev-hb-rc2-py3.11-oe24.03-lts
 # A2 arm
-docker pull quay.io/jd_xllm/xllm-ai:xllm-0.6.0-dev-hb-py3.11-oe24.03-lts-aarch64
+docker pull quay.io/jd_xllm/xllm-ai:xllm-0.6.0-dev-hb-rc2-py3.11-oe24.03-lts-aarch64
 # A3 arm
-docker pull quay.io/jd_xllm/xllm-ai:xllm-0.6.0-dev-hc-py3.11-oe24.03-lts-aarch64
+docker pull quay.io/jd_xllm/xllm-ai:xllm-0.6.0-dev-hc-rc2-py3.11-oe24.03-lts-aarch64
 ```
 然后创建对应的容器
 ```bash
diff --git a/cibuild/build.sh b/cibuild/build.sh
index ad4763e51..034fe99c6 100644
--- a/cibuild/build.sh
+++ b/cibuild/build.sh
@@ -6,7 +6,7 @@ function error() {
   exit 1
 }
 
-IMAGE="9d0b6f5a80f6"
+IMAGE="quay.io/jd_xllm/xllm-ai:xllm-0.6.0-dev-hb-rc2-py3.11-oe24.03-lts"
 
 RUN_OPTS=(
   --rm
diff --git a/docs/en/getting_started/compile.md b/docs/en/getting_started/compile.md
index 5f0077958..b82a8ac86 100644
--- a/docs/en/getting_started/compile.md
+++ b/docs/en/getting_started/compile.md
@@ -14,9 +14,9 @@ sudo docker run -it --ipc=host -u 0 --privileged --name mydocker --network=host
 
 | Device    |    Arch     |   Images      |
 |:---------:|:-----------:|:-------------:|
-| A2        |     x86     | xllm/xllm-ai:xllm-0.6.0-dev-hb-py3.11-oe24.03-lts | 
-| A2        |     arm     | xllm/xllm-ai:xllm-0.6.0-dev-hb-py3.11-oe24.03-lts-aarch64 |
-| A3        |     arm     | xllm/xllm-ai:xllm-0.6.0-dev-hc-py3.11-oe24.03-lts-aarch64 |
+| A2        |     x86     | xllm/xllm-ai:xllm-0.6.0-dev-hb-rc2-py3.11-oe24.03-lts | 
+| A2        |     arm     | xllm/xllm-ai:xllm-0.6.0-dev-hb-rc2-py3.11-oe24.03-lts-aarch64 |
+| A3        |     arm     | xllm/xllm-ai:xllm-0.6.0-dev-hc-rc2-py3.11-oe24.03-lts-aarch64 |
 
 
 ## Installation
diff --git a/docs/zh/getting_started/compile.md b/docs/zh/getting_started/compile.md
index 81c2f42ff..50133c2ba 100644
--- a/docs/zh/getting_started/compile.md
+++ b/docs/zh/getting_started/compile.md
@@ -15,9 +15,9 @@ sudo docker run -it --ipc=host -u 0 --privileged --name mydocker --network=host
 
 | Device    |    Arch     |   Images      |
 |:---------:|:-----------:|:-------------:|
-| A2        |     x86     | xllm/xllm-ai:xllm-0.6.0-dev-hb-py3.11-oe24.03-lts | 
-| A2        |     arm     | xllm/xllm-ai:xllm-0.6.0-dev-hb-py3.11-oe24.03-lts-aarch64 |
-| A3        |     arm     | xllm/xllm-ai:xllm-0.6.0-dev-hc-py3.11-oe24.03-lts-aarch64 |
+| A2        |     x86     | xllm/xllm-ai:xllm-0.6.0-dev-hb-rc2-py3.11-oe24.03-lts | 
+| A2        |     arm     | xllm/xllm-ai:xllm-0.6.0-dev-hb-rc2-py3.11-oe24.03-lts-aarch64 |
+| A3        |     arm     | xllm/xllm-ai:xllm-0.6.0-dev-hc-rc2-py3.11-oe24.03-lts-aarch64 |
 
 
 ## 安装
diff --git a/xllm/core/distributed_runtime/worker_server.cpp b/xllm/core/distributed_runtime/worker_server.cpp
index e6be556cb..8806b1fbd 100644
--- a/xllm/core/distributed_runtime/worker_server.cpp
+++ b/xllm/core/distributed_runtime/worker_server.cpp
@@ -131,11 +131,7 @@ void WorkerServer::create_server(const runtime::Options& options,
 #if defined(USE_NPU)
   atb_speed::base::Mapping mapping;
   mapping.ParseParam(mapping_data);
-#if defined(USE_A3)
   mapping.InitGlobalCommDomain(FLAGS_communication_backend);
-#else
-  mapping.InitCommDomain(FLAGS_communication_backend);
-#endif
   auto moeEpParallelInfo = mapping.Get(atb_speed::base::MOE_EP);
   auto dispatchAndCombinecommDomain =
       atb_speed::GetSingleton<atb_speed::ExternalCommManager>().GetCommDomain(
diff --git a/xllm/core/layers/npu/CMakeLists.txt b/xllm/core/layers/npu/CMakeLists.txt
index da068f1a7..66b16eed2 100755
--- a/xllm/core/layers/npu/CMakeLists.txt
+++ b/xllm/core/layers/npu/CMakeLists.txt
@@ -20,8 +20,8 @@ cc_library(
     atb_head_impl.h
     siglip_encoder_layer.h
     multi_head_attention.h
-    $<$<BOOL:${USE_A2}>:qwen2_5_vision_encoder_layer.h>
-    $<$<BOOL:${USE_A2}>:qwen3_moe_decoder_layer.h>
+    qwen2_5_vision_encoder_layer.h
+    qwen3_moe_decoder_layer.h
     atb_linear.h
     atb_parallel_linear.h
     buffer/atb_buffer.h
@@ -30,8 +30,8 @@ cc_library(
     llama_decoder_layer.cpp
     qwen2_decoder_layer.cpp
     qwen3_decoder_layer.cpp
-    $<$<BOOL:${USE_A2}>:qwen2_5_vision_encoder_layer.cpp>
-    $<$<BOOL:${USE_A2}>:qwen3_moe_decoder_layer.cpp>
+    qwen2_5_vision_encoder_layer.cpp
+    qwen3_moe_decoder_layer.cpp
     attn_mask.cpp
     atb_base.cpp
     word_embedding.cpp
diff --git a/xllm/core/layers/npu/atb_head_impl.cpp b/xllm/core/layers/npu/atb_head_impl.cpp
index a4d3740be..00a684708 100644
--- a/xllm/core/layers/npu/atb_head_impl.cpp
+++ b/xllm/core/layers/npu/atb_head_impl.cpp
@@ -60,16 +60,9 @@ void AtbLmHeadImpl::param_from_args(atb_speed::common::LmHeadParam& param,
           parallelInfo.rankIds.size();
       param.linearParallelParam.tensorParallelInfo.backend =
           FLAGS_communication_backend;
-#if defined(USE_A3)
       parallelInfo.InitCommDomain(
           param.linearParallelParam.tensorParallelInfo.hcommInfo,
           param.linearParallelParam.tensorParallelInfo.commDomain);
-#else
-      param.linearParallelParam.tensorParallelInfo.hcommInfo =
-          parallelInfo.hcclComm;
-      param.linearParallelParam.tensorParallelInfo.commDomain =
-          parallelInfo.commDomain;
-#endif
     }
   }
 }
diff --git a/xllm/core/layers/npu/deepseek_v2_decoder_layer.cpp b/xllm/core/layers/npu/deepseek_v2_decoder_layer.cpp
index 313d9138b..d71b567ff 100644
--- a/xllm/core/layers/npu/deepseek_v2_decoder_layer.cpp
+++ b/xllm/core/layers/npu/deepseek_v2_decoder_layer.cpp
@@ -437,9 +437,9 @@ void DeepseekV2DecoderImpl::initialize_basic_parameters(
   }
   param.maskfree = true;                            // TODO
   param.enableSwiGLUQuantForSharedExperts = false;  // TODO
-#if defined(USE_A3)
   param.scaledTopk = -1;
   param.enableATBGateMatmul = 1;
+#if defined(USE_A3)
   param.enableLcocAll2All = 1;
 #endif
   num_key_value_heads_ = static_cast<int>(args.n_kv_heads().value());
@@ -526,7 +526,11 @@ void DeepseekV2DecoderImpl::initialize_mlp_parameters(
   param.dispatchAndCombineHcclComm = parallel_args.dispatchAndCombineHcclComm();
   param.dispatchAndCombinecommDomain =
       parallel_args.dispatchAndCombinecommDomain();
-
+#if defined(USE_A3)
+  param.enableIndexGmm = false;
+#else
+  param.enableIndexGmm = true;
+#endif
   if (layer_id_ >= param.firstKDenseReplace) {
     param.enableQkvdownDp = false;
     param.enableSharedExpertDp = false;
diff --git a/xllm/models/CMakeLists.txt b/xllm/models/CMakeLists.txt
index b2e912d4e..2f3778df0 100755
--- a/xllm/models/CMakeLists.txt
+++ b/xllm/models/CMakeLists.txt
@@ -33,7 +33,7 @@ cc_library(
     $<$<BOOL:${USE_NPU}>:llama.h>
     $<$<BOOL:${USE_NPU}>:minicpmv.h>
     $<$<BOOL:${USE_NPU}>:qwen2.h>
-    $<$<BOOL:${USE_A2}>:qwen2_5_vl.h>
+    $<$<BOOL:${USE_NPU}>:qwen2_5_vl.h>
     $<$<BOOL:${USE_NPU}>:qwen3.h>
     $<$<BOOL:${USE_NPU}>:qwen3_moe.h>
     $<$<BOOL:${USE_NPU}>:qwen_base.h>
diff --git a/xllm/models/clip_text_model.h b/xllm/models/clip_text_model.h
index 806c5a53d..78055595c 100755
--- a/xllm/models/clip_text_model.h
+++ b/xllm/models/clip_text_model.h
@@ -17,9 +17,7 @@
 #include "processors/input_processor.h"
 #include "processors/pywarpper_image_processor.h"
 #include "processors/qwen2_vl_image_processor.h"
-#if defined(USE_A2)
 #include "qwen2_5_vl.h"
-#endif
 #include "xllm_kernels/core/include/atb_speed/log.h"
 
 namespace xllm::hf {
diff --git a/xllm/models/models.h b/xllm/models/models.h
index d339dd638..22c6723bd 100755
--- a/xllm/models/models.h
+++ b/xllm/models/models.h
@@ -16,20 +16,18 @@ limitations under the License.
 #pragma once
 
 #if defined(USE_NPU)
-#include "deepseek_v2.h"      // IWYU pragma: keep
-#include "deepseek_v2_mtp.h"  // IWYU pragma: keep
-#include "deepseek_v3.h"      // IWYU pragma: keep
-#include "kimi_k2.h"          // IWYU pragma: keep
-#include "llama.h"            // IWYU pragma: keep
-#include "llama3.h"           // IWYU pragma: keep
-#include "minicpmv.h"         // IWYU pragma: keep
-#include "qwen2.h"            // IWYU pragma: keep
-#include "qwen3.h"            // IWYU pragma: keep
-#include "qwen3_embedding.h"  // IWYU pragma: keep
-#include "qwen_base.h"        // IWYU pragma: keep
-#if defined(USE_A2)
+#include "deepseek_v2.h"         // IWYU pragma: keep
+#include "deepseek_v2_mtp.h"     // IWYU pragma: keep
+#include "deepseek_v3.h"         // IWYU pragma: keep
 #include "flux/pipeline_flux.h"  // IWYU pragma: keep
+#include "kimi_k2.h"             // IWYU pragma: keep
+#include "llama.h"               // IWYU pragma: keep
+#include "llama3.h"              // IWYU pragma: keep
+#include "minicpmv.h"            // IWYU pragma: keep
+#include "qwen2.h"               // IWYU pragma: keep
 #include "qwen2_5_vl.h"          // IWYU pragma: keep
+#include "qwen3.h"               // IWYU pragma: keep
+#include "qwen3_embedding.h"     // IWYU pragma: keep
 #include "qwen3_moe.h"           // IWYU pragma: keep
+#include "qwen_base.h"           // IWYU pragma: keep
 #endif
-#endif
\ No newline at end of file