6 changes: 3 additions & 3 deletions xllm/core/common/global_flags.cpp
@@ -343,7 +343,7 @@ DEFINE_string(store_metadata_connstring,
"",
"The address of the kv cache store metadata service.");

// --- for computation communication parallel ---
// --- computation communication parallel config ---

DEFINE_bool(
enable_multi_stream_parallel,
@@ -355,7 +355,7 @@ DEFINE_int32(default_micro_batch_num,
2,
"Default use two micro batches for multi-stream parallel.");

// --- for dit ---
// --- dit config ---
DEFINE_int32(max_requests_per_batch, 1, "Max number of requests per batch.");

// --- continuous kv cache config ---
@@ -377,4 +377,4 @@ DEFINE_int64(cache_size_per_token,

DEFINE_int64(buffer_size_per_seq,
0,
"Buffer size per sequence in bytes, default 0.");
"Buffer size per sequence in bytes, default 0.");
3 changes: 1 addition & 2 deletions xllm/core/common/global_flags.h
@@ -189,7 +189,6 @@ DECLARE_int32(max_global_ttft_ms);

DECLARE_int32(max_global_tpot_ms);

// dit
DECLARE_int32(max_requests_per_batch);

DECLARE_bool(enable_continuous_kvcache);
@@ -198,4 +197,4 @@ DECLARE_int64(granularity_size);

DECLARE_int64(cache_size_per_token);

DECLARE_int64(buffer_size_per_seq);
DECLARE_int64(buffer_size_per_seq);
1 change: 0 additions & 1 deletion xllm/core/framework/batch/batch_input_builder.h
@@ -128,7 +128,6 @@ class BatchInputBuilder {
uint32_t q_seq_len,
BuilderState* state_ptr = nullptr,
std::unordered_set<int32_t>* write_block_ids_ptr = nullptr);

void setup_continuous_kv_cache_info(Sequence* sequence,
uint32_t n_kv_cache_tokens,
uint32_t seq_len,
1 change: 1 addition & 0 deletions xllm/core/framework/model/model_input_params.h
@@ -93,6 +93,7 @@ struct ModelInputParams {

// Copy graph_buffer to device
params.graph_buffer = safe_to(graph_buffer, device, true);

return params;
}

18 changes: 15 additions & 3 deletions xllm/core/kernels/CMakeLists.txt
@@ -1,12 +1,24 @@
include(cc_library)

if(USE_NPU)
include_directories(
${CMAKE_SOURCE_DIR}/third_party/spdlog/include
)
add_subdirectory(npu)
endif()

if(USE_MLU)
add_subdirectory(mlu)
endif()


cc_library(
NAME
kernels
HDRS
param.h
ops_api.h
SRCS
ops_api.cpp
DEPS
torch
$<$<BOOL:${USE_NPU}>:npu_kernels>
$<$<BOOL:${USE_MLU}>:mlu_kernels>
)
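
The DEPS generator expressions link npu_kernels / mlu_kernels into the umbrella kernels target only when the matching build option is on, so ops_api.cpp can dispatch behind one stable header. A plausible dispatch sketch under that assumption (the real ops_api.cpp may differ):

// Hypothetical backend dispatch; assumes USE_MLU/USE_NPU are also defined
// as compile-time macros for this target.
#include <optional>
#include <torch/torch.h>

#if defined(USE_MLU)
#include "mlu/mlu_ops_api.h"  // MLU entry points added in this PR
#endif

namespace xllm::kernel {

inline torch::Tensor matmul(const torch::Tensor& a, const torch::Tensor& b) {
#if defined(USE_MLU)
  return mlu::matmul(a, b, std::nullopt, std::nullopt, /*alpha=*/1.0,
                     /*beta=*/0.0);
#else
  return torch::matmul(a, b);  // fallback for builds without a vendor backend
#endif
}

}  // namespace xllm::kernel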
3 changes: 1 addition & 2 deletions xllm/core/kernels/mlu/CMakeLists.txt
@@ -2,7 +2,6 @@ include(cc_library)

file(GLOB_RECURSE MLU_HEADER_FILES
"${CMAKE_CURRENT_LIST_DIR}/*.h"
"${CMAKE_CURRENT_LIST_DIR}/*.hpp"
)

file(GLOB_RECURSE MLU_SOURCE_FILES
@@ -11,7 +10,7 @@ file(GLOB_RECURSE MLU_SOURCE_FILES

cc_library(
NAME
xllm_mlu_ops
mlu_kernels
HDRS
${MLU_HEADER_FILES}
SRCS
38 changes: 38 additions & 0 deletions xllm/core/kernels/mlu/active.cpp
@@ -0,0 +1,38 @@
/* Copyright 2025 The xLLM Authors. All Rights Reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

https://github.com/jd-opensource/xllm/blob/main/LICENSE

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/

#include "mlu_ops_api.h"
#include "torch_mlu_ops.h"

namespace xllm::kernel::mlu {

void active(const torch::Tensor& input,
torch::Tensor& output,
const std::optional<torch::Tensor>& bias,
const std::optional<torch::Tensor>& cusum_token_count,
const std::string& act_mode,
bool is_gated,
int start_expert_id,
int expert_size) {
tmo::torch_api::active(input,
output,
bias,
cusum_token_count,
act_mode,
is_gated,
start_expert_id,
expert_size);
}
} // namespace xllm::kernel::mlu
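
A minimal call sketch, assuming "silu" is an accepted act_mode string and that a non-gated, non-MoE call can pass zeros for the expert arguments (both assumptions, not verified against tmo):

#include "mlu_ops_api.h"

// Hypothetical usage; both tensors must already live on an MLU device.
void silu_example(const torch::Tensor& input, torch::Tensor& output) {
  xllm::kernel::mlu::active(input,
                            output,
                            /*bias=*/std::nullopt,
                            /*cusum_token_count=*/std::nullopt,
                            /*act_mode=*/"silu",
                            /*is_gated=*/false,
                            /*start_expert_id=*/0,
                            /*expert_size=*/0);
}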
119 changes: 119 additions & 0 deletions xllm/core/kernels/mlu/attention.cpp
@@ -0,0 +1,119 @@
/* Copyright 2025 The xLLM Authors. All Rights Reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

https://github.com/jd-opensource/xllm/blob/main/LICENSE

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/

#include "mlu_ops_api.h"
#include "torch_mlu_ops.h"

namespace xllm::kernel::mlu {

void reshape_paged_cache(torch::Tensor& key,
torch::Tensor& value,
torch::Tensor& k_cache,
torch::Tensor& v_cache,
const torch::Tensor& slot_mapping,
bool direction) {
tmo::torch_api::reshape_paged_cache(
key, value, k_cache, v_cache, slot_mapping, direction);
}

void batch_prefill(const torch::Tensor& query,
const torch::Tensor& key,
const torch::Tensor& value,
torch::Tensor& output,
std::optional<torch::Tensor>& output_lse,
const std::optional<torch::Tensor>& query_start_loc,
const std::optional<torch::Tensor>& seq_start_loc,
const std::optional<torch::Tensor>& alibi_slope,
const std::optional<torch::Tensor>& attn_bias,
const std::optional<torch::Tensor>& q_quant_scale,
const std::optional<torch::Tensor>& k_quant_scale,
const std::optional<torch::Tensor>& v_quant_scale,
const std::optional<torch::Tensor>& out_quant_scale,
const std::optional<torch::Tensor>& block_table,
int max_query_len,
int max_seq_len,
float scale,
bool is_causal,
int window_size_left,
int window_size_right,
const std::string& compute_dtype,
bool return_lse) {
tmo::torch_api::flash_attention(query,
key,
value,
output,
output_lse,
query_start_loc,
seq_start_loc,
alibi_slope,
attn_bias,
q_quant_scale,
k_quant_scale,
v_quant_scale,
out_quant_scale,
block_table,
max_query_len,
max_seq_len,
scale,
is_causal,
window_size_left,
window_size_right,
compute_dtype,
return_lse);
}

void batch_decode(const torch::Tensor& query,
const torch::Tensor& k_cache,
torch::Tensor& output,
const torch::Tensor& block_table,
const torch::Tensor& seq_lens,
const torch::Tensor& v_cache,
std::optional<torch::Tensor>& output_lse,
const std::optional<torch::Tensor>& q_quant_scale,
const std::optional<torch::Tensor>& k_cache_quant_scale,
const std::optional<torch::Tensor>& v_cache_quant_scale,
const std::optional<torch::Tensor>& out_quant_scale,
const std::optional<torch::Tensor>& alibi_slope,
const std::optional<torch::Tensor>& mask,
const std::string& compute_dtype,
int max_seq_len,
int window_size_left,
int window_size_right,
float scale,
bool return_lse,
int kv_cache_quant_bit_size) {
tmo::torch_api::single_query_cached_kv_attn(query,
k_cache,
output,
block_table,
seq_lens,
v_cache,
output_lse,
q_quant_scale,
k_cache_quant_scale,
v_cache_quant_scale,
out_quant_scale,
alibi_slope,
mask,
compute_dtype,
max_seq_len,
window_size_left,
window_size_right,
scale,
return_lse,
kv_cache_quant_bit_size);
}

} // namespace xllm::kernel::mlu
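
A sketch of how the two cache-facing entry points compose on the decode path. Assumptions not established by the diff: direction=true means "scatter key/value into the paged cache", "bf16" is a valid compute_dtype, window sizes of -1 disable sliding-window attention, and kv_cache_quant_bit_size of 0 means no cache quantization:

#include <optional>
#include "mlu_ops_api.h"

// Hypothetical single decode step over a paged KV cache.
void decode_step(const torch::Tensor& query,   // new-token queries
                 torch::Tensor& key,           // this step's keys
                 torch::Tensor& value,         // this step's values
                 torch::Tensor& k_cache,
                 torch::Tensor& v_cache,
                 const torch::Tensor& slot_mapping,
                 const torch::Tensor& block_table,
                 const torch::Tensor& seq_lens,
                 torch::Tensor& output,
                 int max_seq_len,
                 float scale) {
  // 1) Append this step's K/V into the paged cache.
  xllm::kernel::mlu::reshape_paged_cache(
      key, value, k_cache, v_cache, slot_mapping, /*direction=*/true);

  // 2) Attend over the cache for the new query tokens.
  std::optional<torch::Tensor> lse;  // LSE not requested
  xllm::kernel::mlu::batch_decode(
      query, k_cache, output, block_table, seq_lens, v_cache, lse,
      /*q_quant_scale=*/std::nullopt, /*k_cache_quant_scale=*/std::nullopt,
      /*v_cache_quant_scale=*/std::nullopt, /*out_quant_scale=*/std::nullopt,
      /*alibi_slope=*/std::nullopt, /*mask=*/std::nullopt,
      /*compute_dtype=*/"bf16", max_seq_len,
      /*window_size_left=*/-1, /*window_size_right=*/-1, scale,
      /*return_lse=*/false, /*kv_cache_quant_bit_size=*/0);
}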
53 changes: 53 additions & 0 deletions xllm/core/kernels/mlu/fused_layernorm.cpp
@@ -0,0 +1,53 @@
/* Copyright 2025 The xLLM Authors. All Rights Reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

https://github.com/jd-opensource/xllm/blob/main/LICENSE

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/

#include "mlu_ops_api.h"
#include "torch_mlu_ops.h"

namespace xllm::kernel::mlu {

void fused_layernorm(const torch::Tensor& input,
torch::Tensor& output,
const std::optional<torch::Tensor>& residual,
const torch::Tensor& weight,
const std::optional<torch::Tensor>& beta,
const std::optional<torch::Tensor>& bias,
const std::optional<torch::Tensor>& quant_scale,
const std::optional<torch::Tensor>& residual_out,
const std::optional<torch::Tensor>& smooth_quant_scale,
const std::optional<torch::Tensor>& normed_out,
const std::string& mode,
double eps,
bool store_output_before_norm,
bool store_output_after_norm,
bool dynamic_quant) {
tmo::torch_api::fused_layernorm(input,
output,
residual,
weight,
beta,
bias,
quant_scale,
residual_out,
smooth_quant_scale,
normed_out,
mode,
eps,
store_output_before_norm,
store_output_after_norm,
dynamic_quant);
}

} // namespace xllm::kernel::mlu
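
A minimal RMSNorm-style call sketch, assuming "rmsnorm" is an accepted mode string and that with no residual or quantization tensors the store_/quant flags are inert (assumptions, not verified against tmo):

#include "mlu_ops_api.h"

// Hypothetical call: output = rmsnorm(input, eps) * weight.
void rmsnorm_example(const torch::Tensor& input,
                     torch::Tensor& output,
                     const torch::Tensor& weight) {
  xllm::kernel::mlu::fused_layernorm(input,
                                     output,
                                     /*residual=*/std::nullopt,
                                     weight,
                                     /*beta=*/std::nullopt,
                                     /*bias=*/std::nullopt,
                                     /*quant_scale=*/std::nullopt,
                                     /*residual_out=*/std::nullopt,
                                     /*smooth_quant_scale=*/std::nullopt,
                                     /*normed_out=*/std::nullopt,
                                     /*mode=*/"rmsnorm",
                                     /*eps=*/1e-6,
                                     /*store_output_before_norm=*/false,
                                     /*store_output_after_norm=*/false,
                                     /*dynamic_quant=*/false);
}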
6 changes: 3 additions & 3 deletions xllm/core/kernels/mlu/fused_moe.cpp
@@ -13,8 +13,8 @@ See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/

#include "mlu_ops_api.h"
#include "torch_mlu_ops.h"
#include "torch_ops_api.h"

namespace {
torch::Tensor create_group_gemm_output(const torch::Tensor& a,
@@ -27,7 +27,7 @@ torch::Tensor create_group_gemm_output(const torch::Tensor& a,
}
} // namespace

namespace xllm::mlu {
namespace xllm::kernel::mlu {
torch::Tensor fused_moe(
const torch::Tensor& hidden_states,
const torch::Tensor& gating_output,
@@ -175,4 +175,4 @@ torch::Tensor fused_moe(
return output.reshape(ori_input_shape);
}

} // namespace xllm::mlu
} // namespace xllm::kernel::mlu
18 changes: 9 additions & 9 deletions xllm/core/kernels/mlu/matmul.cpp
@@ -13,17 +13,17 @@ See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/

#include "mlu_ops_api.h"
#include "torch_mlu_ops.h"
#include "torch_ops_api.h"

namespace xllm::mlu {
namespace xllm::kernel::mlu {

at::Tensor matmul(const at::Tensor& a,
const at::Tensor& b,
const std::optional<at::Tensor>& bias,
const std::optional<at::Tensor>& c,
double alpha,
double beta) {
torch::Tensor matmul(const torch::Tensor& a,
const torch::Tensor& b,
const std::optional<torch::Tensor>& bias,
const std::optional<torch::Tensor>& c,
double alpha,
double beta) {
return tmo::torch_api::matmul(a,
b,
bias,
@@ -43,4 +43,4 @@ at::Tensor matmul(const at::Tensor& a,
true);
}

} // namespace xllm::mlu
} // namespace xllm::kernel::mlu
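
With the signature now on torch::Tensor, a plain projection call reduces to the sketch below; the GEMM-style semantics (alpha scales a @ b, beta scales the optional c accumulator) are assumed from the parameter names, not stated in the diff:

#include "mlu_ops_api.h"

// Hypothetical projection: y = x @ w, no bias, no accumulation.
torch::Tensor project_example(const torch::Tensor& x, const torch::Tensor& w) {
  return xllm::kernel::mlu::matmul(x, w,
                                   /*bias=*/std::nullopt,
                                   /*c=*/std::nullopt,
                                   /*alpha=*/1.0,
                                   /*beta=*/0.0);
}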