6 changes: 3 additions & 3 deletions xllm/core/common/global_flags.cpp
@@ -343,7 +343,7 @@ DEFINE_string(store_metadata_connstring,
"",
"The address of the kv cache store metadata service.");

// --- for computation communication parallel ---
// --- computation communication parallel config ---

DEFINE_bool(
enable_multi_stream_parallel,
@@ -355,7 +355,7 @@ DEFINE_int32(default_micro_batch_num,
2,
"Default use two micro batches for multi-stream parallel.");

// --- for dit ---
// --- dit config ---
DEFINE_int32(max_requests_per_batch, 1, "Max number of requests per batch.");

// --- continuous kv cache config ---
@@ -377,4 +377,4 @@ DEFINE_int64(cache_size_per_token,

DEFINE_int64(buffer_size_per_seq,
0,
"Buffer size per sequence in bytes, default 0.");
"Buffer size per sequence in bytes, default 0.");
3 changes: 1 addition & 2 deletions xllm/core/common/global_flags.h
@@ -189,7 +189,6 @@ DECLARE_int32(max_global_ttft_ms);

DECLARE_int32(max_global_tpot_ms);

// dit
DECLARE_int32(max_requests_per_batch);

DECLARE_bool(enable_continuous_kvcache);
@@ -198,4 +197,4 @@ DECLARE_int64(granularity_size);

DECLARE_int64(cache_size_per_token);

DECLARE_int64(buffer_size_per_seq);
DECLARE_int64(buffer_size_per_seq);
1 change: 0 additions & 1 deletion xllm/core/framework/batch/batch_input_builder.h
@@ -128,7 +128,6 @@ class BatchInputBuilder {
uint32_t q_seq_len,
BuilderState* state_ptr = nullptr,
std::unordered_set<int32_t>* write_block_ids_ptr = nullptr);

void setup_continuous_kv_cache_info(Sequence* sequence,
uint32_t n_kv_cache_tokens,
uint32_t seq_len,
1 change: 1 addition & 0 deletions xllm/core/framework/model/model_input_params.h
@@ -93,6 +93,7 @@ struct ModelInputParams {

// Copy graph_buffer to device
params.graph_buffer = safe_to(graph_buffer, device, true);

return params;
}

18 changes: 15 additions & 3 deletions xllm/core/kernels/CMakeLists.txt
@@ -1,12 +1,24 @@
include(cc_library)

if(USE_NPU)
include_directories(
${CMAKE_SOURCE_DIR}/third_party/spdlog/include
)
add_subdirectory(npu)
endif()

if(USE_MLU)
add_subdirectory(mlu)
endif()


cc_library(
NAME
kernels
HDRS
param.h
ops_api.h
SRCS
ops_api.cpp
DEPS
torch
$<$<BOOL:${USE_NPU}>:npu_kernels>
$<$<BOOL:${USE_MLU}>:mlu_kernels>
)
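
The DEPS generator expressions link npu_kernels / mlu_kernels into the umbrella kernels target only when the matching build option is on, so ops_api.cpp can dispatch behind one stable header. A plausible dispatch sketch under that assumption (the real ops_api.cpp may differ):

// Hypothetical backend dispatch; assumes USE_MLU/USE_NPU are also defined
// as compile-time macros for this target.
#include <optional>
#include <torch/torch.h>

#if defined(USE_MLU)
#include "mlu/mlu_ops_api.h"  // MLU entry points added in this PR
#endif

namespace xllm::kernel {

inline torch::Tensor matmul(const torch::Tensor& a, const torch::Tensor& b) {
#if defined(USE_MLU)
  return mlu::matmul(a, b, std::nullopt, std::nullopt, /*alpha=*/1.0,
                     /*beta=*/0.0);
#else
  return torch::matmul(a, b);  // fallback for builds without a vendor backend
#endif
}

}  // namespace xllm::kernel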
3 changes: 1 addition & 2 deletions xllm/core/kernels/mlu/CMakeLists.txt
@@ -2,7 +2,6 @@ include(cc_library)

file(GLOB_RECURSE MLU_HEADER_FILES
"${CMAKE_CURRENT_LIST_DIR}/*.h"
"${CMAKE_CURRENT_LIST_DIR}/*.hpp"
)

file(GLOB_RECURSE MLU_SOURCE_FILES
@@ -11,7 +10,7 @@ file(GLOB_RECURSE MLU_SOURCE_FILES

cc_library(
NAME
xllm_mlu_ops
mlu_kernels
HDRS
${MLU_HEADER_FILES}
SRCS
38 changes: 38 additions & 0 deletions xllm/core/kernels/mlu/active.cpp
@@ -0,0 +1,38 @@
/* Copyright 2025 The xLLM Authors. All Rights Reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

https://github.com/jd-opensource/xllm/blob/main/LICENSE

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/

#include "mlu_ops_api.h"
#include "torch_mlu_ops.h"

namespace xllm::kernel::mlu {

void active(const torch::Tensor& input,
torch::Tensor& output,
const std::optional<torch::Tensor>& bias,
const std::optional<torch::Tensor>& cusum_token_count,
const std::string& act_mode,
bool is_gated,
int start_expert_id,
int expert_size) {
tmo::torch_api::active(input,
output,
bias,
cusum_token_count,
act_mode,
is_gated,
start_expert_id,
expert_size);
}
} // namespace xllm::kernel::mlu
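
A minimal call sketch, assuming "silu" is an accepted act_mode string and that a non-gated, non-MoE call can pass zeros for the expert arguments (both assumptions, not verified against tmo):

#include "mlu_ops_api.h"

// Hypothetical usage; both tensors must already live on an MLU device.
void silu_example(const torch::Tensor& input, torch::Tensor& output) {
  xllm::kernel::mlu::active(input,
                            output,
                            /*bias=*/std::nullopt,
                            /*cusum_token_count=*/std::nullopt,
                            /*act_mode=*/"silu",
                            /*is_gated=*/false,
                            /*start_expert_id=*/0,
                            /*expert_size=*/0);
}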
119 changes: 119 additions & 0 deletions xllm/core/kernels/mlu/attention.cpp
@@ -0,0 +1,119 @@
/* Copyright 2025 The xLLM Authors. All Rights Reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

https://github.com/jd-opensource/xllm/blob/main/LICENSE

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/

#include "mlu_ops_api.h"
#include "torch_mlu_ops.h"

namespace xllm::kernel::mlu {

void reshape_paged_cache(torch::Tensor& key,
torch::Tensor& value,
torch::Tensor& k_cache,
torch::Tensor& v_cache,
const torch::Tensor& slot_mapping,
bool direction) {
tmo::torch_api::reshape_paged_cache(
key, value, k_cache, v_cache, slot_mapping, direction);
}

void batch_prefill(const torch::Tensor& query,
const torch::Tensor& key,
const torch::Tensor& value,
torch::Tensor& output,
std::optional<torch::Tensor>& output_lse,
const std::optional<torch::Tensor>& query_start_loc,
const std::optional<torch::Tensor>& seq_start_loc,
const std::optional<torch::Tensor>& alibi_slope,
const std::optional<torch::Tensor>& attn_bias,
const std::optional<torch::Tensor>& q_quant_scale,
const std::optional<torch::Tensor>& k_quant_scale,
const std::optional<torch::Tensor>& v_quant_scale,
const std::optional<torch::Tensor>& out_quant_scale,
const std::optional<torch::Tensor>& block_table,
int max_query_len,
int max_seq_len,
float scale,
bool is_causal,
int window_size_left,
int window_size_right,
const std::string& compute_dtype,
bool return_lse) {
tmo::torch_api::flash_attention(query,
key,
value,
output,
output_lse,
query_start_loc,
seq_start_loc,
alibi_slope,
attn_bias,
q_quant_scale,
k_quant_scale,
v_quant_scale,
out_quant_scale,
block_table,
max_query_len,
max_seq_len,
scale,
is_causal,
window_size_left,
window_size_right,
compute_dtype,
return_lse);
}

void batch_decode(const torch::Tensor& query,
const torch::Tensor& k_cache,
torch::Tensor& output,
const torch::Tensor& block_table,
const torch::Tensor& seq_lens,
const torch::Tensor& v_cache,
std::optional<torch::Tensor>& output_lse,
const std::optional<torch::Tensor>& q_quant_scale,
const std::optional<torch::Tensor>& k_cache_quant_scale,
const std::optional<torch::Tensor>& v_cache_quant_scale,
const std::optional<torch::Tensor>& out_quant_scale,
const std::optional<torch::Tensor>& alibi_slope,
const std::optional<torch::Tensor>& mask,
const std::string& compute_dtype,
int max_seq_len,
int window_size_left,
int window_size_right,
float scale,
bool return_lse,
int kv_cache_quant_bit_size) {
tmo::torch_api::single_query_cached_kv_attn(query,
k_cache,
output,
block_table,
seq_lens,
v_cache,
output_lse,
q_quant_scale,
k_cache_quant_scale,
v_cache_quant_scale,
out_quant_scale,
alibi_slope,
mask,
compute_dtype,
max_seq_len,
window_size_left,
window_size_right,
scale,
return_lse,
kv_cache_quant_bit_size);
}

} // namespace xllm::kernel::mlu
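
A sketch of how the two cache-facing entry points compose on the decode path. Assumptions not established by the diff: direction=true means "scatter key/value into the paged cache", "bf16" is a valid compute_dtype, window sizes of -1 disable sliding-window attention, and kv_cache_quant_bit_size of 0 means no cache quantization:

#include <optional>
#include "mlu_ops_api.h"

// Hypothetical single decode step over a paged KV cache.
void decode_step(const torch::Tensor& query,   // new-token queries
                 torch::Tensor& key,           // this step's keys
                 torch::Tensor& value,         // this step's values
                 torch::Tensor& k_cache,
                 torch::Tensor& v_cache,
                 const torch::Tensor& slot_mapping,
                 const torch::Tensor& block_table,
                 const torch::Tensor& seq_lens,
                 torch::Tensor& output,
                 int max_seq_len,
                 float scale) {
  // 1) Append this step's K/V into the paged cache.
  xllm::kernel::mlu::reshape_paged_cache(
      key, value, k_cache, v_cache, slot_mapping, /*direction=*/true);

  // 2) Attend over the cache for the new query tokens.
  std::optional<torch::Tensor> lse;  // LSE not requested
  xllm::kernel::mlu::batch_decode(
      query, k_cache, output, block_table, seq_lens, v_cache, lse,
      /*q_quant_scale=*/std::nullopt, /*k_cache_quant_scale=*/std::nullopt,
      /*v_cache_quant_scale=*/std::nullopt, /*out_quant_scale=*/std::nullopt,
      /*alibi_slope=*/std::nullopt, /*mask=*/std::nullopt,
      /*compute_dtype=*/"bf16", max_seq_len,
      /*window_size_left=*/-1, /*window_size_right=*/-1, scale,
      /*return_lse=*/false, /*kv_cache_quant_bit_size=*/0);
}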
53 changes: 53 additions & 0 deletions xllm/core/kernels/mlu/fused_layernorm.cpp
@@ -0,0 +1,53 @@
/* Copyright 2025 The xLLM Authors. All Rights Reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

https://github.com/jd-opensource/xllm/blob/main/LICENSE

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/

#include "mlu_ops_api.h"
#include "torch_mlu_ops.h"

namespace xllm::kernel::mlu {

void fused_layernorm(const torch::Tensor& input,
torch::Tensor& output,
const std::optional<torch::Tensor>& residual,
const torch::Tensor& weight,
const std::optional<torch::Tensor>& beta,
const std::optional<torch::Tensor>& bias,
const std::optional<torch::Tensor>& quant_scale,
const std::optional<torch::Tensor>& residual_out,
const std::optional<torch::Tensor>& smooth_quant_scale,
const std::optional<torch::Tensor>& normed_out,
const std::string& mode,
double eps,
bool store_output_before_norm,
bool store_output_after_norm,
bool dynamic_quant) {
tmo::torch_api::fused_layernorm(input,
output,
residual,
weight,
beta,
bias,
quant_scale,
residual_out,
smooth_quant_scale,
normed_out,
mode,
eps,
store_output_before_norm,
store_output_after_norm,
dynamic_quant);
}

} // namespace xllm::kernel::mlu
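
A minimal RMSNorm-style call sketch, assuming "rmsnorm" is an accepted mode string and that with no residual or quantization tensors the store_/quant flags are inert (assumptions, not verified against tmo):

#include "mlu_ops_api.h"

// Hypothetical call: output = rmsnorm(input, eps) * weight.
void rmsnorm_example(const torch::Tensor& input,
                     torch::Tensor& output,
                     const torch::Tensor& weight) {
  xllm::kernel::mlu::fused_layernorm(input,
                                     output,
                                     /*residual=*/std::nullopt,
                                     weight,
                                     /*beta=*/std::nullopt,
                                     /*bias=*/std::nullopt,
                                     /*quant_scale=*/std::nullopt,
                                     /*residual_out=*/std::nullopt,
                                     /*smooth_quant_scale=*/std::nullopt,
                                     /*normed_out=*/std::nullopt,
                                     /*mode=*/"rmsnorm",
                                     /*eps=*/1e-6,
                                     /*store_output_before_norm=*/false,
                                     /*store_output_after_norm=*/false,
                                     /*dynamic_quant=*/false);
}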
6 changes: 3 additions & 3 deletions xllm/core/kernels/mlu/fused_moe.cpp
@@ -13,8 +13,8 @@ See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/

#include "mlu_ops_api.h"
#include "torch_mlu_ops.h"
#include "torch_ops_api.h"

namespace {
torch::Tensor create_group_gemm_output(const torch::Tensor& a,
@@ -27,7 +27,7 @@ torch::Tensor create_group_gemm_output(const torch::Tensor& a,
}
} // namespace

namespace xllm::mlu {
namespace xllm::kernel::mlu {
torch::Tensor fused_moe(
const torch::Tensor& hidden_states,
const torch::Tensor& gating_output,
@@ -175,4 +175,4 @@ torch::Tensor fused_moe(
return output.reshape(ori_input_shape);
}

} // namespace xllm::mlu
} // namespace xllm::kernel::mlu
18 changes: 9 additions & 9 deletions xllm/core/kernels/mlu/matmul.cpp
@@ -13,17 +13,17 @@ See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/

#include "mlu_ops_api.h"
#include "torch_mlu_ops.h"
#include "torch_ops_api.h"

namespace xllm::mlu {
namespace xllm::kernel::mlu {

at::Tensor matmul(const at::Tensor& a,
const at::Tensor& b,
const std::optional<at::Tensor>& bias,
const std::optional<at::Tensor>& c,
double alpha,
double beta) {
torch::Tensor matmul(const torch::Tensor& a,
const torch::Tensor& b,
const std::optional<torch::Tensor>& bias,
const std::optional<torch::Tensor>& c,
double alpha,
double beta) {
return tmo::torch_api::matmul(a,
b,
bias,
@@ -43,4 +43,4 @@ at::Tensor matmul(const at::Tensor& a,
true);
}

} // namespace xllm::mlu
} // namespace xllm::kernel::mlu
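
With the signature now on torch::Tensor, a plain projection call reduces to the sketch below; the GEMM-style semantics (alpha scales a @ b, beta scales the optional c accumulator) are assumed from the parameter names, not stated in the diff:

#include "mlu_ops_api.h"

// Hypothetical projection: y = x @ w, no bias, no accumulation.
torch::Tensor project_example(const torch::Tensor& x, const torch::Tensor& w) {
  return xllm::kernel::mlu::matmul(x, w,
                                   /*bias=*/std::nullopt,
                                   /*c=*/std::nullopt,
                                   /*alpha=*/1.0,
                                   /*beta=*/0.0);
}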