65 changes: 65 additions & 0 deletions .github/policies/updateStaleIssues.yml
@@ -0,0 +1,65 @@
name: Update Stale Issues
description: Update stale issues
resource: repository
configuration:
  resourceManagementConfiguration:
    scheduledSearches:
      - description: Apply stale label to open, unassigned issues that have not been updated in the last 30 days
        frequencies:
          - daily:
              time: 15:00
        filters:
          - isIssue
          - isOpen
          - isNotAssigned
          - isNotLabeledWith:
              label: contributions welcome
          - isNotLabeledWith:
              label: documentation
          - isNotLabeledWith:
              label: feature request
          - isNotLabeledWith:
              label: regression
          - noActivitySince:
              days: 30
        actions:
          - addReply:
              reply: "Applying stale label due to no activity in 30 days"
          - addLabel:
              label: stale
      - description: Close open, unassigned issues labeled stale that have not been updated in the last 30 days
        frequencies:
          - daily:
              time: 15:00
        filters:
          - hasLabel:
              label: stale
          - isIssue
          - isOpen
          - isNotAssigned
          - noActivitySince:
              days: 30
        actions:
          - addReply:
              reply: "Closing issue due to no activity in 30 days"
          - closeIssue
    eventResponderTasks:
      - description: Remove stale label if open stale issue is commented on
        if:
          - payloadType: Issue_Comment
          - hasLabel:
              label: stale
        then:
          - removeLabel:
              label: stale
      - description: Re-open stale issue if closed stale issue is commented on
        if:
          - payloadType: Issue_Comment
          - and:
              - not:
                  isOpen
              - hasLabel:
                  label: stale
        then:
          - reopenIssue

1 change: 1 addition & 0 deletions docs/OperatorKernels.md
@@ -828,6 +828,7 @@ Do not modify directly.*
|||10|**T** = tensor(double), tensor(float), tensor(float16), tensor(int32), tensor(uint8)|
|ReverseSequence|*in* input:**T**<br> *in* sequence_lens:**tensor(int64)**<br> *out* Y:**T**|10+|**T** = tensor(bfloat16), tensor(bool), tensor(double), tensor(float), tensor(float16), tensor(int16), tensor(int32), tensor(int64), tensor(int8), tensor(uint16), tensor(uint32), tensor(uint64), tensor(uint8)|
|RoiAlign|*in* X:**T1**<br> *in* rois:**T1**<br> *in* batch_indices:**T2**<br> *out* Y:**T1**|10+|**T1** = tensor(double), tensor(float)<br/> **T2** = tensor(int64)|
|RotaryEmbedding|*in* X:**T**<br> *in* cos_cache:**T**<br> *in* sin_cache:**T**<br> *in* position_ids:**M**<br> *out* Y:**T**|23+|**M** = tensor(int64)<br/> **T** = tensor(bfloat16), tensor(float), tensor(float16)|
|Round|*in* X:**T**<br> *out* Y:**T**|11+|**T** = tensor(double), tensor(float), tensor(float16)|
|ScaledTanh|*in* input:**T**<br> *out* output:**T**|1+|**T** = tensor(double), tensor(float), tensor(float16)|
|Scan|*in* initial_state_and_scan_inputs:**V**<br> *out* final_state_and_scan_outputs:**V**<br><br>or<br><br>*in* sequence_lens:**I**<br> *in* initial_state_and_scan_inputs:**V**<br> *out* final_state_and_scan_outputs:**V**|19+|**V** = tensor(bfloat16), tensor(bool), tensor(double), tensor(float), tensor(float16), tensor(float8e4m3fn), tensor(float8e4m3fnuz), tensor(float8e5m2), tensor(float8e5m2fnuz), tensor(int16), tensor(int32), tensor(int64), tensor(int8), tensor(uint16), tensor(uint32), tensor(uint64), tensor(uint8)|
4 changes: 4 additions & 0 deletions include/onnxruntime/core/framework/execution_provider.h
@@ -79,6 +79,10 @@ class IExecutionProvider {
: default_device_(device), type_{type} {
}

IExecutionProvider(const std::string& type, OrtDevice device, const logging::Logger& logger)
: default_device_(device), type_{type}, logger_{&logger} {
}

/*
default device for this ExecutionProvider
*/
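
The new overload lets a provider capture the session logger at construction time. A minimal sketch of a derived provider using it; the provider class and name are hypothetical, the default-constructed OrtDevice is assumed to mean CPU, and the usual GetCapability/Compile overrides are omitted:

```cpp
#include "core/framework/execution_provider.h"

namespace onnxruntime {

// Hypothetical provider, shown only to illustrate the logger-aware constructor.
class MyExecutionProvider : public IExecutionProvider {
 public:
  explicit MyExecutionProvider(const logging::Logger& session_logger)
      : IExecutionProvider("MyExecutionProvider", OrtDevice(), session_logger) {}
};

}  // namespace onnxruntime
```
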
12 changes: 12 additions & 0 deletions include/onnxruntime/core/session/onnxruntime_c_api.h
@@ -6074,6 +6074,18 @@ struct OrtApi {
* \since Version 1.23
*/
ORT_API2_STATUS(GetTensorData, _In_ const OrtValue* value, _Outptr_ const void** out);

/** \brief Get Session configuration entries.
*
* \param[in] options The session options.
* \param[out] out A pointer to a newly created OrtKeyValuePairs instance.
*
* An OrtKeyValuePairs instance containing all session configuration entries.
* Note: the user should call OrtApi::ReleaseKeyValuePairs.
*
* \since Version 1.23.
*/
ORT_API2_STATUS(GetSessionOptionsConfigEntries, _In_ const OrtSessionOptions* options, _Outptr_ OrtKeyValuePairs** out);
};

/*
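
A hedged usage sketch for the new entry point, assuming the standard OrtApi bootstrap; status returns are ignored for brevity, the config key shown is one of the documented session option keys, and the exact API for enumerating the returned OrtKeyValuePairs may differ:

```cpp
#include <onnxruntime_c_api.h>

int main() {
  const OrtApi* api = OrtGetApiBase()->GetApi(ORT_API_VERSION);

  OrtSessionOptions* options = nullptr;
  api->CreateSessionOptions(&options);
  api->AddSessionConfigEntry(options, "session.intra_op.allow_spinning", "0");

  // New in this change: snapshot all config entries into an OrtKeyValuePairs.
  OrtKeyValuePairs* entries = nullptr;
  api->GetSessionOptionsConfigEntries(options, &entries);

  // ... inspect the key/value pairs here ...

  api->ReleaseKeyValuePairs(entries);  // caller owns the returned instance
  api->ReleaseSessionOptions(options);
  return 0;
}
```
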
2 changes: 2 additions & 0 deletions onnxruntime/contrib_ops/cpu/bert/group_query_attention.cc
@@ -54,6 +54,7 @@ Status GroupQueryAttention<T>::Compute(OpKernelContext* context) const {
const Tensor* sin_cache = context->Input<Tensor>(8);
const Tensor* position_ids = context->Input<Tensor>(9);
const Tensor* attention_bias = context->Input<Tensor>(10);
const Tensor* head_sink = context->Input<Tensor>(11);

GroupQueryAttentionParameters parameters = {};
ORT_RETURN_IF_ERROR(group_query_attention_helper::CheckInputs(query,
@@ -73,6 +74,7 @@ Status GroupQueryAttention<T>::Compute(OpKernelContext* context) const {

ORT_RETURN_IF_ERROR(group_query_attention_helper::CheckCustomAttentionInputs(position_ids,
attention_bias,
head_sink,
parameters));

const int batch_size = parameters.batch_size;
18 changes: 18 additions & 0 deletions onnxruntime/contrib_ops/cpu/bert/group_query_attention_helper.h
@@ -340,6 +340,7 @@ Status CheckInputs(const T* query,
template <typename T = Tensor>
Status CheckCustomAttentionInputs(const T* position_ids,
const T* attention_bias,
const T* head_sink,
const GroupQueryAttentionParameters& parameters) {
if (position_ids != nullptr) {
const auto& pos_ids_shape = position_ids->Shape();
@@ -377,6 +378,23 @@ Status CheckCustomAttentionInputs(const T* position_ids,
}
}

if (head_sink != nullptr) {
if (parameters.use_smooth_softmax) {
return ORT_MAKE_STATUS(ONNXRUNTIME, INVALID_ARGUMENT,
"head_sink should not be provided when use_smooth_softmax is true.");
}

const auto& head_sink_shape = head_sink->Shape();
if (head_sink_shape.NumDimensions() != 1) {
return ORT_MAKE_STATUS(ONNXRUNTIME, INVALID_ARGUMENT, "head_sink must be a 1D tensor");
}

if (head_sink_shape[0] != parameters.num_heads) {
return ORT_MAKE_STATUS(ONNXRUNTIME, INVALID_ARGUMENT,
"head_sink dimension 0 must be equal to the num heads, got ", head_sink_shape[0]);
}
}

return Status::OK();
}

29 changes: 13 additions & 16 deletions onnxruntime/contrib_ops/cuda/moe/ft_moe/moe_kernel.cu
@@ -18,7 +18,7 @@

#include <algorithm>
#include <cfloat>
#include <cuda.h>
#include <cuda.h> // for CUDA_VERSION

Check notice (GitHub Actions / cpplint) on line 21 in onnxruntime/contrib_ops/cuda/moe/ft_moe/moe_kernel.cu: Found C system header after C++ system header. Should be: moe_kernel.h, c system, c++ system, other. [build/include_order] [4]
#include <cuda_fp16.h>
#include <math.h>
#include <sstream>
@@ -38,19 +38,12 @@

#include "moe_kernel.h"

#if CUDA_VERSION >= 11000
#include <cub/cub.cuh>
#include <cub/device/device_radix_sort.cuh>
#include <cub/util_type.cuh>
#else
#include "cub/cub.cuh"
#include "cub/device/device_radix_sort.cuh"
#include "cub/util_type.cuh"
#endif

namespace ort_fastertransformer {
static constexpr int WARP_SIZE = 32;

// ====================== Softmax things ===============================
// We have our own implementation of softmax here so we can support transposing the output
// in the softmax kernel when we extend this module to support expert-choice routing.
@@ -65,13 +58,6 @@

const int thread_row_offset = blockIdx.x * num_cols;

#if CUDA_VERSION >= 12090
::cuda::std::plus sum;
#else
// Deprecated on CUDA 12.9
cub::Sum sum;
#endif

float threadData(-FLT_MAX);

// Don't touch finished rows.
@@ -84,7 +70,12 @@
threadData = max(static_cast<float>(input[idx]), threadData);
}

#if defined(CUDA_VERSION) && CUDA_VERSION >= 12090
const float maxElem = BlockReduce(tmpStorage).Reduce(threadData, ::cuda::maximum());
#else
const float maxElem = BlockReduce(tmpStorage).Reduce(threadData, cub::Max());
#endif

if (threadIdx.x == 0) {
float_max = maxElem;
}
Expand All @@ -97,7 +88,12 @@
threadData += exp((static_cast<float>(input[idx]) - float_max));
}

const auto Z = BlockReduce(tmpStorage).Reduce(threadData, sum);
#if defined(CUDA_VERSION) && CUDA_VERSION >= 12090
const auto Z = BlockReduce(tmpStorage).Reduce(threadData, ::cuda::std::plus());
#else
// Deprecated on CUDA 12.9
const auto Z = BlockReduce(tmpStorage).Reduce(threadData, cub::Sum());
#endif

if (threadIdx.x == 0) {
normalizing_factor = 1.f / Z;
@@ -993,6 +989,7 @@
if (experts_start_index > 0) {
total_past_rows = total_rows_before_expert_host_[experts_start_index - 1];
}

total_covered_rows = total_rows_before_expert_host_[experts_end_index] - total_past_rows;
}

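
A standalone CUDA C++ sketch of the version guard used above, for readers targeting both older and newer toolkits. It is illustrative only: it assumes CUB ships with the CUDA Toolkit (11.0+), that cub::Sum/cub::Max are deprecated starting with CUDA 12.9 as noted in the diff, and that ::cuda::maximum and ::cuda::std::plus live in the libcu++ headers shown:

```cpp
#include <cuda.h>  // CUDA_VERSION
#include <cub/cub.cuh>
#if defined(CUDA_VERSION) && CUDA_VERSION >= 12090
#include <cuda/functional>      // ::cuda::maximum (header location assumed from libcu++)
#include <cuda/std/functional>  // ::cuda::std::plus
#endif

template <int TPB>
__global__ void block_max_and_sum(const float* in, float* out_max, float* out_sum) {
  using BlockReduce = cub::BlockReduce<float, TPB>;
  __shared__ typename BlockReduce::TempStorage tmp;
  const float v = in[blockIdx.x * TPB + threadIdx.x];
#if defined(CUDA_VERSION) && CUDA_VERSION >= 12090
  const float m = BlockReduce(tmp).Reduce(v, ::cuda::maximum());
  __syncthreads();  // temp storage is reused for the second reduction
  const float s = BlockReduce(tmp).Reduce(v, ::cuda::std::plus());
#else
  const float m = BlockReduce(tmp).Reduce(v, cub::Max());  // deprecated on CUDA 12.9
  __syncthreads();
  const float s = BlockReduce(tmp).Reduce(v, cub::Sum());
#endif
  if (threadIdx.x == 0) {  // only thread 0 holds the block-wide result
    out_max[blockIdx.x] = m;
    out_sum[blockIdx.x] = s;
  }
}
```
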
62 changes: 43 additions & 19 deletions onnxruntime/contrib_ops/webgpu/bert/attention.cc
@@ -69,8 +69,8 @@ Status TransferBSDToBNSH(onnxruntime::webgpu::ComputeContext& context, int num_h
return context.RunProgram(program);
};

void InitVarStub(std::ostringstream& ss, const Tensor* seqlen_k) {
if (seqlen_k != nullptr) {
void InitVarStub(std::ostringstream& ss, bool has_seqlen_k) {
if (has_seqlen_k) {
ss << "total_sequence_length = u32(seqlen_k[batch_idx]) + 1;\n";
ss << "var past_sequence_length: u32 = select(total_sequence_length - sequence_length, 0u, uniforms.is_first_prompt > 0);\n";
} else {
@@ -87,7 +87,7 @@ Status AttentionProbsProgram::GenerateShaderCode(ShaderHelper& shader) const {
if (has_attention_bias_) {
shader.AddInput("attention_bias", ShaderUsage::UseUniform);
}
if (seqlen_k_ != nullptr) {
if (has_seqlen_k_) {
shader.AddInput("seqlen_k", ShaderUsage::UseUniform);
}
shader.AddOutput("output", ShaderUsage::UseUniform | ShaderUsage::UseValueTypeAlias);
@@ -107,7 +107,7 @@ Status AttentionProbsProgram::GenerateShaderCode(ShaderHelper& shader) const {
<< "let sequence_length = uniforms.M;\n"
<< "var total_sequence_length = uniforms.N;\n";
std::ostringstream oss;
InitVarStub(oss, seqlen_k_);
InitVarStub(oss, has_seqlen_k_);
shader.MainFunctionBody() << oss.str();
shader.MainFunctionBody() << "let kOffset = (batch_head_idx / uniforms.n_reps) * uniforms.kv_sequence_length * uniforms.K;\n";
if (has_present_key_) {
@@ -182,7 +182,7 @@ Status ComputeAttentionProbs(onnxruntime::webgpu::ComputeContext& context, int o
const int components = parameters.head_size_ % 4 == 0 ? 4 : (parameters.head_size_ % 2 == 0 ? 2 : 1);

AttentionProbsProgram program{"AttentionProbs", feed_past_key, has_present_key, has_attention_bias, tile_size,
components, parameters.is_first_prompt_, seqlen_k, parameters.past_present_share_buffer_};
components, parameters.is_first_prompt_, seqlen_k != nullptr, parameters.past_present_share_buffer_};
program.AddInputs({{Q, ProgramTensorMetadataDependency::TypeAndRank, components},
{K, ProgramTensorMetadataDependency::TypeAndRank, components}});
if (feed_past_key) {
@@ -224,30 +224,44 @@
}

Status InPlaceSoftmaxProgram::GenerateShaderCode(ShaderHelper& shader) const {
if (seqlen_k_) {
if (has_seqlen_k_) {
shader.AddInput("seqlen_k", ShaderUsage::UseUniform);
}
if (has_head_sink_) {
shader.AddInput("head_sink", ShaderUsage::UseUniform);
}
shader.AddOutput("x", ShaderUsage::UseUniform | ShaderUsage::UseValueTypeAlias | ShaderUsage::UseElementTypeAlias);
shader.AdditionalImplementation() << "var<workgroup> thread_max: array<f32, " << work_group_size_ << ">;\n"
<< "var<workgroup> thread_sum: array<f32, " << work_group_size_ << ">;\n"
<< "alias f32_val_t = " << (components_ == 4 ? "vec4<f32>" : (components_ == 2 ? "vec2<f32>" : "f32")) << ";\n";
shader.MainFunctionBody() << "let sequence_length = uniforms.sequence_length;\n"
<< "let batch_idx = u32(workgroup_idx / sequence_length) / uniforms.num_heads;\n"
<< "let head_idx = u32(workgroup_idx / sequence_length) % uniforms.num_heads;\n"
<< "var total_sequence_length = uniforms.total_sequence_length_comp * " << components_ << ";\n";
std::ostringstream oss;
InitVarStub(oss, seqlen_k_);
InitVarStub(oss, has_seqlen_k_);
shader.MainFunctionBody() << oss.str()
<< "let local_offset = local_idx * uniforms.elements_per_thread;\n"
<< "let offset = workgroup_idx * uniforms.total_sequence_length_comp + local_offset;\n"
<< "let seq_causal_length = " << (seqlen_k_ ? "past_sequence_length + workgroup_idx % sequence_length + 1" : "uniforms.total_sequence_length_comp") << ";\n"
<< "let seq_causal_length = " << (has_seqlen_k_ ? "past_sequence_length + workgroup_idx % sequence_length + 1" : "uniforms.total_sequence_length_comp") << ";\n"
<< "var thread_max_vector = f32_val_t(-3.402823e+38f);\n"
<< "for (var i: u32 = 0; i < uniforms.elements_per_thread && i + local_offset < seq_causal_length; i++) {\n"
<< " thread_max_vector = max(f32_val_t(x[offset + i]), thread_max_vector);\n"
<< "}\n"
<< "thread_max[local_idx] = " << (components_ == 4 ? "max(max(thread_max_vector.x, thread_max_vector.y), max(thread_max_vector.z, thread_max_vector.w))" : (components_ == 2 ? "max(thread_max_vector.x, thread_max_vector.y)" : "thread_max_vector")) << ";\n"
<< "workgroupBarrier();\n"
<< "var max_value = f32(-3.402823e+38f);\n"
<< "for (var i = 0u; i < " << work_group_size_ << "; i++) {\n"
<< "workgroupBarrier();\n";

if (has_head_sink_) {
// Handle head sink
shader.MainFunctionBody() << "let sink_value: f32 = head_sink[head_idx];\n"
<< "var max_value = sink_value;\n";
} else if (use_smooth_softmax_) {
shader.MainFunctionBody() << "var max_value: f32 = 0.0;\n";
} else {
shader.MainFunctionBody() << "var max_value = f32(-3.402823e+38f);\n";
}

shader.MainFunctionBody() << "for (var i = 0u; i < " << work_group_size_ << "; i++) {\n"
<< " max_value = max(thread_max[i], max_value);\n"
<< "}\n"
<< "var sum_vector = f32_val_t(0);\n"
@@ -259,8 +273,15 @@ Status InPlaceSoftmaxProgram::GenerateShaderCode(ShaderHelper& shader) const {
<< "var sum: f32 = 0;\n"
<< "for (var i = 0u; i < " << work_group_size_ << "; i++) {\n"
<< " sum += thread_sum[i]\n;"
<< "}\n"
<< "if (sum == 0) {\n"
<< "}\n";

if (has_head_sink_) {
shader.MainFunctionBody() << "sum += exp(sink_value - max_value);\n";
} else if (use_smooth_softmax_) {
shader.MainFunctionBody() << "sum += exp(-max_value);\n";
}

shader.MainFunctionBody() << "if (sum == 0) {\n"
<< " for (var i: u32 = 0; i < uniforms.elements_per_thread && i + local_offset < seq_causal_length; i++) {\n"
<< " x[offset + i] = x_value_t(x_element_t(1.0)/x_element_t(seq_causal_length));\n"
<< " }\n"
@@ -270,7 +291,7 @@ Status InPlaceSoftmaxProgram::GenerateShaderCode(ShaderHelper& shader) const {
<< " x[offset + i] = x_value_t(exp(f32input - max_value) / sum);\n"
<< " }\n"
<< "}\n";
if (seqlen_k_) {
if (has_seqlen_k_) {
shader.MainFunctionBody() << "for (var total_seq_id: u32 = seq_causal_length; total_seq_id + local_offset < uniforms.total_sequence_length_comp; total_seq_id++) {\n"
<< " x[offset + total_seq_id] = x_value_t(x_element_t(0));\n"
<< "}\n";
@@ -280,7 +301,7 @@ }
}

Status ComputeInPlaceSoftmax(onnxruntime::webgpu::ComputeContext& context, Tensor* probs, int32_t batch_size, int32_t num_heads, int32_t past_sequence_length, int32_t sequence_length, int32_t total_sequence_length,
const Tensor* seqlen_k, bool is_first_prompt) {
const Tensor* seqlen_k, bool is_first_prompt, bool use_smooth_softmax, const Tensor* head_sink) {
const int components = seqlen_k != nullptr ? 1 : (total_sequence_length % 4 == 0 ? 4 : (total_sequence_length % 2 == 0 ? 2 : 1));
int work_group_size = 64;
const int total_sequence_length_comp = (total_sequence_length + components - 1) / components;
@@ -289,12 +310,15 @@ Status ComputeInPlaceSoftmax(onnxruntime::webgpu::ComputeContext& context, Tenso
}
const int elementsPerThread = (total_sequence_length_comp + work_group_size - 1) / work_group_size;

InPlaceSoftmaxProgram program{"InPlaceSoftmax", work_group_size, components, seqlen_k};
InPlaceSoftmaxProgram program{work_group_size, components, use_smooth_softmax, seqlen_k != nullptr, head_sink != nullptr};
if (seqlen_k != nullptr) {
program.AddInput({seqlen_k, ProgramTensorMetadataDependency::TypeAndRank});
}
if (head_sink != nullptr) {
program.AddInput({head_sink, ProgramTensorMetadataDependency::Type});
}
program.AddOutputs({{probs, ProgramTensorMetadataDependency::TypeAndRank, components}})
.CacheHint(work_group_size)
.CacheHint(work_group_size, use_smooth_softmax)
.SetDispatchGroupSize(batch_size * num_heads * sequence_length)
.SetWorkgroupSize(work_group_size)
.AddUniformVariables({{static_cast<uint32_t>(batch_size)},
@@ -443,7 +467,7 @@ Status ComputeVxAttentionScore(onnxruntime::webgpu::ComputeContext& context, int

Status ApplyAttention(const Tensor* Q, const Tensor* K, const Tensor* V, const Tensor* attention_bias,
const Tensor* past_key, const Tensor* past_value, Tensor* output, Tensor* present_key, Tensor* present_value,
WebgpuAttentionParameters& parameters, onnxruntime::webgpu::ComputeContext& context, const Tensor* seqlen_k) {
WebgpuAttentionParameters& parameters, onnxruntime::webgpu::ComputeContext& context, const Tensor* head_sink, const Tensor* seqlen_k) {
const int output_count = std::min({context.OutputCount(), 1 + (past_key != nullptr ? 1 : 0) + (past_value != nullptr ? 1 : 0)});
const int past_sequence_length = output_count > 1 ? parameters.past_sequence_length_ : 0;
const int total_sequence_length =
Expand All @@ -457,7 +481,7 @@ Status ApplyAttention(const Tensor* Q, const Tensor* K, const Tensor* V, const T
parameters, past_sequence_length, total_sequence_length, seqlen_k));

ORT_RETURN_IF_ERROR(ComputeInPlaceSoftmax(context, &probs,
parameters.batch_size_, parameters.num_heads_, parameters.past_sequence_length_, parameters.sequence_length_, total_sequence_length, seqlen_k, parameters.is_first_prompt_));
parameters.batch_size_, parameters.num_heads_, parameters.past_sequence_length_, parameters.sequence_length_, total_sequence_length, seqlen_k, parameters.is_first_prompt_, parameters.use_smooth_softmax_, head_sink));

ORT_RETURN_IF_ERROR(ComputeVxAttentionScore(context, output_count, &probs, V, past_value, output, present_value,
parameters, past_sequence_length, total_sequence_length, seqlen_k));
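
For reference, the denominators the rewritten softmax shader selects between can be written as follows (notation mine, not from the source): x_i are the attention logits for one row, m is the running maximum used for numerical stability, and s_h is the per-head sink value read from head_sink[head_idx].

```latex
\begin{aligned}
\text{standard:}       \quad & p_i = \frac{e^{x_i - m}}{\sum_j e^{x_j - m}},               & m &= \max_j x_j \\
\text{smooth softmax:} \quad & p_i = \frac{e^{x_i - m}}{e^{-m} + \sum_j e^{x_j - m}},      & m &= \max\!\big(0,\ \max_j x_j\big) \\
\text{head sink:}      \quad & p_i = \frac{e^{x_i - m}}{e^{s_h - m} + \sum_j e^{x_j - m}}, & m &= \max\!\big(s_h,\ \max_j x_j\big)
\end{aligned}
```

The extra e^{-m} and e^{s_h - m} terms correspond to the `sum += exp(-max_value)` and `sum += exp(sink_value - max_value)` lines in the generated WGSL: they behave like one additional logit of 0 (smooth softmax) or s_h (head sink) that absorbs probability mass without contributing to the output.
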