From a2e46f4a20e975d29a960712f6388c8671622140 Mon Sep 17 00:00:00 2001 From: Guenther Schmuelling Date: Fri, 10 Oct 2025 00:25:09 -0700 Subject: [PATCH 01/19] fix gather_nd on webgpu ep (#26270) fixes gather_nd on webgpu ep (found by transformers.js for the vision encoder of docling) --- onnxruntime/core/providers/webgpu/tensor/gather_nd.cc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/onnxruntime/core/providers/webgpu/tensor/gather_nd.cc b/onnxruntime/core/providers/webgpu/tensor/gather_nd.cc index 7c3aced3f0295..cab1dc03848b9 100644 --- a/onnxruntime/core/providers/webgpu/tensor/gather_nd.cc +++ b/onnxruntime/core/providers/webgpu/tensor/gather_nd.cc @@ -43,7 +43,7 @@ Status GatherNDProgram::GenerateShaderCode(ShaderHelper& shader) const { data_dim += indices_innerest_dim_; for (uint32_t i = 0; i < static_cast(data.Rank() - data_dim); i++) { - shader.MainFunctionBody() << " " << data.IndicesSet("data_indices", data_dim, output.IndicesGet("output_indices", indices.Rank() - 1 + i)) << "\n"; + shader.MainFunctionBody() << " " << data.IndicesSet("data_indices", data_dim + i, output.IndicesGet("output_indices", indices.Rank() - 1 + i)) << "\n"; } shader.MainFunctionBody() << " " << output.SetByOffset("global_idx", data.GetByIndices("data_indices")); From 8d4c5baf52f0cd974c470da8eb3a7cf64e072cb8 Mon Sep 17 00:00:00 2001 From: qti-hungjuiw Date: Fri, 10 Oct 2025 23:28:51 +0800 Subject: [PATCH 02/19] Make local mirror of cmake dependencies configurable (#26042) ### Description - Added support for the `--cmake_deps_mirror_dir` option to allow users to specify a custom local directory for CMake dependencies. - Improved logging to show the source of `FetchContent` in CMake. ### Motivation and Context - Previously, ONNX Runtime searched for CMake dependencies only in the default `/mirror` directory. - This change enables users to configure an alternative location for storing CMake dependencies, offering greater flexibility in build environments. --- cmake/CMakeLists.txt | 7 +++++-- cmake/external/helper_functions.cmake | 6 +++--- cmake/external/onnxruntime_external_deps.cmake | 2 +- tools/ci_build/build.py | 3 +++ tools/ci_build/build_args.py | 1 + 5 files changed, 13 insertions(+), 6 deletions(-) diff --git a/cmake/CMakeLists.txt b/cmake/CMakeLists.txt index 116d369885a27..8186da507a442 100644 --- a/cmake/CMakeLists.txt +++ b/cmake/CMakeLists.txt @@ -287,9 +287,13 @@ if (onnxruntime_ENABLE_TRAINING_APIS) endif() -# Single output director for all binaries +# Single output directory for all binaries set(RUNTIME_OUTPUT_DIRECTORY ${CMAKE_BINARY_DIR}/bin CACHE PATH "Single output directory for all binaries.") +# Local mirror directory of cmake dependencies +set(REPO_ROOT ${PROJECT_SOURCE_DIR}/..) +set(onnxruntime_CMAKE_DEPS_MIRROR_DIR ${REPO_ROOT}/mirror CACHE PATH "Path to the local mirror of cmake dependencies") + include(FetchContent) @@ -425,7 +429,6 @@ if (onnxruntime_EXTENDED_MINIMAL_BUILD AND NOT onnxruntime_MINIMAL_BUILD) set(onnxruntime_MINIMAL_BUILD ON) endif() -set(REPO_ROOT ${PROJECT_SOURCE_DIR}/..) set(ONNXRUNTIME_ROOT ${PROJECT_SOURCE_DIR}/../onnxruntime) set(ORTTRAINING_ROOT ${PROJECT_SOURCE_DIR}/../orttraining) set(ORTTRAINING_SOURCE_DIR ${ORTTRAINING_ROOT}/orttraining) diff --git a/cmake/external/helper_functions.cmake b/cmake/external/helper_functions.cmake index 55059b9500a8e..e8044411e4201 100644 --- a/cmake/external/helper_functions.cmake +++ b/cmake/external/helper_functions.cmake @@ -4,11 +4,11 @@ # 2. 
Set the cmake property COMPILE_WARNING_AS_ERROR to OFF for these external projects. function(onnxruntime_fetchcontent_declare contentName) + cmake_parse_arguments(PARSE_ARGV 1 ARG "" "URL;SOURCE_SUBDIR" "") + message(STATUS "Fetch ${contentName} from ${ARG_URL}") FetchContent_Declare(${ARGV}) string(TOLOWER ${contentName} contentNameLower) - list(FIND ARGN SOURCE_SUBDIR index_SOURCE_SUBDIR) - if(index_SOURCE_SUBDIR GREATER_EQUAL 0) - cmake_parse_arguments(PARSE_ARGV 1 ARG "" "SOURCE_SUBDIR" "") + if(NOT "${ARG_SOURCE_SUBDIR}" STREQUAL "") set(onnxruntime_${contentNameLower}_cmake_src_dir "${ARG_SOURCE_SUBDIR}" PARENT_SCOPE) endif() endfunction() diff --git a/cmake/external/onnxruntime_external_deps.cmake b/cmake/external/onnxruntime_external_deps.cmake index 8e1a880579b34..f32350ca755ea 100644 --- a/cmake/external/onnxruntime_external_deps.cmake +++ b/cmake/external/onnxruntime_external_deps.cmake @@ -20,7 +20,7 @@ foreach(ONNXRUNTIME_DEP IN LISTS ONNXRUNTIME_DEPS_LIST) if(ONNXRUNTIME_DEP_URL MATCHES "^https://") # Search a local mirror folder - string(REGEX REPLACE "^https://" "${REPO_ROOT}/mirror/" LOCAL_URL "${ONNXRUNTIME_DEP_URL}") + string(REGEX REPLACE "^https://" "${onnxruntime_CMAKE_DEPS_MIRROR_DIR}/" LOCAL_URL "${ONNXRUNTIME_DEP_URL}") if(EXISTS "${LOCAL_URL}") cmake_path(ABSOLUTE_PATH LOCAL_URL) diff --git a/tools/ci_build/build.py b/tools/ci_build/build.py index 327caf83c7850..54dd23b07a363 100644 --- a/tools/ci_build/build.py +++ b/tools/ci_build/build.py @@ -1015,6 +1015,9 @@ def generate_build_tree( if path_to_protoc_exe: cmake_args += [f"-DONNX_CUSTOM_PROTOC_EXECUTABLE={path_to_protoc_exe}"] + if args.cmake_deps_mirror_dir: + cmake_args += [f"-Donnxruntime_CMAKE_DEPS_MIRROR_DIR={args.cmake_deps_mirror_dir}"] + if args.fuzz_testing: if not ( args.build_shared_lib diff --git a/tools/ci_build/build_args.py b/tools/ci_build/build_args.py index c5454903474d1..05d5052067b2e 100644 --- a/tools/ci_build/build_args.py +++ b/tools/ci_build/build_args.py @@ -204,6 +204,7 @@ def add_testing_args(parser: argparse.ArgumentParser) -> None: help="Run onnx_test_runner against test data. Only used in ONNX Runtime's CI pipelines", ) parser.add_argument("--path_to_protoc_exe", help="Path to protoc executable.") + parser.add_argument("--cmake_deps_mirror_dir", help="Path to the local mirror of cmake dependencies.") parser.add_argument("--fuzz_testing", action="store_true", help="Enable Fuzz testing.") parser.add_argument( "--enable_symbolic_shape_infer_tests", From ac7f4b7763289bfb82fc3f5165d82895d6adde42 Mon Sep 17 00:00:00 2001 From: Wanming Lin Date: Sat, 11 Oct 2025 02:37:00 +0800 Subject: [PATCH 03/19] [WebNN] Remove constraints for Gemm's C input (#26273) Now WebNN implementation for gemm's C operand has supported unidirectional broadcasting, which is align with ONNX spec. Removing constraints for Gemm's C input as which should be covered in ORT kernel. --- js/web/docs/webnn-operators.md | 2 +- .../webnn/builders/impl/gemm_op_builder.cc | 23 ------------------- 2 files changed, 1 insertion(+), 24 deletions(-) diff --git a/js/web/docs/webnn-operators.md b/js/web/docs/webnn-operators.md index 295aacc6fffa3..ea88f291e5597 100644 --- a/js/web/docs/webnn-operators.md +++ b/js/web/docs/webnn-operators.md @@ -46,7 +46,7 @@ platforms. 
Check the [WebNN status](https://webmachinelearning.github.io/webnn-s | GatherElements | ai.onnx(11-12, 13+) | gatherElements | | | GatherND | ai.onnx(11, 12, 13+) | gatherND | Only supports 'batch_dims' == 0 | | Gelu | ai.onnx(20+) | gelu | | -| Gemm | ai.onnx(7-8, 9-10, 11-12, 13+) | gemm | Only supports 1-D 'C' input | +| Gemm | ai.onnx(7-8, 9-10, 11-12, 13+) | gemm | | | GlobalAveragePool | ai.onnx(7+) | averagePool2d | Only supports 4-D input | | GlobalMaxPool | ai.onnx(7+) | maxPool2d | Only supports 4-D input | | GlobalLpPool| ai.onnx(7+) | l2Pool2d | Only supports 4-D input, 'p' value is 2 | diff --git a/onnxruntime/core/providers/webnn/builders/impl/gemm_op_builder.cc b/onnxruntime/core/providers/webnn/builders/impl/gemm_op_builder.cc index 0ea927967d989..5a80f01c17236 100644 --- a/onnxruntime/core/providers/webnn/builders/impl/gemm_op_builder.cc +++ b/onnxruntime/core/providers/webnn/builders/impl/gemm_op_builder.cc @@ -250,29 +250,6 @@ bool GemmOpBuilder::IsOpSupportedImpl(const GraphViewer&, std::vector c_shape; if (!GetShape(*input_defs[c_idx], c_shape, logger)) return false; - - size_t c_dim = c_shape.size(); - - if (c_dim > 1) { - // TODO: Supports other shape of C. - // Currently WebNN implementation in Chromium only supports 1-D C. - return false; - } - if (c_dim == 0) { - LOGS(logger, VERBOSE) << "C of Gemm is a scalar"; - } else { - auto c_size = c_shape[c_dim - 1]; - NodeAttrHelper helper(node); - const auto transB = helper.Get("transB", 0); - if (c_size != (transB == 0 ? b_shape[1] : b_shape[0])) { - LOGS(logger, VERBOSE) << "C of Gemm must be a vector of b_shape[" - << (transB == 0 ? "1" : "0") << "]" - << " b_shape: [" << b_shape[0] << ", " << b_shape[1] << "]" - << " c_size: " << c_size; - - return false; - } - } } } From 1442fe00d9169e89ec4a52bd699952810244ba6b Mon Sep 17 00:00:00 2001 From: Xiaofei Han Date: Sat, 11 Oct 2025 11:40:56 +0800 Subject: [PATCH 04/19] fix np.testing argument order (#26128) ### Description The argument order of np.testing was incorrect. ### Motivation and Context Before, the expected result and the actual result are reversed. 
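For reference, a minimal illustration of why the order matters (array values below are made up for the sketch): `np.testing.assert_allclose(actual, desired, ...)` expects the computed result first and the reference value second, and the relative tolerance is taken against `desired`, so swapping the two changes what the check and any failure message refer to.

```python
import numpy as np

actual = np.array([[100.00001]], dtype=np.float32)  # value produced by the code under test
desired = np.array([[100.0]], dtype=np.float32)     # reference/expected value

# Per the numpy docs, the check is |actual - desired| <= atol + rtol * |desired|,
# i.e. the relative tolerance is scaled by the reference value, so the computed
# result must be passed first and the expected value second.
np.testing.assert_allclose(actual, desired, rtol=1e-05, atol=1e-08)
```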
image --- .../python/tools/tensorrt/perf/benchmark.py | 2 +- .../test/python/onnx_backend_test_series.py | 8 ++-- .../test/python/onnxruntime_test_python.py | 40 +++++++++---------- .../python/onnxruntime_test_python_autoep.py | 16 ++++---- .../python/onnxruntime_test_python_backend.py | 2 +- .../onnxruntime_test_python_backend_mlops.py | 8 ++-- .../onnxruntime_test_python_cudagraph.py | 20 +++++----- .../onnxruntime_test_python_dmlgraph.py | 18 ++++----- .../python/onnxruntime_test_python_mlops.py | 14 +++---- ...me_test_python_nv_tensorrt_rtx_ep_tests.py | 6 +-- .../test/python/quantization/test_fusions.py | 2 +- .../quantization/test_qdq_loss_debug.py | 2 +- .../test_quantizeblockwise_bnb4.py | 4 +- .../custom_op_test_local_function.py | 2 +- 14 files changed, 72 insertions(+), 72 deletions(-) diff --git a/onnxruntime/python/tools/tensorrt/perf/benchmark.py b/onnxruntime/python/tools/tensorrt/perf/benchmark.py index d6b39a6b2aeb4..66ab0c44f8814 100644 --- a/onnxruntime/python/tools/tensorrt/perf/benchmark.py +++ b/onnxruntime/python/tools/tensorrt/perf/benchmark.py @@ -613,7 +613,7 @@ def validate(all_ref_outputs, all_outputs, rtol, atol, percent_mismatch): for ref_o, o in zip(ref_output, output, strict=False): # abs(desired-actual) < rtol * abs(desired) + atol try: - np.testing.assert_allclose(ref_o, o, rtol, atol) + np.testing.assert_allclose(o, ref_o, rtol, atol) except Exception as e: if percentage_in_allowed_threshold(e, percent_mismatch): continue diff --git a/onnxruntime/test/python/onnx_backend_test_series.py b/onnxruntime/test/python/onnx_backend_test_series.py index 72c6a5664f395..d2e9557f633b0 100644 --- a/onnxruntime/test/python/onnx_backend_test_series.py +++ b/onnxruntime/test/python/onnx_backend_test_series.py @@ -43,13 +43,13 @@ def assert_similar_outputs(cls, ref_outputs, outputs, rtol, atol, model_dir=None """ def assert_similar_array(ref_output, output): - np.testing.assert_equal(ref_output.dtype, output.dtype) + np.testing.assert_equal(output.dtype, ref_output.dtype) if ref_output.dtype == object: - np.testing.assert_array_equal(ref_output, output) + np.testing.assert_array_equal(output, ref_output) else: - np.testing.assert_allclose(ref_output, output, rtol=rtol, atol=atol) + np.testing.assert_allclose(output, ref_output, rtol=rtol, atol=atol) - np.testing.assert_equal(len(ref_outputs), len(outputs)) + np.testing.assert_equal(len(outputs), len(ref_outputs)) for i in range(len(outputs)): # pylint: disable=consider-using-enumerate if isinstance(outputs[i], list): for j in range(len(outputs[i])): diff --git a/onnxruntime/test/python/onnxruntime_test_python.py b/onnxruntime/test/python/onnxruntime_test_python.py index e44adcdb9827f..7f003453add89 100644 --- a/onnxruntime/test/python/onnxruntime_test_python.py +++ b/onnxruntime/test/python/onnxruntime_test_python.py @@ -54,7 +54,7 @@ def run_model(self, session_object, run_options): input_name = session_object.get_inputs()[0].name res = session_object.run([], {input_name: x}, run_options=run_options) output_expected = np.array([[1.0, 4.0], [9.0, 16.0], [25.0, 36.0]], dtype=np.float32) - np.testing.assert_allclose(output_expected, res[0], rtol=1e-05, atol=1e-08) + np.testing.assert_allclose(res[0], output_expected, rtol=1e-05, atol=1e-08) def run_model_with_input(self, session_object, input_name, input_value, iter_num, queue): for _ in range(iter_num): @@ -714,7 +714,7 @@ def test_run_model(self): res = sess.run([outputs[0].name], {inputs[0].name: x}) output_expected = np.array([[1.0, 4.0], [9.0, 16.0], [25.0, 36.0]], 
dtype=np.float32) - np.testing.assert_allclose(output_expected, res[0], rtol=1e-05, atol=1e-08) + np.testing.assert_allclose(res[0], output_expected, rtol=1e-05, atol=1e-08) def test_run_async(self): event = threading.Event() @@ -733,7 +733,7 @@ def callback(res: np.ndarray, data: MyData, err: str) -> None: self.assertEqual(len(err), 0) self.assertEqual(len(res), 1) self.assertEqual(data.get_id(), 123456) - np.testing.assert_allclose(output_expected, res[0], rtol=1e-05, atol=1e-08) + np.testing.assert_allclose(res[0], output_expected, rtol=1e-05, atol=1e-08) event.set() so = onnxrt.SessionOptions() @@ -762,7 +762,7 @@ def test_run_model_from_bytes(self): self.assertEqual(output_shape, [3, 2]) res = sess.run([output_name], {input_name: x}) output_expected = np.array([[1.0, 4.0], [9.0, 16.0], [25.0, 36.0]], dtype=np.float32) - np.testing.assert_allclose(output_expected, res[0], rtol=1e-05, atol=1e-08) + np.testing.assert_allclose(res[0], output_expected, rtol=1e-05, atol=1e-08) def test_run_model2(self): sess = onnxrt.InferenceSession(get_name("matmul_1.onnx"), providers=onnxrt.get_available_providers()) @@ -777,7 +777,7 @@ def test_run_model2(self): self.assertEqual(output_shape, [3, 1]) res = sess.run([output_name], {input_name: x}) output_expected = np.array([[5.0], [11.0], [17.0]], dtype=np.float32) - np.testing.assert_allclose(output_expected, res[0], rtol=1e-05, atol=1e-08) + np.testing.assert_allclose(res[0], output_expected, rtol=1e-05, atol=1e-08) def test_run_model2_contiguous(self): sess = onnxrt.InferenceSession(get_name("matmul_1.onnx"), providers=onnxrt.get_available_providers()) @@ -792,10 +792,10 @@ def test_run_model2_contiguous(self): self.assertEqual(output_shape, [3, 1]) res = sess.run([output_name], {input_name: x}) output_expected = np.array([[5.0], [11.0], [17.0]], dtype=np.float32) - np.testing.assert_allclose(output_expected, res[0], rtol=1e-05, atol=1e-08) + np.testing.assert_allclose(res[0], output_expected, rtol=1e-05, atol=1e-08) xcontiguous = np.ascontiguousarray(x) rescontiguous = sess.run([output_name], {input_name: xcontiguous}) - np.testing.assert_allclose(output_expected, rescontiguous[0], rtol=1e-05, atol=1e-08) + np.testing.assert_allclose(rescontiguous[0], output_expected, rtol=1e-05, atol=1e-08) def test_run_model_multiple_threads(self): # Skip this test for a "pure" DML onnxruntime python wheel. 
@@ -860,14 +860,14 @@ def test_list_as_input(self): input_name = sess.get_inputs()[0].name res = sess.run([], {input_name: x.tolist()}) output_expected = np.array([[1.0, 4.0], [9.0, 16.0], [25.0, 36.0]], dtype=np.float32) - np.testing.assert_allclose(output_expected, res[0], rtol=1e-05, atol=1e-08) + np.testing.assert_allclose(res[0], output_expected, rtol=1e-05, atol=1e-08) def test_string_list_as_input(self): sess = onnxrt.InferenceSession(get_name("identity_string.onnx"), providers=available_providers_without_tvm) x = np.array(["this", "is", "identity", "test"], dtype=str).reshape((2, 2)) x_name = sess.get_inputs()[0].name res = sess.run([], {x_name: x.tolist()}) - np.testing.assert_equal(x, res[0]) + np.testing.assert_equal(res[0], x) def test_run_device(self): device = onnxrt.get_device() @@ -888,7 +888,7 @@ def test_run_model_symbolic_input(self): self.assertEqual(output_shape, ["None", 1]) res = sess.run([output_name], {input_name: x}) output_expected = np.array([[5.0], [11.0], [17.0]], dtype=np.float32) - np.testing.assert_allclose(output_expected, res[0], rtol=1e-05, atol=1e-08) + np.testing.assert_allclose(res[0], output_expected, rtol=1e-05, atol=1e-08) def test_boolean_inputs(self): sess = onnxrt.InferenceSession(get_name("logicaland.onnx"), providers=available_providers) @@ -920,7 +920,7 @@ def test_boolean_inputs(self): output_expected = np.array([[True, False], [False, False]], dtype=bool) res = sess.run([output_name], {a_name: a, b_name: b}) - np.testing.assert_equal(output_expected, res[0]) + np.testing.assert_equal(res[0], output_expected) def test_string_input1(self): sess = onnxrt.InferenceSession(get_name("identity_string.onnx"), providers=available_providers_without_tvm) @@ -941,7 +941,7 @@ def test_string_input1(self): self.assertEqual(output_type, "tensor(string)") res = sess.run([output_name], {x_name: x}) - np.testing.assert_equal(x, res[0]) + np.testing.assert_equal(res[0], x) def test_string_input2(self): sess = onnxrt.InferenceSession(get_name("identity_string.onnx"), providers=available_providers_without_tvm) @@ -962,7 +962,7 @@ def test_string_input2(self): self.assertEqual(output_type, "tensor(string)") res = sess.run([output_name], {x_name: x}) - np.testing.assert_equal(x, res[0]) + np.testing.assert_equal(res[0], x) def test_input_bytes(self): sess = onnxrt.InferenceSession(get_name("identity_string.onnx"), providers=available_providers_without_tvm) @@ -983,7 +983,7 @@ def test_input_bytes(self): self.assertEqual(output_type, "tensor(string)") res = sess.run([output_name], {x_name: x}) - np.testing.assert_equal(x, res[0].astype("|S8")) + np.testing.assert_equal(res[0].astype("|S8"), x) def test_input_object(self): sess = onnxrt.InferenceSession(get_name("identity_string.onnx"), providers=available_providers_without_tvm) @@ -1004,7 +1004,7 @@ def test_input_object(self): self.assertEqual(output_type, "tensor(string)") res = sess.run([output_name], {x_name: x}) - np.testing.assert_equal(x, res[0]) + np.testing.assert_equal(res[0], x) def test_input_void(self): sess = onnxrt.InferenceSession(get_name("identity_string.onnx"), providers=available_providers_without_tvm) @@ -1029,7 +1029,7 @@ def test_input_void(self): res = sess.run([output_name], {x_name: x}) expr = np.array([["must", "have"], ["same", "size"]], dtype=object) - np.testing.assert_equal(expr, res[0]) + np.testing.assert_equal(res[0], expr) def test_raise_wrong_num_inputs(self): with self.assertRaises(ValueError) as context: @@ -1164,7 +1164,7 @@ def test_sequence_construct(self): }, ) - 
np.testing.assert_array_equal(output_expected, res[0]) + np.testing.assert_array_equal(res[0], output_expected) def test_sequence_insert(self): opt = onnxrt.SessionOptions() @@ -1194,7 +1194,7 @@ def test_sequence_insert(self): "input_seq": [], }, ) - np.testing.assert_array_equal(output_expected, res[0]) + np.testing.assert_array_equal(res[0], output_expected) def test_ort_execution_mode(self): opt = onnxrt.SessionOptions() @@ -1375,7 +1375,7 @@ def test_register_custom_ops_library(self): input_1 = np.zeros((3, 5)).astype(np.float32) res = sess1.run([output_name], {input_name_0: input_0, input_name_1: input_1}) output_expected = np.ones((3, 5)).astype(np.float32) - np.testing.assert_allclose(output_expected, res[0], rtol=1e-05, atol=1e-08) + np.testing.assert_allclose(res[0], output_expected, rtol=1e-05, atol=1e-08) # Create an alias of SessionOptions instance # We will use this alias to construct another InferenceSession @@ -1969,7 +1969,7 @@ def test_adater_export_read(self): self.assertTrue(value.is_tensor()) self.assertEqual(expected_val.element_type(), value.element_type()) self.assertEqual(expected_val.shape(), value.shape()) - np.testing.assert_allclose(expected_val.numpy(), value.numpy()) + np.testing.assert_allclose(value.numpy(), expected_val.numpy()) def test_run_with_adapter(self): model_path = get_name("lora/two_params_lora_model.onnx") diff --git a/onnxruntime/test/python/onnxruntime_test_python_autoep.py b/onnxruntime/test/python/onnxruntime_test_python_autoep.py index d66951bd66f3d..a24269a312e9b 100644 --- a/onnxruntime/test/python/onnxruntime_test_python_autoep.py +++ b/onnxruntime/test/python/onnxruntime_test_python_autoep.py @@ -66,7 +66,7 @@ def test_cuda_ep_register_and_inference(self): input_name = sess.get_inputs()[0].name res = sess.run([], {input_name: x}) output_expected = np.array([[1.0, 4.0], [9.0, 16.0], [25.0, 36.0]], dtype=np.float32) - np.testing.assert_allclose(output_expected, res[0], rtol=1e-05, atol=1e-08) + np.testing.assert_allclose(res[0], output_expected, rtol=1e-05, atol=1e-08) del sess # Delete session before unregistering library self.unregister_execution_provider_library(ep_name) @@ -98,7 +98,7 @@ def test_cuda_prefer_gpu_and_inference(self): input_name = sess.get_inputs()[0].name res = sess.run([], {input_name: x}) output_expected = np.array([[1.0, 4.0], [9.0, 16.0], [25.0, 36.0]], dtype=np.float32) - np.testing.assert_allclose(output_expected, res[0], rtol=1e-05, atol=1e-08) + np.testing.assert_allclose(res[0], output_expected, rtol=1e-05, atol=1e-08) del sess # Delete session before unregistering library self.unregister_execution_provider_library(ep_name) @@ -146,7 +146,7 @@ def my_delegate( input_name = sess.get_inputs()[0].name res = sess.run([], {input_name: x}) output_expected = np.array([[1.0, 4.0], [9.0, 16.0], [25.0, 36.0]], dtype=np.float32) - np.testing.assert_allclose(output_expected, res[0], rtol=1e-05, atol=1e-08) + np.testing.assert_allclose(res[0], output_expected, rtol=1e-05, atol=1e-08) del sess # Delete session before unregistering library self.unregister_execution_provider_library(ep_name) @@ -249,7 +249,7 @@ def test_example_plugin_ep_devices(self): input_name = sess.get_inputs()[0].name res = sess.run([], {input_name: x}) output_expected = np.array([[1.0, 4.0], [9.0, 16.0], [25.0, 36.0]], dtype=np.float32) - np.testing.assert_allclose(output_expected, res[0], rtol=1e-05, atol=1e-08) + np.testing.assert_allclose(res[0], output_expected, rtol=1e-05, atol=1e-08) del sess # Delete session before unregistering library 
self.unregister_execution_provider_library(ep_name) @@ -282,11 +282,11 @@ def test_example_plugin_ep_data_transfer(self): gpu_value = onnxrt.OrtValue.ortvalue_from_numpy(data, "gpu", 0, 0xBE57) # copy back to CPU cpu_data = gpu_value.numpy() - np.testing.assert_equal(data, cpu_data) + np.testing.assert_equal(cpu_data, data) gpu_value.update_inplace(data2) # update the fake GPU data cpu_data_2 = gpu_value.numpy() # copy back to CPU - np.testing.assert_equal(data2, cpu_data_2) + np.testing.assert_equal(cpu_data_2, data2) gpu_value = None # Delete OrtValue before unregistering library as the allocator will be destroyed. @@ -336,8 +336,8 @@ def test_copy_tensors(self): del b_device # Verify the contents - np.testing.assert_array_equal(a, a_cpu_copy.numpy()) - np.testing.assert_array_equal(b, b_cpu_copy.numpy()) + np.testing.assert_array_equal(a_cpu_copy.numpy(), a) + np.testing.assert_array_equal(b_cpu_copy.numpy(), b) self.unregister_execution_provider_library(ep_name) diff --git a/onnxruntime/test/python/onnxruntime_test_python_backend.py b/onnxruntime/test/python/onnxruntime_test_python_backend.py index 6ed7dfe59b1f6..416d9b6edecd1 100644 --- a/onnxruntime/test/python/onnxruntime_test_python_backend.py +++ b/onnxruntime/test/python/onnxruntime_test_python_backend.py @@ -19,7 +19,7 @@ def test_run_model(self): x = np.array([[1.0, 2.0], [3.0, 4.0], [5.0, 6.0]], dtype=np.float32) res = rep.run(x) output_expected = np.array([[1.0, 4.0], [9.0, 16.0], [25.0, 36.0]], dtype=np.float32) - np.testing.assert_allclose(output_expected, res[0], rtol=1e-05, atol=1e-08) + np.testing.assert_allclose(res[0], output_expected, rtol=1e-05, atol=1e-08) def test_allocation_plan_works_with_only_execute_path_to_fetches_option(self): """ diff --git a/onnxruntime/test/python/onnxruntime_test_python_backend_mlops.py b/onnxruntime/test/python/onnxruntime_test_python_backend_mlops.py index c245699e211d4..9e3c1acbc923b 100644 --- a/onnxruntime/test/python/onnxruntime_test_python_backend_mlops.py +++ b/onnxruntime/test/python/onnxruntime_test_python_backend_mlops.py @@ -23,8 +23,8 @@ def check_list_of_map_to_float(testcase, expected_rows, actual_rows): for i in range(num_rows): # use np.testing.assert_allclose so we can specify the tolerance np.testing.assert_allclose( - [expected_rows[i][key] for key in sorted_keys], [actual_rows[i][key] for key in sorted_keys], + [expected_rows[i][key] for key in sorted_keys], rtol=1e-05, atol=1e-07, ) @@ -37,7 +37,7 @@ def test_run_model_non_tensor(self): x = {0: 25.0, 1: 5.13, 2: 0.0, 3: 0.453, 4: 5.966} res = rep.run(x) output_expected = np.array([[49.752754]], dtype=np.float32) - np.testing.assert_allclose(output_expected, res[0], rtol=1e-05, atol=1e-08) + np.testing.assert_allclose(res[0], output_expected, rtol=1e-05, atol=1e-08) def test_run_model_proto(self): name = datasets.get_example("logreg_iris.onnx") @@ -47,7 +47,7 @@ def test_run_model_proto(self): x = np.array([[1.0, 2.0], [3.0, 4.0], [5.0, 6.0]], dtype=np.float32) res = rep.run(x) output_expected = np.array([0, 0, 0], dtype=np.float32) - np.testing.assert_allclose(output_expected, res[0], rtol=1e-05, atol=1e-08) + np.testing.assert_allclose(res[0], output_expected, rtol=1e-05, atol=1e-08) output_expected = [ {0: 0.950599730014801, 1: 0.027834169566631317, 2: 0.02156602405011654}, { @@ -72,7 +72,7 @@ def test_run_model_proto_api(self): outputs = ort_backend.run_model(model, inputs) output_expected = np.array([0, 0, 0], dtype=np.float32) - np.testing.assert_allclose(output_expected, outputs[0], rtol=1e-05, atol=1e-08) + 
np.testing.assert_allclose(outputs[0], output_expected, rtol=1e-05, atol=1e-08) output_expected = [ {0: 0.950599730014801, 1: 0.027834169566631317, 2: 0.02156602405011654}, { diff --git a/onnxruntime/test/python/onnxruntime_test_python_cudagraph.py b/onnxruntime/test/python/onnxruntime_test_python_cudagraph.py index 5ab2fe8939f6a..d6c1dd9cff3f3 100644 --- a/onnxruntime/test/python/onnxruntime_test_python_cudagraph.py +++ b/onnxruntime/test/python/onnxruntime_test_python_cudagraph.py @@ -63,18 +63,18 @@ class TestInferenceSessionWithCudaGraph(unittest.TestCase): def test_ort_value_update_in_place(self): x0 = np.array([[1.0, 2.0], [3.0, 4.0], [5.0, 6.0]], dtype=np.float32) ortvalue_cpu = onnxrt.OrtValue.ortvalue_from_numpy(x0) - np.testing.assert_allclose(x0, ortvalue_cpu.numpy()) + np.testing.assert_allclose(ortvalue_cpu.numpy(), x0) x1 = np.array([[10.0, 20.0], [30.0, 40.0], [50.0, 60.0]], dtype=np.float32) ortvalue_cpu.update_inplace(x1) - np.testing.assert_allclose(x1, ortvalue_cpu.numpy()) + np.testing.assert_allclose(ortvalue_cpu.numpy(), x1) if "CUDAExecutionProvider" in onnxrt.get_available_providers(): ortvalue_gpu = onnxrt.OrtValue.ortvalue_from_numpy(x0, "cuda", 0) - np.testing.assert_allclose(x0, ortvalue_gpu.numpy()) + np.testing.assert_allclose(ortvalue_gpu.numpy(), x0) ortvalue_gpu.update_inplace(x1) - np.testing.assert_allclose(x1, ortvalue_gpu.numpy()) + np.testing.assert_allclose(ortvalue_gpu.numpy(), x1) def test_select_ep_to_run_cuda_graph(self): if "TensorrtExecutionProvider" in onnxrt.get_available_providers(): @@ -105,11 +105,11 @@ def run_model_with_cuda_graph(self, providers): # One regular run for the necessary memory allocation and cuda graph capturing session.run_with_iobinding(io_binding, ro) expected_y = np.array([[5.0], [11.0], [17.0]] * INPUT_SIZE, dtype=np.float32) - np.testing.assert_allclose(expected_y, y_ortvalue.numpy(), rtol=1e-05, atol=1e-05) + np.testing.assert_allclose(y_ortvalue.numpy(), expected_y, rtol=1e-05, atol=1e-05) # After capturing, CUDA graph replay happens from this Run onwards session.run_with_iobinding(io_binding, ro) - np.testing.assert_allclose(expected_y, y_ortvalue.numpy(), rtol=1e-05, atol=1e-05) + np.testing.assert_allclose(y_ortvalue.numpy(), expected_y, rtol=1e-05, atol=1e-05) # Update input and then replay CUDA graph x_ortvalue.update_inplace( @@ -120,8 +120,8 @@ def run_model_with_cuda_graph(self, providers): ) session.run_with_iobinding(io_binding, ro) np.testing.assert_allclose( - np.array([[50.0], [110.0], [170.0]] * INPUT_SIZE, dtype=np.float32), y_ortvalue.numpy(), + np.array([[50.0], [110.0], [170.0]] * INPUT_SIZE, dtype=np.float32), rtol=1e-05, atol=1e-05, ) @@ -162,7 +162,7 @@ def run_model_with_cuda_graph_annotation(self, providers): session.run_with_iobinding(io_bindings[i], ro) io_bindings[i].synchronize_outputs() expected_y = np.array(expected_y_base[: i + 1][:] * INPUT_SIZE, dtype=np.float32) - np.testing.assert_allclose(expected_y, y_ortvalues[i].numpy(), rtol=1e-05, atol=1e-05) + np.testing.assert_allclose(y_ortvalues[i].numpy(), expected_y, rtol=1e-05, atol=1e-05) del ro ro = onnxrt.RunOptions() @@ -176,7 +176,7 @@ def run_model_with_cuda_graph_annotation(self, providers): session.run_with_iobinding(io_bindings[i], ro) io_bindings[i].synchronize_outputs() expected_y = np.array(expected_y_base_mul_10[: i + 1][:] * INPUT_SIZE, dtype=np.float32) - np.testing.assert_allclose(expected_y, y_ortvalues[i].numpy(), rtol=1e-05, atol=1e-05) + np.testing.assert_allclose(y_ortvalues[i].numpy(), expected_y, rtol=1e-05, 
atol=1e-05) def test_arena_with_cuda_graph(self): if "CUDAExecutionProvider" in onnxrt.get_available_providers(): @@ -214,7 +214,7 @@ def test_arena_with_cuda_graph(self): session.run_with_iobinding(io_binding) output = cuda_graph_helper.get_output("softmaxout_1") - np.testing.assert_allclose(expected_output, output, rtol=1e-02, atol=1e-02) + np.testing.assert_allclose(output, expected_output, rtol=1e-02, atol=1e-02) if __name__ == "__main__": diff --git a/onnxruntime/test/python/onnxruntime_test_python_dmlgraph.py b/onnxruntime/test/python/onnxruntime_test_python_dmlgraph.py index 033eae1cb4c8d..4a6aa7b63d9c3 100644 --- a/onnxruntime/test/python/onnxruntime_test_python_dmlgraph.py +++ b/onnxruntime/test/python/onnxruntime_test_python_dmlgraph.py @@ -63,18 +63,18 @@ class TestInferenceSessionWithDmlGraph(unittest.TestCase): def test_ort_value_update_in_place(self): x0 = np.array([[1.0, 2.0], [3.0, 4.0], [5.0, 6.0]], dtype=np.float32) ortvalue_cpu = onnxrt.OrtValue.ortvalue_from_numpy(x0) - np.testing.assert_allclose(x0, ortvalue_cpu.numpy()) + np.testing.assert_allclose(ortvalue_cpu.numpy(), x0) x1 = np.array([[10.0, 20.0], [30.0, 40.0], [50.0, 60.0]], dtype=np.float32) ortvalue_cpu.update_inplace(x1) - np.testing.assert_allclose(x1, ortvalue_cpu.numpy()) + np.testing.assert_allclose(ortvalue_cpu.numpy(), x1) if "DmlExecutionProvider" in onnxrt.get_available_providers(): ortvalue_gpu = onnxrt.OrtValue.ortvalue_from_numpy(x0, "dml", 0) - np.testing.assert_allclose(x0, ortvalue_gpu.numpy()) + np.testing.assert_allclose(ortvalue_gpu.numpy(), x0) ortvalue_gpu.update_inplace(x1) - np.testing.assert_allclose(x1, ortvalue_gpu.numpy()) + np.testing.assert_allclose(ortvalue_gpu.numpy(), x1) def test_select_ep_to_run_dml_graph(self): if "DmlExecutionProvider" in onnxrt.get_available_providers(): @@ -104,11 +104,11 @@ def run_model_with_dml_graph(self, providers): # One regular run for the necessary memory allocation and dml graph capturing session.run_with_iobinding(io_binding, ro) expected_y = np.array([[5.0], [11.0], [17.0]] * INPUT_SIZE, dtype=np.float32) - np.testing.assert_allclose(expected_y, y_ortvalue.numpy(), rtol=1e-05, atol=1e-05) + np.testing.assert_allclose(y_ortvalue.numpy(), expected_y, rtol=1e-05, atol=1e-05) # After capturing, DML graph replay happens from this Run onwards session.run_with_iobinding(io_binding, ro) - np.testing.assert_allclose(expected_y, y_ortvalue.numpy(), rtol=1e-05, atol=1e-05) + np.testing.assert_allclose(y_ortvalue.numpy(), expected_y, rtol=1e-05, atol=1e-05) # Update input and then replay DML graph x_ortvalue.update_inplace( @@ -119,8 +119,8 @@ def run_model_with_dml_graph(self, providers): ) session.run_with_iobinding(io_binding, ro) np.testing.assert_allclose( - np.array([[50.0], [110.0], [170.0]] * INPUT_SIZE, dtype=np.float32), y_ortvalue.numpy(), + np.array([[50.0], [110.0], [170.0]] * INPUT_SIZE, dtype=np.float32), rtol=1e-05, atol=1e-05, ) @@ -163,7 +163,7 @@ def run_model_with_dml_graph_annotation(self, providers): session.run_with_iobinding(io_bindings[i], ro) io_bindings[i].synchronize_outputs() expected_y = np.array(expected_y_base[: i + 1][:] * INPUT_SIZE, dtype=np.float32) - np.testing.assert_allclose(expected_y, y_ortvalues[i].numpy(), rtol=1e-05, atol=1e-05) + np.testing.assert_allclose(y_ortvalues[i].numpy(), expected_y, rtol=1e-05, atol=1e-05) del ro ro = onnxrt.RunOptions() @@ -177,7 +177,7 @@ def run_model_with_dml_graph_annotation(self, providers): session.run_with_iobinding(io_bindings[i], ro) io_bindings[i].synchronize_outputs() 
expected_y = np.array(expected_y_base_mul_10[: i + 1][:] * INPUT_SIZE, dtype=np.float32) - np.testing.assert_allclose(expected_y, y_ortvalues[i].numpy(), rtol=1e-05, atol=1e-05) + np.testing.assert_allclose(y_ortvalues[i].numpy(), expected_y, rtol=1e-05, atol=1e-05) if __name__ == "__main__": diff --git a/onnxruntime/test/python/onnxruntime_test_python_mlops.py b/onnxruntime/test/python/onnxruntime_test_python_mlops.py index 8b6b029c57752..70b8c0fc0b980 100644 --- a/onnxruntime/test/python/onnxruntime_test_python_mlops.py +++ b/onnxruntime/test/python/onnxruntime_test_python_mlops.py @@ -80,7 +80,7 @@ def test_dict_vectorizer(self): x = {0: 25.0, 1: 5.13, 2: 0.0, 3: 0.453, 4: 5.966} res = sess.run([output_name], {input_name: x}) output_expected = np.array([[49.752754]], dtype=np.float32) - np.testing.assert_allclose(output_expected, res[0], rtol=1e-05, atol=1e-08) + np.testing.assert_allclose(res[0], output_expected, rtol=1e-05, atol=1e-08) xwrong = x.copy() xwrong["a"] = 5.6 @@ -96,17 +96,17 @@ def test_dict_vectorizer(self): x = {np.int64(k): np.float32(v) for k, v in x.items()} res = sess.run([output_name], {input_name: x}) output_expected = np.array([[49.752754]], dtype=np.float32) - np.testing.assert_allclose(output_expected, res[0], rtol=1e-05, atol=1e-08) + np.testing.assert_allclose(res[0], output_expected, rtol=1e-05, atol=1e-08) x = {np.int64(k): np.float64(v) for k, v in x.items()} res = sess.run([output_name], {input_name: x}) output_expected = np.array([[49.752754]], dtype=np.float32) - np.testing.assert_allclose(output_expected, res[0], rtol=1e-05, atol=1e-08) + np.testing.assert_allclose(res[0], output_expected, rtol=1e-05, atol=1e-08) x = {np.int32(k): np.float64(v) for k, v in x.items()} res = sess.run([output_name], {input_name: x}) output_expected = np.array([[49.752754]], dtype=np.float32) - np.testing.assert_allclose(output_expected, res[0], rtol=1e-05, atol=1e-08) + np.testing.assert_allclose(res[0], output_expected, rtol=1e-05, atol=1e-08) def test_label_encoder(self): sess = onnxrt.InferenceSession(get_name("LabelEncoder.onnx"), providers=onnxrt.get_available_providers()) @@ -127,18 +127,18 @@ def test_label_encoder(self): x = np.array([["4"]]) res = sess.run([output_name], {input_name: x}) output_expected = np.array([[3]], dtype=np.int64) - np.testing.assert_allclose(output_expected, res[0], rtol=1e-05, atol=1e-08) + np.testing.assert_allclose(res[0], output_expected, rtol=1e-05, atol=1e-08) # Python type x = np.array(["4"], ndmin=2) res = sess.run([output_name], {input_name: x}) output_expected = np.array([3], ndmin=2, dtype=np.int64) - np.testing.assert_allclose(output_expected, res[0], rtol=1e-05, atol=1e-08) + np.testing.assert_allclose(res[0], output_expected, rtol=1e-05, atol=1e-08) x = np.array(["4"], ndmin=2, dtype=object) res = sess.run([output_name], {input_name: x}) output_expected = np.array([3], ndmin=2, dtype=np.int64) - np.testing.assert_allclose(output_expected, res[0], rtol=1e-05, atol=1e-08) + np.testing.assert_allclose(res[0], output_expected, rtol=1e-05, atol=1e-08) def test_run_model_mlnet(self): available_providers = onnxrt.get_available_providers() diff --git a/onnxruntime/test/python/onnxruntime_test_python_nv_tensorrt_rtx_ep_tests.py b/onnxruntime/test/python/onnxruntime_test_python_nv_tensorrt_rtx_ep_tests.py index d5c80a4a1f4ba..034f0288e2508 100644 --- a/onnxruntime/test/python/onnxruntime_test_python_nv_tensorrt_rtx_ep_tests.py +++ b/onnxruntime/test/python/onnxruntime_test_python_nv_tensorrt_rtx_ep_tests.py @@ -99,7 +99,7 @@ def 
test_nv_tensorrt_rtx_ep_register_and_inference(self): input_name = sess.get_inputs()[0].name res = sess.run([], {input_name: x}) output_expected = np.array([[1.0, 4.0], [9.0, 16.0], [25.0, 36.0]], dtype=np.float32) - np.testing.assert_allclose(output_expected, res[0], rtol=1e-05, atol=1e-08) + np.testing.assert_allclose(res[0], output_expected, rtol=1e-05, atol=1e-08) def test_nv_tensorrt_rtx_ep_prefer_gpu_and_inference(self): """ @@ -117,7 +117,7 @@ def test_nv_tensorrt_rtx_ep_prefer_gpu_and_inference(self): input_name = sess.get_inputs()[0].name res = sess.run([], {input_name: x}) output_expected = np.array([[1.0, 4.0], [9.0, 16.0], [25.0, 36.0]], dtype=np.float32) - np.testing.assert_allclose(output_expected, res[0], rtol=1e-05, atol=1e-08) + np.testing.assert_allclose(res[0], output_expected, rtol=1e-05, atol=1e-08) def test_nv_tensorrt_rtx_ep_selection_delegate_and_inference(self): """ @@ -152,7 +152,7 @@ def my_delegate( input_name = sess.get_inputs()[0].name res = sess.run([], {input_name: x}) output_expected = np.array([[1.0, 4.0], [9.0, 16.0], [25.0, 36.0]], dtype=np.float32) - np.testing.assert_allclose(output_expected, res[0], rtol=1e-05, atol=1e-08) + np.testing.assert_allclose(res[0], output_expected, rtol=1e-05, atol=1e-08) def test_bind_input_only(self): """ diff --git a/onnxruntime/test/python/quantization/test_fusions.py b/onnxruntime/test/python/quantization/test_fusions.py index bea110e566fb9..f02f4da4eb0fb 100644 --- a/onnxruntime/test/python/quantization/test_fusions.py +++ b/onnxruntime/test/python/quantization/test_fusions.py @@ -34,8 +34,8 @@ def check_fused_model_correctness(self, orig_model, fused_model, inputs, rtol=1e for idx, expected_output in enumerate(orig_results): actual_output = fused_results[idx] np.testing.assert_allclose( - expected_output, actual_output, + expected_output, rtol=rtol, atol=atol, err_msg=f"Fused model output {idx} differs", diff --git a/onnxruntime/test/python/quantization/test_qdq_loss_debug.py b/onnxruntime/test/python/quantization/test_qdq_loss_debug.py index 5d70641547eae..20b40fc157c16 100644 --- a/onnxruntime/test/python/quantization/test_qdq_loss_debug.py +++ b/onnxruntime/test/python/quantization/test_qdq_loss_debug.py @@ -156,7 +156,7 @@ def test_saved_tensors_match_internal_tensors(self): for expected, actual in zip(model_outputs, test_outputs, strict=False): exp = expected.reshape(-1) act = actual.reshape(-1) - np.testing.assert_equal(exp, act) + np.testing.assert_equal(act, exp) def test_create_activation_matching_present(self): float_model_path = str(Path(self._tmp_model_dir.name) / "float_model2.onnx") diff --git a/onnxruntime/test/python/quantization/test_quantizeblockwise_bnb4.py b/onnxruntime/test/python/quantization/test_quantizeblockwise_bnb4.py index a8f7591186766..906bf7aab8698 100644 --- a/onnxruntime/test/python/quantization/test_quantizeblockwise_bnb4.py +++ b/onnxruntime/test/python/quantization/test_quantizeblockwise_bnb4.py @@ -131,8 +131,8 @@ def test_quantize_blockwise_bnb4(self): matrix_float = np.random.uniform(-1, 1, (k, n)).astype(type) quant_value_ref, absmax_ref = quantize_blockwise_bnb4_ref(matrix_float, block_size, quant_type) quant_value, absmax = quantize_blockwise_bnb4_target(matrix_float, block_size, quant_type) - np.testing.assert_allclose(quant_value_ref, quant_value) - np.testing.assert_allclose(absmax_ref, absmax) + np.testing.assert_allclose(quant_value, quant_value_ref) + np.testing.assert_allclose(absmax, absmax_ref) if __name__ == "__main__": diff --git 
a/onnxruntime/test/testdata/custom_op_local_function/custom_op_test_local_function.py b/onnxruntime/test/testdata/custom_op_local_function/custom_op_test_local_function.py index 7916d93c3e531..1dedc475c9962 100644 --- a/onnxruntime/test/testdata/custom_op_local_function/custom_op_test_local_function.py +++ b/onnxruntime/test/testdata/custom_op_local_function/custom_op_test_local_function.py @@ -40,7 +40,7 @@ def test_basic_all(self): x = np.arange(2**2).reshape((2,) * 2).astype(np.float32) t = np.arange(8).reshape((2, 4)).astype(np.float32) got = sess.run(None, {"X": x})[0] - np.testing.assert_allclose(t, got, atol=1e-5) + np.testing.assert_allclose(got, t, atol=1e-5) if __name__ == "__main__": From dde2fefe9deff01c3f25a57f1a027443af68f734 Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Sat, 11 Oct 2025 10:16:52 +0000 Subject: [PATCH 05/19] Bump vite from 6.3.5 to 6.3.6 in /js/web/test/e2e/exports/testcases/vite-default (#26000) --- .../e2e/exports/testcases/vite-default/package-lock.json | 8 ++++---- .../test/e2e/exports/testcases/vite-default/package.json | 2 +- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/js/web/test/e2e/exports/testcases/vite-default/package-lock.json b/js/web/test/e2e/exports/testcases/vite-default/package-lock.json index 48f0a8f3e9d5c..e880f6bca2ac4 100644 --- a/js/web/test/e2e/exports/testcases/vite-default/package-lock.json +++ b/js/web/test/e2e/exports/testcases/vite-default/package-lock.json @@ -12,7 +12,7 @@ }, "devDependencies": { "@vitejs/plugin-vue": "^5.2.1", - "vite": "^6.3.5" + "vite": "^6.3.6" } }, "node_modules/@babel/helper-string-parser": { @@ -1114,9 +1114,9 @@ } }, "node_modules/vite": { - "version": "6.3.5", - "resolved": "https://registry.npmjs.org/vite/-/vite-6.3.5.tgz", - "integrity": "sha512-cZn6NDFE7wdTpINgs++ZJ4N49W2vRp8LCKrn3Ob1kYNtOo21vfDoaV5GzBfLU4MovSAB8uNRm4jgzVQZ+mBzPQ==", + "version": "6.3.6", + "resolved": "https://registry.npmjs.org/vite/-/vite-6.3.6.tgz", + "integrity": "sha512-0msEVHJEScQbhkbVTb/4iHZdJ6SXp/AvxL2sjwYQFfBqleHtnCqv1J3sa9zbWz/6kW1m9Tfzn92vW+kZ1WV6QA==", "dev": true, "license": "MIT", "dependencies": { diff --git a/js/web/test/e2e/exports/testcases/vite-default/package.json b/js/web/test/e2e/exports/testcases/vite-default/package.json index f7d5751354905..84013e2aecb88 100644 --- a/js/web/test/e2e/exports/testcases/vite-default/package.json +++ b/js/web/test/e2e/exports/testcases/vite-default/package.json @@ -13,6 +13,6 @@ }, "devDependencies": { "@vitejs/plugin-vue": "^5.2.1", - "vite": "^6.3.5" + "vite": "^6.3.6" } } From 41b238edee2cd78f27ef5e5a67b3d380b2c1d2e5 Mon Sep 17 00:00:00 2001 From: David Fan <30608893+jiafatom@users.noreply.github.com> Date: Mon, 13 Oct 2025 10:04:15 -0700 Subject: [PATCH 06/19] Fix Memory Issue sparse_attention Rotary (#26278) ### Description From an internal user, we see that sparse attention has similar memory issue of https://github.com/microsoft/onnxruntime/pull/22290/ So we follow that PR to make the change. ### Motivation and Context SparseAttention memory issue. 
--- .../contrib_ops/cpu/sparse/sparse_attention.cc | 18 +++++++----------- 1 file changed, 7 insertions(+), 11 deletions(-) diff --git a/onnxruntime/contrib_ops/cpu/sparse/sparse_attention.cc b/onnxruntime/contrib_ops/cpu/sparse/sparse_attention.cc index 469084e7b4491..c51fc1cf54815 100644 --- a/onnxruntime/contrib_ops/cpu/sparse/sparse_attention.cc +++ b/onnxruntime/contrib_ops/cpu/sparse/sparse_attention.cc @@ -130,6 +130,11 @@ Status SparseAttention::Compute(OpKernelContext* context) const { allocator, batch_size, kv_num_heads_, sequence_length, head_size, value, V)); } + OrtValue RotaryQKV; + OrtValue RotaryQ; + OrtValue RotaryK; + T* q_rotary = Q.GetMutable()->MutableData(); + T* k_rotary = packed_qkv ? nullptr : K.GetMutable()->MutableData(); if (do_rotary_) { rotary_embedding_helper::RotaryParameters rotary_params = {}; rotary_params.batch_size = batch_size; @@ -167,30 +172,22 @@ Status SparseAttention::Compute(OpKernelContext* context) const { const T* q_input; const T* k_input; - T* q_rotary; - T* k_rotary; if (packed_qkv) { - OrtValue RotaryQKV; TensorShape qkv_shape({batch_size, num_heads_ + 2 * kv_num_heads_, sequence_length, head_size}); Tensor::InitOrtValue(element_type, qkv_shape, allocator, RotaryQKV); q_input = Q.Get().Data(); k_input = q_input + num_heads_ * sequence_length * head_size; q_rotary = RotaryQKV.GetMutable()->MutableData(); k_rotary = q_rotary + num_heads_ * sequence_length * head_size; - Q = RotaryQKV; } else { - OrtValue RotaryQ; TensorShape q_shape({batch_size, num_heads_, sequence_length, head_size}); Tensor::InitOrtValue(element_type, q_shape, allocator, RotaryQ); - OrtValue RotaryK; TensorShape k_shape({batch_size, kv_num_heads_, sequence_length, head_size}); Tensor::InitOrtValue(element_type, k_shape, allocator, RotaryK); q_input = Q.Get().Data(); k_input = K.Get().Data(); q_rotary = RotaryQ.GetMutable()->MutableData(); k_rotary = RotaryK.GetMutable()->MutableData(); - Q = RotaryQ; - K = RotaryK; } ORT_RETURN_IF_ERROR(RunRotaryEmbedding(tp, rotary_params, q_input, @@ -221,9 +218,8 @@ Status SparseAttention::Compute(OpKernelContext* context) const { ORT_RETURN_IF_ERROR(context->GetTempSpaceAllocator(&allocator)); // Compute the attention score and apply the score to V - return ApplyAttention(Q.Get().Data(), packed_qkv ? nullptr : K.Get().Data(), - packed_qkv ? nullptr : V.Get().Data(), past_key, past_value, - output, present_key, present_value, + return ApplyAttention(q_rotary, packed_qkv ? nullptr : k_rotary, packed_qkv ? 
nullptr : V.Get().Data(), + past_key, past_value, output, present_key, present_value, total_key_lengths, block_row_indices, block_col_indices, parameters, allocator, context); } } // namespace contrib From 826a1e624082afc79c39e9e64ffde9e5be4fb62b Mon Sep 17 00:00:00 2001 From: Tianlei Wu Date: Mon, 13 Oct 2025 10:29:39 -0700 Subject: [PATCH 07/19] Add windows server to supported OS (#26275) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Add windows server to supported list to avoid confusing users: Marketing Name | Internal Version | platform.release().lower() | Release Year | Based on -- | -- | -- | -- | -- Windows Server 2025 | 10.0.26100+ | "2025server" | 2024–2025 | Windows 11 (24H2) Windows Server 2022 | 10.0.20348 | "2022server" | 2021 | Windows 10 (21H2) Windows Server 2019 | 10.0.17763 | "2019server" | 2018 | Windows 10 (1809) Windows Server 2016 | 10.0.14393 | "2016server" | 2016 | Windows 10 (1607) --- onnxruntime/python/onnxruntime_validation.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/onnxruntime/python/onnxruntime_validation.py b/onnxruntime/python/onnxruntime_validation.py index 4a72916d3e485..6912d19897d67 100644 --- a/onnxruntime/python/onnxruntime_validation.py +++ b/onnxruntime/python/onnxruntime_validation.py @@ -23,9 +23,9 @@ def check_distro_info(): __my_distro__ = __my_system__ __my_distro_ver__ = platform.release().lower() - if __my_distro_ver__ not in ["10", "11"]: + if __my_distro_ver__ not in ["10", "11", "2016server", "2019server", "2022server", "2025server"]: warnings.warn( - f"Unsupported Windows version ({__my_distro_ver__}). ONNX Runtime supports Windows 10 and above, only." + f"Unsupported Windows version ({__my_distro_ver__}). ONNX Runtime supports Windows 10 and above, or Windows Server 2016 and above." ) elif __my_system__ == "linux": """Although the 'platform' python module for getting Distro information works well on standard OS images From 96f56ba000231e8566b2249367f5d3223108b44f Mon Sep 17 00:00:00 2001 From: Jonathan Clohessy Date: Mon, 13 Oct 2025 19:44:51 +0100 Subject: [PATCH 08/19] Reworked sgemm_kleidi memory allocations to reuse memory buffers (#26166) ### **Key changes** This PR makes changes to KleidiAI integration within the existing sgemm_kleidiai.cpp implementation. It was noted that during internal testing that memory allocation overhead due to repeated allocations of vectors was having a negative impact on performance figures. The changes introduce thread local buffers for reusing memory during inference. Android platforms are particularly sensitive to this, we have observed inference times being significantly impacted due to memory allocation overheads ### Example performance All runs were captured using onnxruntime_perf_test e.g. 
onnxruntime_perf_test -v -e cpu -I -m times -x 1 -y 1 -r 1000 **Android Platform** image In addition to this on M4 we have also observed slight improvements on models, however its the gain is not as significant as the allocation overhead is lower in terms of total time on that platform **Mac Mini M4** image **Onnxruntime Mlas Benchmark** Mlas Benchmark was executed on a Mac Mini M4 with SME2 instructions Tested code with and without changes in pr and observed the following results (subset shown) comparison generated using compare.py located in google benchmark repo tools `./onnxruntime_mlas_benchmark --benchmark_filter="SGEMM/NORMAL*" --benchmark_repetitions=100` ``` Benchmark Time CPU Time Old Time New CPU Old CPU New -------------------------------------------------------------------------------------------------------------------------------------------------- SGEMM/NORMAL_NoTrans/M:63/N:63/K:63/real_time -0.1897 -0.1897 3270 2650 3270 2650 SGEMM/NORMAL_NoTrans/M:255/N:63/K:63/real_time -0.1468 -0.1469 8383 7152 8382 7151 SGEMM/NORMAL_NoTrans/M:1023/N:63/K:63/real_time -0.1506 -0.1506 19072 16200 19072 16200 SGEMM/NORMAL_NoTrans/M:63/N:255/K:63/real_time -0.1957 -0.1957 7742 6227 7742 6227 SGEMM/NORMAL_NoTrans/M:255/N:255/K:63/real_time -0.1032 -0.1032 14323 12845 14322 12845 SGEMM/NORMAL_TransB/M:63/N:63/K:63/real_time -0.2221 -0.2221 3356 2611 3356 2610 SGEMM/NORMAL_TransB/M:255/N:63/K:63/real_time -0.0439 -0.0438 8602 8224 8601 8224 SGEMM/NORMAL_TransB/M:1023/N:63/K:63/real_time +0.0436 +0.0436 16488 17206 16487 17206 SGEMM/NORMAL_TransB/M:63/N:255/K:63/real_time -0.2000 -0.1999 8046 6437 8046 6437 SGEMM/NORMAL_TransB/M:255/N:255/K:63/real_time -0.0979 -0.0979 14131 12747 14130 12747 SGEMM/NORMAL_TransB/M:1023/N:255/K:63/real_time -0.2836 -0.2836 62540 44802 62540 44802 SGEMM/NORMAL_TransB/M:63/N:1023/K:63/real_time -0.2183 -0.2183 15342 11993 15342 ``` Some small regressions have been seen but are difficult to explain, suspected machine variance during run could account for things like ``` SGEMM/NORMAL_TransB/M:1023/N:63/K:63/real_time +0.0436 +0.0436 16488 17206 16487 17206 ``` For example, as part of testing these results sgemm_kleidi.cpp was instrumented (after the previous benchmark results) with timer code, in MlasGemmBatch, MlasGemmPackB, and MlasGemmPackBSize. 
Which produced the following, indicating that the code performs better in this case on average than baseline which is currently in main ``` Head of main Function Count Avg (ns) Avg (pretty) ---------------------------------------------------------- MlasGemmBatch 42664 19601.015 19.601 us MlasGemmPackB 42664 373.943 373.943 ns MlasGemmPackBSize 42664 17.179 17.179 ns TLB changes Function Count Avg (ns) Avg (pretty) ---------------------------------------------------------- MlasGemmBatch 55492 16985.256 16.985 us MlasGemmPackB 55492 344.800 344.800 ns MlasGemmPackBSize 55492 16.788 16.788 ns ``` --------- Signed-off-by: Jonathan Clohessy --- .../core/mlas/lib/kleidiai/mlasi_kleidiai.h | 34 ++++++ .../core/mlas/lib/kleidiai/sgemm_kleidiai.cpp | 100 +++++++++++------- 2 files changed, 96 insertions(+), 38 deletions(-) diff --git a/onnxruntime/core/mlas/lib/kleidiai/mlasi_kleidiai.h b/onnxruntime/core/mlas/lib/kleidiai/mlasi_kleidiai.h index 5136061c4769d..2e9c4574fd057 100644 --- a/onnxruntime/core/mlas/lib/kleidiai/mlasi_kleidiai.h +++ b/onnxruntime/core/mlas/lib/kleidiai/mlasi_kleidiai.h @@ -115,3 +115,37 @@ MlasConv( MLAS_THREADPOOL* ThreadPool ); } + +/*++ + +Routine Description: + + This routine determines if a wraparound will occur when multiplying two size_t variables + Uses __builtin_mul_overflow if available on the current system and if not falls back + to a default implementation to check this wraparound. + +Arguments: + + a - Supplies the first number to be muliplied. + + b - Supplies the second number to be muliplied. + + out - pointer to a size_t which acts as the return value in success cases. + +Return Value: + + Returns false if the operation was successful + Returns true if wraparound of size_t was detected + +--*/ +inline bool mul_overflow_size_t_builtin(size_t a, size_t b, size_t* out) { +#if defined(__has_builtin) +# if __has_builtin(__builtin_mul_overflow) + return __builtin_mul_overflow(a, b, out); +# endif +#endif + // Fallback to manual check if builtin not available + if (b != 0 && a > SIZE_MAX / b) return true; + if (out) *out = a * b; + return false; +} diff --git a/onnxruntime/core/mlas/lib/kleidiai/sgemm_kleidiai.cpp b/onnxruntime/core/mlas/lib/kleidiai/sgemm_kleidiai.cpp index ea38f16205a7c..435ff1fb10017 100644 --- a/onnxruntime/core/mlas/lib/kleidiai/sgemm_kleidiai.cpp +++ b/onnxruntime/core/mlas/lib/kleidiai/sgemm_kleidiai.cpp @@ -14,6 +14,16 @@ #include "kai/ukernels/matmul/pack/kai_rhs_pack_nxk_f32p2vlx1biasf32_f32_f32_sme.h" #include "mlasi_kleidiai.h" + +// Thread-local reusable buffers to reduce allocation overhead across tiles. +struct KaiTlsBuffers { + std::vector output_tile; + std::vector bias_zero; + std::vector rhs_packed; + std::vector lhs_packed; +}; +static thread_local KaiTlsBuffers g_kai_tls; + size_t MLASCALL ArmKleidiAI::MlasGemmPackBSize( @@ -51,7 +61,6 @@ Return Value: // Compute the number of bytes required to hold the packed buffer. // size_t bytes = 0; - if (TransA == CblasNoTrans) { switch (TransB) { case CblasNoTrans: @@ -125,15 +134,15 @@ Return Value: const size_t sr = UseSME2 ? kai_get_sr_matmul_clamp_f32_f32p2vlx1_f32p2vlx1biasf32_sme2_mopa() : kai_get_sr_matmul_clamp_f32_f32p2vlx1_f32p2vlx1b_2vlx2vl_sme_mopa(); - // pass zeroed bias values - const std::vector bias(N); + // Ensure size and zero the used span. 
+ g_kai_tls.bias_zero.resize(N, 0.0f); switch (TransB) { case CblasNoTrans: - kai_run_rhs_pack_kxn_f32p2vlx1biasf32_f32_f32_sme(1, N, K, nr, kr, sr, ldb * sizeof(float), B, bias.data(), nullptr, PackedB, 0, nullptr); + kai_run_rhs_pack_kxn_f32p2vlx1biasf32_f32_f32_sme(1, N, K, nr, kr, sr, ldb * sizeof(float), B, g_kai_tls.bias_zero.data(), nullptr, PackedB, 0, nullptr); break; case CblasTrans: - kai_run_rhs_pack_nxk_f32p2vlx1biasf32_f32_f32_sme(1, N, K, nr, kr, sr, ldb * sizeof(float), B, bias.data(), nullptr, PackedB, 0, nullptr); + kai_run_rhs_pack_nxk_f32p2vlx1biasf32_f32_f32_sme(1, N, K, nr, kr, sr, ldb * sizeof(float), B, g_kai_tls.bias_zero.data(), nullptr, PackedB, 0, nullptr); break; default: return false; @@ -225,22 +234,29 @@ Return Value: size_t n_step = UseSME2 ? kai_get_n_step_matmul_clamp_f32_f32p2vlx1_f32p2vlx1biasf32_sme2_mopa() : kai_get_n_step_matmul_clamp_f32_f32p2vlx1_f32p2vlx1b_2vlx2vl_sme_mopa(); - if (M < m_step && N < n_step && !Data->BIsPacked) { + if ((M < m_step || N < n_step) && !Data->BIsPacked) { // Fallback to MLAS return false; } - std::vector KaiPackedData; - KaiPackedData.resize(BatchSize); - size_t LhsPackedStride = 0; std::byte* LhsPackedData = nullptr; LhsPackedStride = kai_get_lhs_packed_size_lhs_pack_f32p2vlx1_f32_sme(M, K, mr, kr, sr); - auto LhsPacked = std::make_unique(LhsPackedStride * BatchSize); - LhsPackedData = LhsPacked.get(); - std::unique_ptr RhsPacked{nullptr}; + size_t lhs_resize = 0; + if(mul_overflow_size_t_builtin(LhsPackedStride, BatchSize, &lhs_resize)) + { + // size_t wraparound detected for LhsPackedStride, fallback to MLAS + return false; + } + + g_kai_tls.lhs_packed.resize(lhs_resize); + LhsPackedData = g_kai_tls.lhs_packed.data(); + + // RHS packed buffer: use TLS reusable vector to minimize allocations + size_t RhsPackedStride = 0; + std::byte* RhsPackedData = nullptr; // It is assumed all B batches require packing or not if (Data[0].BIsPacked) { @@ -248,36 +264,31 @@ Return Value: MlasTrySimpleParallel(ThreadPool, BatchSize, [&](ptrdiff_t batch_idx) { std::byte* LhsPackedPtr = &(LhsPackedData[LhsPackedStride * batch_idx]); kai_run_lhs_pack_f32p2vlx1_f32_sme(M, K, mr, kr, sr, 0, Data[batch_idx].A, Data[batch_idx].lda * sizeof(float), LhsPackedPtr); - KaiPackedData[batch_idx].A = reinterpret_cast(LhsPackedPtr); - KaiPackedData[batch_idx].B = Data[batch_idx].B; }); } else { // Multithread pack lhs and rhs - size_t RhsPackedStride = 0; - std::byte* RhsPackedData = nullptr; - RhsPackedStride = ArmKleidiAI::MlasGemmPackBSize(TransA, TransB, N, K); - RhsPacked = std::make_unique(RhsPackedStride * BatchSize); - RhsPackedData = RhsPacked.get(); + size_t rhs_resize = 0; + if (mul_overflow_size_t_builtin(RhsPackedStride, BatchSize, &rhs_resize)) + { + // size_t wraparound detected for RhsPackedStride, fallback to MLAS + return false; + } + + g_kai_tls.rhs_packed.resize(rhs_resize); + RhsPackedData = g_kai_tls.rhs_packed.data(); MlasTrySimpleParallel(ThreadPool, BatchSize * 2, [&](ptrdiff_t batch_idx) { - // lhs odd, rhs even if (batch_idx & 0x1) { batch_idx >>= 1; - std::byte* LhsPackedPtr = &(LhsPackedData[LhsPackedStride * batch_idx]); - kai_run_lhs_pack_f32p2vlx1_f32_sme(M, K, mr, kr, sr, 0, Data[batch_idx].A, Data[batch_idx].lda * sizeof(float), LhsPackedPtr); - - KaiPackedData[batch_idx].A = reinterpret_cast(LhsPackedPtr); } else { batch_idx >>= 1; - std::byte* RhsPackedPtr = &(RhsPackedData[RhsPackedStride * batch_idx]); - - ArmKleidiAI::MlasGemmPackB(TransA, TransB, N, K, reinterpret_cast(Data[batch_idx].B), Data[batch_idx].ldb, 
RhsPackedPtr); - - KaiPackedData[batch_idx].B = reinterpret_cast(RhsPackedPtr); + ArmKleidiAI::MlasGemmPackB(TransA, TransB, N, K, + reinterpret_cast(Data[batch_idx].B), + Data[batch_idx].ldb, RhsPackedPtr); } }); } @@ -303,6 +314,14 @@ Return Value: dim[1] = MlasDivRoundup(M, m_step); dim[2] = MlasDivRoundup(N, n_step); + // Pre-check maximum tile size to avoid per-iteration overflow inside the parallel loop. + // Any TileSizeM/TileSizeN used below will be <= m_step/n_step respectively. + size_t max_tile_elems = 0; + if (mul_overflow_size_t_builtin(m_step, n_step, &max_tile_elems)) { + // size_t wraparound detected for tile size, fallback to MLAS + return false; + } + MlasTrySimpleParallel(ThreadPool, static_cast(dim[0] * dim[1] * dim[2]), [=](ptrdiff_t tid) { // compute B,M,N index from iteration index ptrdiff_t BIdx = tid / (dim[1] * dim[2]); @@ -314,18 +333,18 @@ Return Value: UseSME2 ? kai_get_rhs_packed_offset_matmul_clamp_f32_f32p2vlx1_f32p2vlx1biasf32_sme2_mopa(NIdx * n_step, K) : kai_get_rhs_packed_offset_matmul_clamp_f32_f32p2vlx1_f32p2vlx1b_2vlx2vl_sme_mopa(NIdx * n_step, K); - auto BTile = reinterpret_cast( - reinterpret_cast(KaiPackedData[BIdx].B) + rhs_packed_offset - ); + const std::byte* B_base = Data[0].BIsPacked + ? reinterpret_cast(Data[BIdx].B) + : (RhsPackedData + RhsPackedStride * BIdx); + auto BTile = reinterpret_cast(B_base + rhs_packed_offset); // Get lhs tile, A const size_t lhs_packed_offset = UseSME2 ? kai_get_lhs_packed_offset_matmul_clamp_f32_f32p2vlx1_f32p2vlx1biasf32_sme2_mopa(MIdx * m_step, K) : kai_get_lhs_packed_offset_matmul_clamp_f32_f32p2vlx1_f32p2vlx1b_2vlx2vl_sme_mopa(MIdx * m_step, K); - auto ATile = reinterpret_cast( - reinterpret_cast(KaiPackedData[BIdx].A) + lhs_packed_offset - ); + const std::byte* A_base = LhsPackedData + LhsPackedStride * BIdx; + auto ATile = reinterpret_cast(A_base + lhs_packed_offset); auto TileSizeM = (MIdx + 1) * m_step > M ? (M - MIdx * m_step) : m_step; auto TileSizeN = (NIdx + 1) * n_step > N ? (N - NIdx * n_step) : n_step; @@ -336,9 +355,14 @@ Return Value: MIdx * m_step * Data[BIdx].ldc * sizeof(float) + NIdx * n_step * sizeof(float) ); - // Allocate temporary buffer for raw A*B result - std::vector OutputTile(TileSizeM * TileSizeN, 0.0f); - float* temp_tile = OutputTile.data(); + // Allocate temporary buffer for raw A*B result (TLS reusable buffer) + size_t tile_elems = TileSizeM * TileSizeN; + + // resize the tile to the required size + g_kai_tls.output_tile.resize(tile_elems); + + float* temp_tile = g_kai_tls.output_tile.data(); + std::fill_n(temp_tile, tile_elems, 0.0f); if (UseSME2) { kai_run_matmul_clamp_f32_f32p2vlx1_f32p2vlx1biasf32_sme2_mopa( From 311b4a647690ea4ee53dc6cb99cc9b9b1d1c6218 Mon Sep 17 00:00:00 2001 From: Tianlei Wu Date: Mon, 13 Oct 2025 14:16:31 -0700 Subject: [PATCH 09/19] [CUDA] Upgrade build pipelines to use CUDA 12.8 + cuDNN 9.8 (#26267) This upgrades CUDA 12.2 + cuDNN 9.5 to CUDA 12.8 + cuDNN 9.8 in CI pipelines, so that we can build 120-real to support Blackwell GPU. To speed up build, we also disable relocatable-device-code. MSVC is updated to latest for some windows build pipelines. #### Known issues Some onnx models (yolo v3, yolo v4, mobilenet v1) failed to run due to cudnn frontend failed to find engine plan. We will try upgrade cudnn frontend later. Related failed tests are disabled for now. 
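For anyone who wants to reproduce the known issue locally, a minimal sketch is below. It assumes an onnxruntime-gpu build from this branch plus a local copy of one of the affected models; the `yolov3.onnx` path and the zero-filled dummy feeds are placeholders for illustration, not anything added by this PR. On affected CUDA 12.8 + cuDNN 9.8 builds the failure surfaces at `run()`, while other builds should pass.

```python
# Hedged repro sketch; the model path is a placeholder and the dummy inputs
# only exercise session creation and a single run() call.
import numpy as np
import onnxruntime as ort

sess = ort.InferenceSession(
    "yolov3.onnx",  # placeholder path to one of the affected models
    providers=["CUDAExecutionProvider", "CPUExecutionProvider"],
)

# Build zero-filled feeds, substituting 1 for any symbolic/dynamic dimensions.
feeds = {}
for inp in sess.get_inputs():
    shape = [d if isinstance(d, int) else 1 for d in inp.shape]
    dtype = np.float32 if "float" in inp.type else np.int64
    feeds[inp.name] = np.zeros(shape, dtype=dtype)

# On affected builds this run() call is where the cuDNN frontend
# engine-plan failure shows up.
outputs = sess.run(None, feeds)
print([o.shape for o in outputs])
```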
--------- Co-authored-by: Changming Sun --- .github/workflows/publish-csharp-apidocs.yml | 2 +- .github/workflows/windows_cuda.yml | 26 +++++++++---------- .github/workflows/windows_openvino.yml | 2 +- .github/workflows/windows_qnn_x64.yml | 2 +- .github/workflows/windows_tensorrt.yml | 26 +++++++++---------- .../windows_x64_debug_build_x64_debug.yml | 2 +- .../windows_x64_release_build_x64_release.yml | 2 +- ...build_x64_release_ep_generic_interface.yml | 2 +- ..._x64_release_vitisai_build_x64_release.yml | 2 +- .../workflows/windows_x64_release_xnnpack.yml | 2 +- .github/workflows/windows_x86.yml | 2 +- cmake/onnxruntime_providers_cuda.cmake | 4 +-- .../gather_block_quantized_op_test.cc | 2 +- onnxruntime/test/providers/cpu/model_tests.cc | 9 ++++++- .../test/providers/cpu/tensor/cast_op_test.cc | 2 +- .../build-perf-test-binaries-pipeline.yml | 2 +- .../c-api-noopenmp-packaging-pipelines.yml | 6 ++--- .../c-api-noopenmp-test-pipelines.yml | 12 ++++----- .../cuda-packaging-pipeline.yml | 10 +++---- .../custom-nuget-packaging-pipeline.yml | 18 +++---------- .../azure-pipelines/jar_package_testing.yml | 4 +-- ...-gpu-tensorrt-cuda-minimal-ci-pipeline.yml | 12 ++++----- .../nuget/templates/test_linux.yml | 6 ++--- .../azure-pipelines/post-merge-jobs.yml | 6 ++--- .../py-cuda-package-test-pipeline.yml | 4 +-- .../py-cuda-packaging-pipeline.yml | 2 +- .../jobs/py-linux-cuda-package-test-job.yml | 14 +++++----- .../stages/nodejs-linux-packaging-stage.yml | 6 ++--- .../nuget-linux-cuda-packaging-stage.yml | 12 ++++----- .../stages/py-gpu-packaging-stage.yml | 6 ++--- .../stages/py-linux-gpu-stage.yml | 6 ++--- .../stages/py-win-gpu-stage.yml | 10 +++---- .../jobs/download_win_gpu_library.yml | 8 +++--- .../templates/jobs/set-winenv.yml | 2 +- .../py-packaging-linux-test-cuda.yml | 6 ++--- .../azure-pipelines/templates/win-ci.yml | 4 +-- .../win-gpu-doc-gen-ci-pipeline.yml | 4 +-- ...-gpu-tensorrt-cuda-minimal-ci-pipeline.yml | 6 ++--- .../linux/docker/Dockerfile.manylinux2_28_cpu | 2 +- .../docker/Dockerfile.manylinux2_28_rocm | 2 +- .../docker/Dockerfile.manylinux2_28_webgpu | 2 +- .../docker/Dockerfile.package_ubuntu_2004_gpu | 2 +- .../inference/aarch64/default/cpu/Dockerfile | 2 +- .../inference/aarch64/python/cpu/Dockerfile | 2 +- .../inference/x86_64/default/cpu/Dockerfile | 2 +- .../x86_64/default/cuda12/Dockerfile | 2 +- .../inference/x86_64/python/cpu/Dockerfile | 2 +- .../x86_64/python/openvino/Dockerfile | 2 +- .../github/windows/setup_env_cuda.bat | 8 +++--- .../ci_build/github/windows/setup_env_gpu.bat | 8 +++--- .../ci_build/github/windows/setup_env_trt.bat | 6 ++--- 51 files changed, 145 insertions(+), 150 deletions(-) diff --git a/.github/workflows/publish-csharp-apidocs.yml b/.github/workflows/publish-csharp-apidocs.yml index 42d1bdc295785..683c5594e82f2 100644 --- a/.github/workflows/publish-csharp-apidocs.yml +++ b/.github/workflows/publish-csharp-apidocs.yml @@ -20,7 +20,7 @@ permissions: jobs: build: - runs-on: ["self-hosted", "1ES.Pool=onnxruntime-github-vs2022-mms"] + runs-on: ["self-hosted", "1ES.Pool=onnxruntime-github-vs2022-latest"] env: DOCFXVERSION: 2.62.2 steps: diff --git a/.github/workflows/windows_cuda.yml b/.github/workflows/windows_cuda.yml index 437fc0e2c6334..3d24d4b6b75b6 100644 --- a/.github/workflows/windows_cuda.yml +++ b/.github/workflows/windows_cuda.yml @@ -19,7 +19,7 @@ concurrency: jobs: build: name: Windows GPU CUDA CI Pipeline - runs-on: ["self-hosted", "1ES.Pool=onnxruntime-github-vs2022-mms"] + runs-on: ["self-hosted", 
"1ES.Pool=onnxruntime-github-vs2022-latest"] steps: - uses: actions/checkout@v5 with: @@ -41,10 +41,10 @@ jobs: working-directory: ${{ github.workspace }} shell: cmd - - name: Download CUDA SDK v12.2 + - name: Download CUDA SDK v12.8 working-directory: ${{ runner.temp }} run: | - azcopy.exe cp --recursive "https://lotusscus.blob.core.windows.net/models/cuda_sdk/v12.2" . + azcopy.exe cp --recursive "https://lotusscus.blob.core.windows.net/models/cuda_sdk/v12.8" . dir shell: pwsh @@ -52,9 +52,9 @@ jobs: shell: powershell run: | Write-Host "Adding CUDA to PATH" - Write-Host "CUDA Path: $env:RUNNER_TEMP\v12.2\bin" - Add-Content -Path $env:GITHUB_PATH -Value "$env:RUNNER_TEMP\v12.2\bin" - Add-Content -Path $env:GITHUB_PATH -Value "$env:RUNNER_TEMP\v12.2\extras\CUPTI\lib64" + Write-Host "CUDA Path: $env:RUNNER_TEMP\v12.8\bin" + Add-Content -Path $env:GITHUB_PATH -Value "$env:RUNNER_TEMP\v12.8\bin" + Add-Content -Path $env:GITHUB_PATH -Value "$env:RUNNER_TEMP\v12.8\extras\CUPTI\lib64" - uses: actions/setup-node@v5 with: @@ -111,7 +111,7 @@ jobs: exit $lastExitCode } # Execute the build process - python.exe ${{ github.workspace }}\tools\ci_build\build.py --update --build --config RelWithDebInfo --build_dir build --skip_submodule_sync --build_csharp --parallel --use_binskim_compliant_compile_flags --cmake_generator "Visual Studio 17 2022" --build_shared_lib --build_wheel --build_java --use_cuda --cuda_home="$env:RUNNER_TEMP\v12.2" --enable_cuda_profiling --use_vcpkg --use_vcpkg_ms_internal_asset_cache --enable_transformers_tool_test --cmake_extra_defines CMAKE_CUDA_ARCHITECTURES=86 --cmake_extra_defines onnxruntime_ENABLE_CUDA_EP_INTERNAL_TESTS=ON + python.exe ${{ github.workspace }}\tools\ci_build\build.py --update --build --config RelWithDebInfo --build_dir build --skip_submodule_sync --build_csharp --parallel --use_binskim_compliant_compile_flags --cmake_generator "Visual Studio 17 2022" --build_shared_lib --build_wheel --build_java --use_cuda --cuda_home="$env:RUNNER_TEMP\v12.8" --enable_cuda_profiling --use_vcpkg --use_vcpkg_ms_internal_asset_cache --enable_transformers_tool_test --cmake_extra_defines CMAKE_CUDA_ARCHITECTURES=86 --cmake_extra_defines onnxruntime_ENABLE_CUDA_EP_INTERNAL_TESTS=ON if ($lastExitCode -ne 0) { exit $lastExitCode } @@ -188,10 +188,10 @@ jobs: working-directory: ${{ github.workspace }} shell: cmd - - name: Download CUDA SDK v12.2 + - name: Download CUDA SDK v12.8 working-directory: ${{ runner.temp }} run: | - azcopy.exe cp --recursive "https://lotusscus.blob.core.windows.net/models/cuda_sdk/v12.2" . + azcopy.exe cp --recursive "https://lotusscus.blob.core.windows.net/models/cuda_sdk/v12.8" . 
dir shell: pwsh @@ -199,9 +199,9 @@ jobs: shell: powershell run: | Write-Host "Adding CUDA to PATH" - Write-Host "CUDA Path: $env:RUNNER_TEMP\v12.2\bin" - Add-Content -Path $env:GITHUB_PATH -Value "$env:RUNNER_TEMP\v12.2\bin" - Add-Content -Path $env:GITHUB_PATH -Value "$env:RUNNER_TEMP\v12.2\extras\CUPTI\lib64" + Write-Host "CUDA Path: $env:RUNNER_TEMP\v12.8\bin" + Add-Content -Path $env:GITHUB_PATH -Value "$env:RUNNER_TEMP\v12.8\bin" + Add-Content -Path $env:GITHUB_PATH -Value "$env:RUNNER_TEMP\v12.8\extras\CUPTI\lib64" - name: Set OnnxRuntimeBuildDirectory shell: pwsh @@ -227,7 +227,7 @@ jobs: exit $lastExitCode } - python.exe ${{ github.workspace }}\tools\ci_build\build.py --test --config RelWithDebInfo --build_dir build --skip_submodule_sync --build_csharp --parallel --use_binskim_compliant_compile_flags --cmake_generator "Visual Studio 17 2022" --build_shared_lib --build_wheel --build_java --use_cuda --cuda_home="$env:RUNNER_TEMP\v12.2" --enable_cuda_profiling --use_vcpkg --use_vcpkg_ms_internal_asset_cache --enable_transformers_tool_test --cmake_extra_defines CMAKE_CUDA_ARCHITECTURES=86 --cmake_extra_defines onnxruntime_ENABLE_CUDA_EP_INTERNAL_TESTS=ON + python.exe ${{ github.workspace }}\tools\ci_build\build.py --test --config RelWithDebInfo --build_dir build --skip_submodule_sync --build_csharp --parallel --use_binskim_compliant_compile_flags --cmake_generator "Visual Studio 17 2022" --build_shared_lib --build_wheel --build_java --use_cuda --cuda_home="$env:RUNNER_TEMP\v12.8" --enable_cuda_profiling --use_vcpkg --use_vcpkg_ms_internal_asset_cache --enable_transformers_tool_test --cmake_extra_defines CMAKE_CUDA_ARCHITECTURES=86 --cmake_extra_defines onnxruntime_ENABLE_CUDA_EP_INTERNAL_TESTS=ON if ($lastExitCode -ne 0) { exit $lastExitCode } diff --git a/.github/workflows/windows_openvino.yml b/.github/workflows/windows_openvino.yml index 395ccfbe70244..b608c0879aa45 100644 --- a/.github/workflows/windows_openvino.yml +++ b/.github/workflows/windows_openvino.yml @@ -18,7 +18,7 @@ concurrency: jobs: BUILD_OPENVINO_EP: name: Windows OpenVINO CI Pipeline - runs-on: ["self-hosted", "1ES.Pool=onnxruntime-github-vs2022-mms"] + runs-on: ["self-hosted", "1ES.Pool=onnxruntime-github-vs2022-latest"] timeout-minutes: 240 env: AZCOPY_AUTO_LOGIN_TYPE: MSI diff --git a/.github/workflows/windows_qnn_x64.yml b/.github/workflows/windows_qnn_x64.yml index 9788792b94fa8..1906fcb18c841 100644 --- a/.github/workflows/windows_qnn_x64.yml +++ b/.github/workflows/windows_qnn_x64.yml @@ -18,7 +18,7 @@ concurrency: jobs: build_test_qnn_ep: name: Windows x64 QNN CI Pipeline (${{ matrix.QnnLibKind }}) - runs-on: ["self-hosted", "1ES.Pool=onnxruntime-github-vs2022-mms"] + runs-on: ["self-hosted", "1ES.Pool=onnxruntime-github-vs2022-latest"] timeout-minutes: 120 strategy: matrix: diff --git a/.github/workflows/windows_tensorrt.yml b/.github/workflows/windows_tensorrt.yml index 5f3dcb9607a47..2a1fe97d9b7b7 100644 --- a/.github/workflows/windows_tensorrt.yml +++ b/.github/workflows/windows_tensorrt.yml @@ -19,7 +19,7 @@ concurrency: jobs: build: name: Windows GPU TensorRT CI Pipeline - runs-on: ["self-hosted", "1ES.Pool=onnxruntime-github-vs2022-mms"] + runs-on: ["self-hosted", "1ES.Pool=onnxruntime-github-vs2022-latest"] steps: - uses: actions/checkout@v5 with: @@ -41,10 +41,10 @@ jobs: working-directory: ${{ github.workspace }} shell: cmd - - name: Download CUDA SDK v12.2 + - name: Download CUDA SDK v12.8 working-directory: ${{ runner.temp }} run: | - azcopy.exe cp --recursive 
"https://lotusscus.blob.core.windows.net/models/cuda_sdk/v12.2" . + azcopy.exe cp --recursive "https://lotusscus.blob.core.windows.net/models/cuda_sdk/v12.8" . dir shell: pwsh @@ -56,9 +56,9 @@ jobs: shell: powershell run: | Write-Host "Adding CUDA to PATH" - Write-Host "CUDA Path: $env:RUNNER_TEMP\v12.2\bin" - Add-Content -Path $env:GITHUB_PATH -Value "$env:RUNNER_TEMP\v12.2\bin" - Add-Content -Path $env:GITHUB_PATH -Value "$env:RUNNER_TEMP\v12.2\extras\CUPTI\lib64" + Write-Host "CUDA Path: $env:RUNNER_TEMP\v12.8\bin" + Add-Content -Path $env:GITHUB_PATH -Value "$env:RUNNER_TEMP\v12.8\bin" + Add-Content -Path $env:GITHUB_PATH -Value "$env:RUNNER_TEMP\v12.8\extras\CUPTI\lib64" Add-Content -Path $env:GITHUB_PATH -Value "$env:RUNNER_TEMP\TensorRT-10.9.0.34.Windows10.x86_64.cuda-12.8\lib" - uses: actions/setup-node@v5 @@ -116,7 +116,7 @@ jobs: exit $lastExitCode } # Execute the build process - python ${{ github.workspace }}\tools\ci_build\build.py --config RelWithDebInfo --parallel --use_binskim_compliant_compile_flags --build_dir build --skip_submodule_sync --build_shared_lib --build --update --cmake_generator "Visual Studio 17 2022" --build_wheel --enable_onnx_tests --use_tensorrt --tensorrt_home="${{ runner.temp }}\TensorRT-10.9.0.34.Windows10.x86_64.cuda-12.8" --cuda_home="${{ runner.temp }}\v12.2" --use_vcpkg --use_vcpkg_ms_internal_asset_cache --cmake_extra_defines CMAKE_CUDA_ARCHITECTURES=86 + python ${{ github.workspace }}\tools\ci_build\build.py --config RelWithDebInfo --parallel --use_binskim_compliant_compile_flags --build_dir build --skip_submodule_sync --build_shared_lib --build --update --cmake_generator "Visual Studio 17 2022" --build_wheel --enable_onnx_tests --use_tensorrt --tensorrt_home="${{ runner.temp }}\TensorRT-10.9.0.34.Windows10.x86_64.cuda-12.8" --cuda_home="${{ runner.temp }}\v12.8" --use_vcpkg --use_vcpkg_ms_internal_asset_cache --cmake_extra_defines CMAKE_CUDA_ARCHITECTURES=86 if ($lastExitCode -ne 0) { exit $lastExitCode } @@ -193,10 +193,10 @@ jobs: working-directory: ${{ github.workspace }} shell: cmd - - name: Download CUDA SDK v12.2 + - name: Download CUDA SDK v12.8 working-directory: ${{ runner.temp }} run: | - azcopy.exe cp --recursive "https://lotusscus.blob.core.windows.net/models/cuda_sdk/v12.2" . + azcopy.exe cp --recursive "https://lotusscus.blob.core.windows.net/models/cuda_sdk/v12.8" . 
dir shell: pwsh @@ -208,9 +208,9 @@ jobs: shell: powershell run: | Write-Host "Adding CUDA to PATH" - Write-Host "CUDA Path: $env:RUNNER_TEMP\v12.2\bin" - Add-Content -Path $env:GITHUB_PATH -Value "$env:RUNNER_TEMP\v12.2\bin" - Add-Content -Path $env:GITHUB_PATH -Value "$env:RUNNER_TEMP\v12.2\extras\CUPTI\lib64" + Write-Host "CUDA Path: $env:RUNNER_TEMP\v12.8\bin" + Add-Content -Path $env:GITHUB_PATH -Value "$env:RUNNER_TEMP\v12.8\bin" + Add-Content -Path $env:GITHUB_PATH -Value "$env:RUNNER_TEMP\v12.8\extras\CUPTI\lib64" Add-Content -Path $env:GITHUB_PATH -Value "$env:RUNNER_TEMP\TensorRT-10.9.0.34.Windows10.x86_64.cuda-12.8\lib" - name: Set OnnxRuntimeBuildDirectory @@ -237,7 +237,7 @@ jobs: exit $lastExitCode } - python ${{ github.workspace }}\tools\ci_build\build.py --config RelWithDebInfo --parallel --use_binskim_compliant_compile_flags --build_dir build --skip_submodule_sync --build_shared_lib --test --cmake_generator "Visual Studio 17 2022" --build_wheel --enable_onnx_tests --use_tensorrt --tensorrt_home="${{ runner.temp }}\TensorRT-10.9.0.34.Windows10.x86_64.cuda-12.8" --cuda_home="${{ runner.temp }}\v12.2" --use_vcpkg --use_vcpkg_ms_internal_asset_cache --cmake_extra_defines CMAKE_CUDA_ARCHITECTURES=86 + python ${{ github.workspace }}\tools\ci_build\build.py --config RelWithDebInfo --parallel --use_binskim_compliant_compile_flags --build_dir build --skip_submodule_sync --build_shared_lib --test --cmake_generator "Visual Studio 17 2022" --build_wheel --enable_onnx_tests --use_tensorrt --tensorrt_home="${{ runner.temp }}\TensorRT-10.9.0.34.Windows10.x86_64.cuda-12.8" --cuda_home="${{ runner.temp }}\v12.8" --use_vcpkg --use_vcpkg_ms_internal_asset_cache --cmake_extra_defines CMAKE_CUDA_ARCHITECTURES=86 if ($lastExitCode -ne 0) { exit $lastExitCode } diff --git a/.github/workflows/windows_x64_debug_build_x64_debug.yml b/.github/workflows/windows_x64_debug_build_x64_debug.yml index 6165375e7a54a..6a1b43e54ed89 100644 --- a/.github/workflows/windows_x64_debug_build_x64_debug.yml +++ b/.github/workflows/windows_x64_debug_build_x64_debug.yml @@ -13,7 +13,7 @@ concurrency: jobs: build_x64_debug: - runs-on: ["self-hosted", "1ES.Pool=onnxruntime-github-vs2022-mms"] + runs-on: ["self-hosted", "1ES.Pool=onnxruntime-github-vs2022-latest"] timeout-minutes: 300 steps: diff --git a/.github/workflows/windows_x64_release_build_x64_release.yml b/.github/workflows/windows_x64_release_build_x64_release.yml index f9d7b0d9e9e04..0bcd282e8dc50 100644 --- a/.github/workflows/windows_x64_release_build_x64_release.yml +++ b/.github/workflows/windows_x64_release_build_x64_release.yml @@ -13,7 +13,7 @@ concurrency: jobs: build_x64_release: - runs-on: ["self-hosted", "1ES.Pool=onnxruntime-github-vs2022-mms"] + runs-on: ["self-hosted", "1ES.Pool=onnxruntime-github-vs2022-latest"] timeout-minutes: 300 steps: diff --git a/.github/workflows/windows_x64_release_ep_generic_interface_build_x64_release_ep_generic_interface.yml b/.github/workflows/windows_x64_release_ep_generic_interface_build_x64_release_ep_generic_interface.yml index 54c13e1e04b0a..3934047266f59 100644 --- a/.github/workflows/windows_x64_release_ep_generic_interface_build_x64_release_ep_generic_interface.yml +++ b/.github/workflows/windows_x64_release_ep_generic_interface_build_x64_release_ep_generic_interface.yml @@ -13,7 +13,7 @@ concurrency: jobs: build_x64_release_ep_generic_interface: - runs-on: ["self-hosted", "1ES.Pool=onnxruntime-github-vs2022-mms"] + runs-on: ["self-hosted", "1ES.Pool=onnxruntime-github-vs2022-latest"] timeout-minutes: 300 
steps: diff --git a/.github/workflows/windows_x64_release_vitisai_build_x64_release.yml b/.github/workflows/windows_x64_release_vitisai_build_x64_release.yml index 06230962b39be..1c38d8e58970c 100644 --- a/.github/workflows/windows_x64_release_vitisai_build_x64_release.yml +++ b/.github/workflows/windows_x64_release_vitisai_build_x64_release.yml @@ -13,7 +13,7 @@ concurrency: jobs: build_x64_release_vitisai: - runs-on: ["self-hosted", "1ES.Pool=onnxruntime-github-vs2022-mms"] + runs-on: ["self-hosted", "1ES.Pool=onnxruntime-github-vs2022-latest"] timeout-minutes: 300 steps: diff --git a/.github/workflows/windows_x64_release_xnnpack.yml b/.github/workflows/windows_x64_release_xnnpack.yml index 21033ef4cbe3c..6eb9f00d3997d 100644 --- a/.github/workflows/windows_x64_release_xnnpack.yml +++ b/.github/workflows/windows_x64_release_xnnpack.yml @@ -13,7 +13,7 @@ concurrency: jobs: build_x64_release_xnnpack: - runs-on: ["self-hosted", "1ES.Pool=onnxruntime-github-vs2022-mms"] + runs-on: ["self-hosted", "1ES.Pool=onnxruntime-github-vs2022-latest"] timeout-minutes: 300 steps: diff --git a/.github/workflows/windows_x86.yml b/.github/workflows/windows_x86.yml index fa1e9362e2f34..597c1c7f4b6cf 100644 --- a/.github/workflows/windows_x86.yml +++ b/.github/workflows/windows_x86.yml @@ -13,7 +13,7 @@ concurrency: jobs: build_x86_release: - runs-on: ["self-hosted", "1ES.Pool=onnxruntime-github-vs2022-mms"] + runs-on: ["self-hosted", "1ES.Pool=onnxruntime-github-vs2022-latest"] timeout-minutes: 300 steps: diff --git a/cmake/onnxruntime_providers_cuda.cmake b/cmake/onnxruntime_providers_cuda.cmake index 68a3e9014b7b0..1d31eb1fbd207 100644 --- a/cmake/onnxruntime_providers_cuda.cmake +++ b/cmake/onnxruntime_providers_cuda.cmake @@ -182,8 +182,8 @@ # Since CUDA 12.8, compiling diagnostics become stricter if (CMAKE_CUDA_COMPILER_VERSION VERSION_GREATER_EQUAL 12.8) - target_compile_options(${target} PRIVATE "$<$:--relocatable-device-code=true>") - set_target_properties(${target} PROPERTIES CUDA_SEPARABLE_COMPILATION ON) + target_compile_options(${target} PRIVATE "$<$:--static-global-template-stub=false>") + if (MSVC) target_compile_options(${target} PRIVATE "$<$:SHELL:-Xcompiler /wd4505>") endif() diff --git a/onnxruntime/test/contrib_ops/gather_block_quantized_op_test.cc b/onnxruntime/test/contrib_ops/gather_block_quantized_op_test.cc index 574ec49da67ea..3bf37ea193245 100644 --- a/onnxruntime/test/contrib_ops/gather_block_quantized_op_test.cc +++ b/onnxruntime/test/contrib_ops/gather_block_quantized_op_test.cc @@ -82,7 +82,7 @@ void CheckDataAndShape(const std::vector& data, const std::vector& s ORT_ENFORCE(static_cast(data.size()) == total_elements, "Data size does not match the shape", "Data size: ", data.size(), ", Expected size: ", total_elements, - ", Shape: ", VectorToString(shape), " Name:", name, " Type:", typeid(T).name()); + ", Shape: ", VectorToString(shape), " Name:", name); } // Combinations: types, gather_axis, quantize_axis, block_size, indices, scale shape vs data shape diff --git a/onnxruntime/test/providers/cpu/model_tests.cc b/onnxruntime/test/providers/cpu/model_tests.cc index cf49601e6c671..ca1a3104e0bed 100644 --- a/onnxruntime/test/providers/cpu/model_tests.cc +++ b/onnxruntime/test/providers/cpu/model_tests.cc @@ -678,7 +678,14 @@ ::std::vector<::std::basic_string> GetParameterStrings() { ORT_TSTR("fp16_coreml_FNS-Candy"), ORT_TSTR("fp16_test_tiny_yolov2"), ORT_TSTR("fp16_test_shufflenet"), - ORT_TSTR("keras2coreml_SimpleRNN_ImageNet")}; + ORT_TSTR("keras2coreml_SimpleRNN_ImageNet"), + 
// models from model zoo. #26274: cuDNN frontend no valid engine + ORT_TSTR("YOLOv3"), + ORT_TSTR("YOLOv3-12"), + ORT_TSTR("YOLOv4"), + ORT_TSTR("SSD-MobilenetV1"), + ORT_TSTR("SSD-MobilenetV1-12")}; + // For ROCm EP, also disable the following tests due to flakiness, // mainly with precision issue and random memory access fault. static const ORTCHAR_T* rocm_disabled_tests[] = {ORT_TSTR("bvlc_alexnet"), diff --git a/onnxruntime/test/providers/cpu/tensor/cast_op_test.cc b/onnxruntime/test/providers/cpu/tensor/cast_op_test.cc index 8f4c4ff0896ba..289e94397fb39 100644 --- a/onnxruntime/test/providers/cpu/tensor/cast_op_test.cc +++ b/onnxruntime/test/providers/cpu/tensor/cast_op_test.cc @@ -1477,7 +1477,7 @@ template void CastOpTestFloatFloat4(std::vector shape, std::vector float_data, bool is_fp4_input = false) { - size_t num_pairs = float_data.size() / 2; + int num_pairs = static_cast(float_data.size()) / 2; int num_fp4_elements = static_cast((float_data.size() + 1) / 2); bool is_odd_count = (float_data.size() % 2 != 0); diff --git a/tools/ci_build/github/azure-pipelines/build-perf-test-binaries-pipeline.yml b/tools/ci_build/github/azure-pipelines/build-perf-test-binaries-pipeline.yml index 53b62762319ba..e54216fe4ef4e 100644 --- a/tools/ci_build/github/azure-pipelines/build-perf-test-binaries-pipeline.yml +++ b/tools/ci_build/github/azure-pipelines/build-perf-test-binaries-pipeline.yml @@ -31,5 +31,5 @@ stages: machine_pool: 'onnxruntime-Ubuntu2404-AMD-CPU' extra_build_arg: '' cmake_build_type: Release - cuda_version: 12.2 + cuda_version: 12.8 docker_base_image: onnxruntimebuildcache.azurecr.io/internal/azureml/onnxruntime/build/cuda12_x64_almalinux8_gcc12:20250714.2 \ No newline at end of file diff --git a/tools/ci_build/github/azure-pipelines/c-api-noopenmp-packaging-pipelines.yml b/tools/ci_build/github/azure-pipelines/c-api-noopenmp-packaging-pipelines.yml index 91736752e22d4..086d65c93062b 100644 --- a/tools/ci_build/github/azure-pipelines/c-api-noopenmp-packaging-pipelines.yml +++ b/tools/ci_build/github/azure-pipelines/c-api-noopenmp-packaging-pipelines.yml @@ -73,12 +73,12 @@ variables: - name: ReleaseVersionSuffix value: '' - name: win_trt_version - value: 12.2 + value: 12.8 - name: win_trt_home value: $(Agent.TempDirectory)\${{ variables.win_trt_folder_cuda12 }} - name: win_cuda_home - value: $(Agent.TempDirectory)\v12.2 + value: $(Agent.TempDirectory)\v12.8 extends: # The pipeline extends the 1ES PT which will inject different SDL and compliance tasks. # For non-production pipelines, use "Unofficial" as defined below. 
@@ -142,7 +142,7 @@ extends: - template: stages/nuget-combine-cuda-stage.yml parameters: - CudaVersion: 12.2 + CudaVersion: 12.8 RunOnnxRuntimeTests: ${{ parameters.RunOnnxRuntimeTests }} UseIncreasedTimeoutForTests: ${{ parameters.UseIncreasedTimeoutForTests }} win_trt_home: ${{ variables.win_trt_home }} diff --git a/tools/ci_build/github/azure-pipelines/c-api-noopenmp-test-pipelines.yml b/tools/ci_build/github/azure-pipelines/c-api-noopenmp-test-pipelines.yml index 46363c07b3e3e..7e107c33ed8c0 100644 --- a/tools/ci_build/github/azure-pipelines/c-api-noopenmp-test-pipelines.yml +++ b/tools/ci_build/github/azure-pipelines/c-api-noopenmp-test-pipelines.yml @@ -127,7 +127,7 @@ stages: NugetPackageName: 'Microsoft.ML.OnnxRuntime.Gpu' ArtifactSuffix: 'GPU' StageSuffix: 'GPU' - CudaVersion: 12.2 + CudaVersion: 12.8 - template: nuget/templates/test_win.yml parameters: @@ -136,7 +136,7 @@ stages: ArtifactSuffix: 'GPU' StageSuffix: 'GPU' MoreSuffix: '_Windows' - CudaVersion: 12.2 + CudaVersion: 12.8 - template: nuget/templates/test_linux.yml parameters: @@ -144,7 +144,7 @@ stages: ArtifactSuffix: 'GPU' StageSuffix: 'GPU' NugetPackageName: 'Microsoft.ML.OnnxRuntime.Gpu' - CudaVersion: 12.2 + CudaVersion: 12.8 - template: nuget/templates/test_linux.yml parameters: @@ -153,7 +153,7 @@ stages: StageSuffix: 'GPU' MoreSuffix: '_Linux' NugetPackageName: 'Microsoft.ML.OnnxRuntime.Gpu.Linux' - CudaVersion: 12.2 + CudaVersion: 12.8 @@ -202,7 +202,7 @@ stages: - template: templates/jobs/download_win_gpu_library.yml parameters: - CudaVersion: 12.2 + CudaVersion: 12.8 DownloadCUDA: true DownloadTRT: true @@ -257,7 +257,7 @@ stages: - template: templates/jobs/download_win_gpu_library.yml parameters: - CudaVersion: 12.2 + CudaVersion: 12.8 DownloadCUDA: true DownloadTRT: true diff --git a/tools/ci_build/github/azure-pipelines/cuda-packaging-pipeline.yml b/tools/ci_build/github/azure-pipelines/cuda-packaging-pipeline.yml index 5535d7b4f264d..d7fc0efbf45ea 100644 --- a/tools/ci_build/github/azure-pipelines/cuda-packaging-pipeline.yml +++ b/tools/ci_build/github/azure-pipelines/cuda-packaging-pipeline.yml @@ -48,9 +48,9 @@ parameters: - name: CudaVersion displayName: CUDA version type: string - default: '12.2' + default: '12.8' values: - - 12.2 + - 12.8 variables: - template: templates/common-variables.yml @@ -59,13 +59,13 @@ variables: - name: win_trt_home ${{ if eq(parameters.CudaVersion, '11.8') }}: value: $(Agent.TempDirectory)\${{ variables.win_trt_folder_cuda11 }} - ${{ if eq(parameters.CudaVersion, '12.2') }}: + ${{ if eq(parameters.CudaVersion, '12.8') }}: value: $(Agent.TempDirectory)\${{ variables.win_trt_folder_cuda12 }} - name: win_cuda_home ${{ if eq(parameters.CudaVersion, '11.8') }}: value: $(Agent.TempDirectory)\v11.8 - ${{ if eq(parameters.CudaVersion, '12.2') }}: - value: $(Agent.TempDirectory)\v12.2 + ${{ if eq(parameters.CudaVersion, '12.8') }}: + value: $(Agent.TempDirectory)\v12.8 resources: repositories: diff --git a/tools/ci_build/github/azure-pipelines/custom-nuget-packaging-pipeline.yml b/tools/ci_build/github/azure-pipelines/custom-nuget-packaging-pipeline.yml index 1ad6f411d9848..5ce6ec278b1e7 100644 --- a/tools/ci_build/github/azure-pipelines/custom-nuget-packaging-pipeline.yml +++ b/tools/ci_build/github/azure-pipelines/custom-nuget-packaging-pipeline.yml @@ -1,7 +1,7 @@ parameters: - name: CudaVersion type: string - default: '12.2' + default: '12.8' - name: QnnSdk displayName: QNN SDK Version @@ -40,8 +40,8 @@ variables: - name: win_cuda_home ${{ if eq(parameters.CudaVersion, '11.8') 
}}: value: $(Agent.TempDirectory)\v11.8 - ${{ if eq(parameters.CudaVersion, '12.2') }}: - value: $(Agent.TempDirectory)\v12.2 + ${{ if eq(parameters.CudaVersion, '12.8') }}: + value: $(Agent.TempDirectory)\v12.8 resources: repositories: @@ -178,9 +178,6 @@ extends: inputs: targetType: 'inline' script: | - mkdir -p $(Build.BinariesDirectory)/osx-x64 - Move-Item -Path $(Build.BinariesDirectory)/osx/onnxruntime-osx-x86_64* -Destination $(Build.BinariesDirectory)/osx-x64 - mkdir -p $(Build.BinariesDirectory)/osx-arm64 Move-Item -Path $(Build.BinariesDirectory)/osx/onnxruntime-osx-arm64* -Destination $(Build.BinariesDirectory)/osx-arm64 @@ -200,12 +197,6 @@ extends: foreach ($dir in $dirs) { Write-Host "Directory: $($dir.FullName)" } - $osx_x64_archive = Get-ChildItem -Path $(Build.BinariesDirectory)/osx-x64 -Filter onnxruntime-osx-x86_64* - if ($osx_x64_archive.Count -eq 0) { - Write-Host "No osx-x64 archive found." - } else { - Write-Host "osx-x64 archive found: $($osx_x64_archive[0].FullName)" - } $osx_arm64_archive = Get-ChildItem -Path $(Build.BinariesDirectory)/osx-arm64 -Filter onnxruntime-osx-arm64* if ($osx_arm64_archive.Count -eq 0) { Write-Host "No osx-arm64 archive found." @@ -233,13 +224,10 @@ extends: script: | Expand-Archive -Path $(Build.BinariesDirectory)/win-x64/onnxruntime-win-x64-cuda*.zip -DestinationPath $(Build.BinariesDirectory)/win-x64 Expand-Archive -Path $(Build.BinariesDirectory)/win-arm64/onnxruntime-win-arm64x-qnn*.zip -DestinationPath $(Build.BinariesDirectory)/win-arm64 - $osx_x64_archive = (Get-ChildItem -Path $(Build.BinariesDirectory)/osx-x64 -Filter onnxruntime-osx-x86_64*)[0].FullName $osx_arm64_archive = (Get-ChildItem -Path $(Build.BinariesDirectory)/osx-arm64 -Filter onnxruntime-osx-arm64*)[0].FullName - tar -xzf $osx_x64_archive -C $(Build.BinariesDirectory)/osx-x64 2>$null tar -xzf $osx_arm64_archive -C $(Build.BinariesDirectory)/osx-arm64 2>$null $win_x64 = (Get-ChildItem -Path $(Build.BinariesDirectory)/win-x64 -Filter onnxruntime-win-x64-cuda*)[0].FullName $win_arm64 = (Get-ChildItem -Path $(Build.BinariesDirectory)/win-arm64 -Filter onnxruntime-win-arm64x-qnn*)[0].FullName - $osx_x64 = (Get-ChildItem -Path $(Build.BinariesDirectory)/osx-x64 -Filter onnxruntime-osx-x86_64*)[0].FullName $osx_arm64 = (Get-ChildItem -Path $(Build.BinariesDirectory)/osx-arm64 -Filter onnxruntime-osx-arm64*)[0].FullName Write-Host "##vso[task.setvariable variable=win_x64;]$win_x64" Write-Host "##vso[task.setvariable variable=win_arm64;]$win_arm64" diff --git a/tools/ci_build/github/azure-pipelines/jar_package_testing.yml b/tools/ci_build/github/azure-pipelines/jar_package_testing.yml index d387c07d6dc6e..463c02203e21a 100644 --- a/tools/ci_build/github/azure-pipelines/jar_package_testing.yml +++ b/tools/ci_build/github/azure-pipelines/jar_package_testing.yml @@ -40,7 +40,7 @@ stages: - template: templates/jobs/download_win_gpu_library.yml parameters: - CudaVersion: 12.2 + CudaVersion: 12.8 DownloadCUDA: true DownloadTRT: true @@ -105,7 +105,7 @@ stages: - name: runCodesignValidationInjection value: false - name: docker_base_image - value: onnxruntimebuildcache.azurecr.io/internal/azureml/onnxruntime/build/cuda12_x64_almalinux8_gcc12:20250724.1 + value: onnxruntimebuildcache.azurecr.io/internal/azureml/onnxruntime/build/cuda12_x64_almalinux8_gcc12:20251008.2 timeoutInMinutes: 60 steps: - checkout: self diff --git a/tools/ci_build/github/azure-pipelines/linux-gpu-tensorrt-cuda-minimal-ci-pipeline.yml 
b/tools/ci_build/github/azure-pipelines/linux-gpu-tensorrt-cuda-minimal-ci-pipeline.yml index 0410001d77d13..5e6671e3797ce 100644 --- a/tools/ci_build/github/azure-pipelines/linux-gpu-tensorrt-cuda-minimal-ci-pipeline.yml +++ b/tools/ci_build/github/azure-pipelines/linux-gpu-tensorrt-cuda-minimal-ci-pipeline.yml @@ -31,21 +31,21 @@ parameters: - name: CudaVersion displayName: CUDA version type: string - default: '12.2' + default: '12.8' values: - - 12.2 + - 12.8 variables: - template: templates/common-variables.yml - name: docker_base_image ${{ if eq(parameters.CudaVersion, '11.8') }}: - value: onnxruntimebuildcache.azurecr.io/internal/azureml/onnxruntime/build/cuda11_x64_almalinux8_gcc11:20250724.1 - ${{ if eq(parameters.CudaVersion, '12.2') }}: - value: onnxruntimebuildcache.azurecr.io/internal/azureml/onnxruntime/build/cuda12_x64_almalinux8_gcc12:20250724.1 + value: onnxruntimebuildcache.azurecr.io/internal/azureml/onnxruntime/build/cuda11_x64_almalinux8_gcc11:20251008.2 + ${{ if eq(parameters.CudaVersion, '12.8') }}: + value: onnxruntimebuildcache.azurecr.io/internal/azureml/onnxruntime/build/cuda12_x64_almalinux8_gcc12:20251008.2 - name: linux_trt_version ${{ if eq(parameters.CudaVersion, '11.8') }}: value: ${{ variables.linux_trt_version_cuda11 }} - ${{ if eq(parameters.CudaVersion, '12.2') }}: + ${{ if eq(parameters.CudaVersion, '12.8') }}: value: ${{ variables.linux_trt_version_cuda12 }} jobs: diff --git a/tools/ci_build/github/azure-pipelines/nuget/templates/test_linux.yml b/tools/ci_build/github/azure-pipelines/nuget/templates/test_linux.yml index 89ce3f3c86727..b60ef7576184e 100644 --- a/tools/ci_build/github/azure-pipelines/nuget/templates/test_linux.yml +++ b/tools/ci_build/github/azure-pipelines/nuget/templates/test_linux.yml @@ -58,9 +58,9 @@ stages: parameters: Dockerfile: tools/ci_build/github/linux/docker/Dockerfile.package_ubuntu_2004_gpu Context: tools/ci_build/github/linux/docker/ - ${{ if eq(parameters.CudaVersion, '12.2') }}: + ${{ if eq(parameters.CudaVersion, '12.8') }}: DockerBuildArgs: " - --build-arg BASEIMAGE=nvidia/cuda:12.2.2-devel-ubuntu20.04 + --build-arg BASEIMAGE=nvidia/cuda:12.8.1-cudnn-devel-ubuntu20.04 --build-arg TRT_VERSION=${{ replace(variables.linux_trt_version_cuda12, '-1.', '-1+') }} --build-arg BUILD_UID=$( id -u ) " @@ -107,4 +107,4 @@ stages: DisableContribOps: $(DisableContribOps) DisableMlOps: $(DisableMlOps) IsReleaseBuild: $(IsReleaseBuild) - PACKAGENAME: ${{ parameters.NugetPackageName }} \ No newline at end of file + PACKAGENAME: ${{ parameters.NugetPackageName }} diff --git a/tools/ci_build/github/azure-pipelines/post-merge-jobs.yml b/tools/ci_build/github/azure-pipelines/post-merge-jobs.yml index deb8b84bf19b8..fdfafd4d9a179 100644 --- a/tools/ci_build/github/azure-pipelines/post-merge-jobs.yml +++ b/tools/ci_build/github/azure-pipelines/post-merge-jobs.yml @@ -2,16 +2,16 @@ parameters: - name: CudaVersion displayName: CUDA version type: string - default: '12.2' + default: '12.8' values: - - 12.2 + - 12.8 variables: - template: templates/common-variables.yml - name: win_trt_folder ${{ if eq(parameters.CudaVersion, '11.8') }}: value: ${{ variables.win_trt_folder_cuda11 }} - ${{ if eq(parameters.CudaVersion, '12.2') }}: + ${{ if eq(parameters.CudaVersion, '12.8') }}: value: ${{ variables.win_trt_folder_cuda12 }} stages: diff --git a/tools/ci_build/github/azure-pipelines/py-cuda-package-test-pipeline.yml b/tools/ci_build/github/azure-pipelines/py-cuda-package-test-pipeline.yml index c2c89686a077e..02b6a6df76611 100644 --- 
a/tools/ci_build/github/azure-pipelines/py-cuda-package-test-pipeline.yml +++ b/tools/ci_build/github/azure-pipelines/py-cuda-package-test-pipeline.yml @@ -18,8 +18,8 @@ stages: machine_pool: 'Onnxruntime-Linux-GPU' python_wheel_suffix: '_gpu' timeout: 480 - docker_base_image: onnxruntimebuildcache.azurecr.io/internal/azureml/onnxruntime/build/cuda12_x64_almalinux8_gcc12:20250724.1 - cuda_version: '12.2' + docker_base_image: onnxruntimebuildcache.azurecr.io/internal/azureml/onnxruntime/build/cuda12_x64_almalinux8_gcc12:20251008.2 + cuda_version: '12.8' - stage: Republish_Wheels dependsOn: diff --git a/tools/ci_build/github/azure-pipelines/py-cuda-packaging-pipeline.yml b/tools/ci_build/github/azure-pipelines/py-cuda-packaging-pipeline.yml index 4c536bad45368..290af4a3e4449 100644 --- a/tools/ci_build/github/azure-pipelines/py-cuda-packaging-pipeline.yml +++ b/tools/ci_build/github/azure-pipelines/py-cuda-packaging-pipeline.yml @@ -49,4 +49,4 @@ extends: - template: stages/py-gpu-packaging-stage.yml parameters: cmake_build_type: ${{ parameters.cmake_build_type }} - cuda_version: '12.2' + cuda_version: '12.8' diff --git a/tools/ci_build/github/azure-pipelines/stages/jobs/py-linux-cuda-package-test-job.yml b/tools/ci_build/github/azure-pipelines/stages/jobs/py-linux-cuda-package-test-job.yml index 858de4d173484..b53aee639372d 100644 --- a/tools/ci_build/github/azure-pipelines/stages/jobs/py-linux-cuda-package-test-job.yml +++ b/tools/ci_build/github/azure-pipelines/stages/jobs/py-linux-cuda-package-test-job.yml @@ -2,9 +2,9 @@ parameters: - name: CudaVersion displayName: 'CUDA version' type: string - default: '12.2' + default: '12.8' values: - - 12.2 + - 12.8 - name: machine_pool type: string @@ -44,13 +44,13 @@ jobs: - template: ../../templates/common-variables.yml - name: docker_base_image ${{ if eq(parameters.CudaVersion, '11.8') }}: - value: onnxruntimebuildcache.azurecr.io/internal/azureml/onnxruntime/build/cuda11_x64_almalinux8_gcc11:20250724.1 - ${{ if eq(parameters.CudaVersion, '12.2') }}: - value: onnxruntimebuildcache.azurecr.io/internal/azureml/onnxruntime/build/cuda12_x64_almalinux8_gcc12:20250724.1 + value: onnxruntimebuildcache.azurecr.io/internal/azureml/onnxruntime/build/cuda11_x64_almalinux8_gcc11:20251008.2 + ${{ if eq(parameters.CudaVersion, '12.8') }}: + value: onnxruntimebuildcache.azurecr.io/internal/azureml/onnxruntime/build/cuda12_x64_almalinux8_gcc12:20251008.2 - name: linux_trt_version ${{ if eq(parameters.CudaVersion, '11.8') }}: value: ${{ variables.linux_trt_version_cuda11 }} - ${{ if eq(parameters.CudaVersion, '12.2') }}: + ${{ if eq(parameters.CudaVersion, '12.8') }}: value: ${{ variables.linux_trt_version_cuda12 }} pool: ${{ parameters.machine_pool }} steps: @@ -105,4 +105,4 @@ jobs: inputs: targetType: filePath filePath: tools/ci_build/github/linux/run_python_dockertest.sh - arguments: -d GPU -c ${{parameters.cmake_build_type}} -i onnxruntimecuda${{ replace(parameters.CudaVersion, '.', '') }}xtrt86buildx86_64 -u 12.2 + arguments: -d GPU -c ${{parameters.cmake_build_type}} -i onnxruntimecuda${{ replace(parameters.CudaVersion, '.', '') }}xtrt86buildx86_64 -u 12.8 diff --git a/tools/ci_build/github/azure-pipelines/stages/nodejs-linux-packaging-stage.yml b/tools/ci_build/github/azure-pipelines/stages/nodejs-linux-packaging-stage.yml index bca95a4a2fd02..8cbb81ba89c12 100644 --- a/tools/ci_build/github/azure-pipelines/stages/nodejs-linux-packaging-stage.yml +++ b/tools/ci_build/github/azure-pipelines/stages/nodejs-linux-packaging-stage.yml @@ -1,7 +1,7 @@ 
parameters: - name: CudaVersion type: string - default: '12.2' + default: '12.8' stages: - stage: Linux_Nodejs_Packaging_x64 @@ -20,14 +20,14 @@ stages: - name: CUDA_VERSION_MAJOR ${{ if eq(parameters.CudaVersion, '11.8') }}: value: '11' - ${{ if eq(parameters.CudaVersion, '12.2') }}: + ${{ if eq(parameters.CudaVersion, '12.8') }}: value: '12' - name: CUDA_VERSION value: ${{ parameters.CudaVersion }} - name: linux_trt_version ${{ if eq(parameters.CudaVersion, '11.8') }}: value: ${{ variables.linux_trt_version_cuda11 }} - ${{ if eq(parameters.CudaVersion, '12.2') }}: + ${{ if eq(parameters.CudaVersion, '12.8') }}: value: ${{ variables.linux_trt_version_cuda12 }} steps: - checkout: self diff --git a/tools/ci_build/github/azure-pipelines/stages/nuget-linux-cuda-packaging-stage.yml b/tools/ci_build/github/azure-pipelines/stages/nuget-linux-cuda-packaging-stage.yml index 121e80fca1021..b1e5f541b90e0 100644 --- a/tools/ci_build/github/azure-pipelines/stages/nuget-linux-cuda-packaging-stage.yml +++ b/tools/ci_build/github/azure-pipelines/stages/nuget-linux-cuda-packaging-stage.yml @@ -1,7 +1,7 @@ parameters: - name: CudaVersion type: string - default: '12.2' + default: '12.8' - name: buildJava type: boolean - name: buildNodejs @@ -22,7 +22,7 @@ stages: - name: CUDA_VERSION_MAJOR ${{ if eq(parameters.CudaVersion, '11.8') }}: value: '11' - ${{ if eq(parameters.CudaVersion, '12.2') }}: + ${{ if eq(parameters.CudaVersion, '12.8') }}: value: '12' - name: CUDA_VERSION value: ${{ parameters.CudaVersion }} @@ -74,14 +74,14 @@ stages: - name: CUDA_VERSION_MAJOR ${{ if eq(parameters.CudaVersion, '11.8') }}: value: '11' - ${{ if eq(parameters.CudaVersion, '12.2') }}: + ${{ if eq(parameters.CudaVersion, '12.8') }}: value: '12' - name: CUDA_VERSION value: ${{ parameters.CudaVersion }} - name: linux_trt_version ${{ if eq(parameters.CudaVersion, '11.8') }}: value: ${{ variables.linux_trt_version_cuda11 }} - ${{ if eq(parameters.CudaVersion, '12.2') }}: + ${{ if eq(parameters.CudaVersion, '12.8') }}: value: ${{ variables.linux_trt_version_cuda12 }} steps: - checkout: self @@ -140,12 +140,12 @@ stages: - name: CUDA_VERSION_MAJOR ${{ if eq(parameters.CudaVersion, '11.8') }}: value: '11' - ${{ if eq(parameters.CudaVersion, '12.2') }}: + ${{ if eq(parameters.CudaVersion, '12.8') }}: value: '12' - name: linux_trt_version ${{ if eq(parameters.CudaVersion, '11.8') }}: value: ${{ variables.linux_trt_version_cuda11 }} - ${{ if eq(parameters.CudaVersion, '12.2') }}: + ${{ if eq(parameters.CudaVersion, '12.8') }}: value: ${{ variables.linux_trt_version_cuda12 }} steps: - checkout: self # due to checkout multiple repos, the root directory is $(Build.SourcesDirectory)/onnxruntime diff --git a/tools/ci_build/github/azure-pipelines/stages/py-gpu-packaging-stage.yml b/tools/ci_build/github/azure-pipelines/stages/py-gpu-packaging-stage.yml index d3d4b8f5b64d5..3c5cf591039e0 100644 --- a/tools/ci_build/github/azure-pipelines/stages/py-gpu-packaging-stage.yml +++ b/tools/ci_build/github/azure-pipelines/stages/py-gpu-packaging-stage.yml @@ -19,9 +19,9 @@ parameters: - name: cuda_version type: string displayName: 'CUDA version. Windows Only.' 
- default: '12.2' + default: '12.8' values: - - 12.2 + - 12.8 - name: PythonVersions type: object @@ -48,4 +48,4 @@ stages: extra_build_arg: ${{ parameters.build_py_parameters }} cmake_build_type: ${{ parameters.cmake_build_type }} cuda_version: ${{ parameters.cuda_version }} - docker_base_image: onnxruntimebuildcache.azurecr.io/internal/azureml/onnxruntime/build/cuda12_x64_almalinux8_gcc12:20250724.1 + docker_base_image: onnxruntimebuildcache.azurecr.io/internal/azureml/onnxruntime/build/cuda12_x64_almalinux8_gcc12:20251008.2 diff --git a/tools/ci_build/github/azure-pipelines/stages/py-linux-gpu-stage.yml b/tools/ci_build/github/azure-pipelines/stages/py-linux-gpu-stage.yml index 715470eb9f012..ab1fb919af413 100644 --- a/tools/ci_build/github/azure-pipelines/stages/py-linux-gpu-stage.yml +++ b/tools/ci_build/github/azure-pipelines/stages/py-linux-gpu-stage.yml @@ -22,9 +22,9 @@ parameters: - name: cuda_version type: string - default: '12.2' + default: '12.8' values: - - 12.2 + - 12.8 stages: - stage: Linux_py_GPU_Wheels_${{ parameters.arch }} @@ -55,7 +55,7 @@ stages: - name: trt_version ${{ if eq(parameters.cuda_version, '11.8') }}: value: ${{ variables.linux_trt_version_cuda11 }} - ${{ if eq(parameters.cuda_version, '12.2') }}: + ${{ if eq(parameters.cuda_version, '12.8') }}: value: ${{ variables.linux_trt_version_cuda12 }} steps: - checkout: self diff --git a/tools/ci_build/github/azure-pipelines/stages/py-win-gpu-stage.yml b/tools/ci_build/github/azure-pipelines/stages/py-win-gpu-stage.yml index e2683c04f21f2..c3957fc8341de 100644 --- a/tools/ci_build/github/azure-pipelines/stages/py-win-gpu-stage.yml +++ b/tools/ci_build/github/azure-pipelines/stages/py-win-gpu-stage.yml @@ -20,9 +20,9 @@ parameters: default: '' - name: CudaVersion type: string - default: '12.2' + default: '12.8' values: - - 12.2 + - 12.8 - name: cmake_build_type type: string @@ -47,7 +47,7 @@ stages: workspace: clean: all pool: - name: onnxruntime-Win-CPU-2022 + name: onnxruntime-Win-CPU-VS2022-Latest os: windows templateContext: sdl: @@ -76,7 +76,7 @@ stages: - name: win_trt_folder ${{ if eq(parameters.CudaVersion, '11.8') }}: value: ${{ variables.win_trt_folder_cuda11 }} - ${{ if eq(parameters.CudaVersion, '12.2') }}: + ${{ if eq(parameters.CudaVersion, '12.8') }}: value: ${{ variables.win_trt_folder_cuda12 }} - name: trt_build_flag ${{ if eq(parameters.use_tensorrt, true) }}: @@ -119,7 +119,7 @@ stages: --cmake_generator "$(VSGenerator)" --enable_pybind --enable_onnx_tests - --parallel 8 --use_vcpkg --use_vcpkg_ms_internal_asset_cache --use_binskim_compliant_compile_flags --update --build --msvc_toolset 14.40 + --parallel 8 --use_vcpkg --use_vcpkg_ms_internal_asset_cache --use_binskim_compliant_compile_flags --update --build $(TelemetryOption) ${{ parameters.BUILD_PY_PARAMETERS }} ${{ parameters.EP_BUILD_FLAGS }} ${{ variables.trt_build_flag }} workingDirectory: '$(Build.BinariesDirectory)' diff --git a/tools/ci_build/github/azure-pipelines/templates/jobs/download_win_gpu_library.yml b/tools/ci_build/github/azure-pipelines/templates/jobs/download_win_gpu_library.yml index 681138a5ab3d1..be213337091e8 100644 --- a/tools/ci_build/github/azure-pipelines/templates/jobs/download_win_gpu_library.yml +++ b/tools/ci_build/github/azure-pipelines/templates/jobs/download_win_gpu_library.yml @@ -7,10 +7,10 @@ parameters: default: false - name: CudaVersion type: string - default: '12.2' + default: '12.8' values: - 11.8 - - 12.2 + - 12.8 - name: TrtVersion type: string default: '10.9.0.34' @@ -46,11 +46,11 @@ steps: - 
powershell: | Write-Host "##vso[task.setvariable variable=trtCudaVersion;]11.8" displayName: Set trtCudaVersion - - ${{ if and(eq(parameters.CudaVersion, '12.2'), eq(parameters.TrtVersion, '8.6.1.6')) }}: + - ${{ if and(eq(parameters.CudaVersion, '12.8'), eq(parameters.TrtVersion, '8.6.1.6')) }}: - powershell: | Write-Host "##vso[task.setvariable variable=trtCudaVersion;]12.0" displayName: Set trtCudaVersion - - ${{ if and(eq(parameters.CudaVersion, '12.2'), eq(parameters.TrtVersion, '10.9.0.34')) }}: + - ${{ if and(eq(parameters.CudaVersion, '12.8'), eq(parameters.TrtVersion, '10.9.0.34')) }}: - powershell: | Write-Host "##vso[task.setvariable variable=trtCudaVersion;]12.8" displayName: Set trtCudaVersion diff --git a/tools/ci_build/github/azure-pipelines/templates/jobs/set-winenv.yml b/tools/ci_build/github/azure-pipelines/templates/jobs/set-winenv.yml index 96436883fb8b8..d7c940cda30f4 100644 --- a/tools/ci_build/github/azure-pipelines/templates/jobs/set-winenv.yml +++ b/tools/ci_build/github/azure-pipelines/templates/jobs/set-winenv.yml @@ -9,7 +9,7 @@ parameters: default: false - name: PrimaryCUDAVersion type: string - default: '12.2' + default: '12.8' # - name: SecondaryCUDAVersion # type: string # default: '11.8' diff --git a/tools/ci_build/github/azure-pipelines/templates/py-packaging-linux-test-cuda.yml b/tools/ci_build/github/azure-pipelines/templates/py-packaging-linux-test-cuda.yml index 1415586521f30..263f73a9e29b0 100644 --- a/tools/ci_build/github/azure-pipelines/templates/py-packaging-linux-test-cuda.yml +++ b/tools/ci_build/github/azure-pipelines/templates/py-packaging-linux-test-cuda.yml @@ -18,9 +18,9 @@ parameters: - name: cuda_version type: string - default: '12.2' + default: '12.8' values: - - 12.2 + - 12.8 # TODO: Ideally it should fetch information from the build that triggers it - name: cmake_build_type @@ -46,7 +46,7 @@ jobs: - name: trt_version ${{ if eq(parameters.cuda_version, '11.8') }}: value: ${{ variables.linux_trt_version_cuda11 }} - ${{ if eq(parameters.cuda_version, '12.2') }}: + ${{ if eq(parameters.cuda_version, '12.8') }}: value: ${{ variables.linux_trt_version_cuda12 }} workspace: clean: all diff --git a/tools/ci_build/github/azure-pipelines/templates/win-ci.yml b/tools/ci_build/github/azure-pipelines/templates/win-ci.yml index 0310735d94b2e..ca698123a04e7 100644 --- a/tools/ci_build/github/azure-pipelines/templates/win-ci.yml +++ b/tools/ci_build/github/azure-pipelines/templates/win-ci.yml @@ -78,7 +78,7 @@ parameters: default: '11.8' values: - 11.8 - - 12.2 + - 12.8 - name: SpecificArtifact displayName: Use Specific Artifact @@ -136,7 +136,7 @@ stages: ${{ if contains(parameters.ort_build_pool_name, 'GPU') }}: pool: - name: onnxruntime-Win-CPU-2022 + name: onnxruntime-Win-CPU-VS2022-Latest os: windows ${{ else }}: pool: diff --git a/tools/ci_build/github/azure-pipelines/win-gpu-doc-gen-ci-pipeline.yml b/tools/ci_build/github/azure-pipelines/win-gpu-doc-gen-ci-pipeline.yml index c20f4a2c1bd19..8b320b0ceb4ac 100644 --- a/tools/ci_build/github/azure-pipelines/win-gpu-doc-gen-ci-pipeline.yml +++ b/tools/ci_build/github/azure-pipelines/win-gpu-doc-gen-ci-pipeline.yml @@ -32,10 +32,10 @@ parameters: - name: CudaVersion displayName: CUDA version type: string - default: '12.2' + default: '12.8' values: - 11.8 - - 12.2 + - 12.8 stages: - stage: kernelDocumentation diff --git a/tools/ci_build/github/azure-pipelines/win-gpu-tensorrt-cuda-minimal-ci-pipeline.yml b/tools/ci_build/github/azure-pipelines/win-gpu-tensorrt-cuda-minimal-ci-pipeline.yml index 
c12bb3552920c..08953749f6527 100644 --- a/tools/ci_build/github/azure-pipelines/win-gpu-tensorrt-cuda-minimal-ci-pipeline.yml +++ b/tools/ci_build/github/azure-pipelines/win-gpu-tensorrt-cuda-minimal-ci-pipeline.yml @@ -31,16 +31,16 @@ parameters: - name: CudaVersion displayName: CUDA version type: string - default: '12.2' + default: '12.8' values: - - 12.2 + - 12.8 variables: - template: templates/common-variables.yml - name: win_trt_folder ${{ if eq(parameters.CudaVersion, '11.8') }}: value: ${{ variables.win_trt_folder_cuda11 }} - ${{ if eq(parameters.CudaVersion, '12.2') }}: + ${{ if eq(parameters.CudaVersion, '12.8') }}: value: ${{ variables.win_trt_folder_cuda12 }} jobs: diff --git a/tools/ci_build/github/linux/docker/Dockerfile.manylinux2_28_cpu b/tools/ci_build/github/linux/docker/Dockerfile.manylinux2_28_cpu index 2a65e7c26b20b..a277286866e41 100644 --- a/tools/ci_build/github/linux/docker/Dockerfile.manylinux2_28_cpu +++ b/tools/ci_build/github/linux/docker/Dockerfile.manylinux2_28_cpu @@ -1,4 +1,4 @@ -ARG BASEIMAGE=onnxruntimebuildcache.azurecr.io/internal/azureml/onnxruntime/build/cpu_x64_almalinux8_gcc14:20250724.1 +ARG BASEIMAGE=onnxruntimebuildcache.azurecr.io/internal/azureml/onnxruntime/build/cpu_x64_almalinux8_gcc14:20251008.2 FROM $BASEIMAGE ENV JAVA_HOME=/usr/lib/jvm/msopenjdk-17 diff --git a/tools/ci_build/github/linux/docker/Dockerfile.manylinux2_28_rocm b/tools/ci_build/github/linux/docker/Dockerfile.manylinux2_28_rocm index 3337af3be6074..5410bd64036ce 100644 --- a/tools/ci_build/github/linux/docker/Dockerfile.manylinux2_28_rocm +++ b/tools/ci_build/github/linux/docker/Dockerfile.manylinux2_28_rocm @@ -1,4 +1,4 @@ -ARG BASEIMAGE=onnxruntimebuildcache.azurecr.io/internal/azureml/onnxruntime/build/cpu_x64_almalinux8_gcc14:20250724.1 +ARG BASEIMAGE=onnxruntimebuildcache.azurecr.io/internal/azureml/onnxruntime/build/cpu_x64_almalinux8_gcc14:20251008.2 FROM $BASEIMAGE ARG ROCM_VERSION=6.2.3 diff --git a/tools/ci_build/github/linux/docker/Dockerfile.manylinux2_28_webgpu b/tools/ci_build/github/linux/docker/Dockerfile.manylinux2_28_webgpu index 0007a4e06f7c0..07ad8e933baf0 100644 --- a/tools/ci_build/github/linux/docker/Dockerfile.manylinux2_28_webgpu +++ b/tools/ci_build/github/linux/docker/Dockerfile.manylinux2_28_webgpu @@ -1,4 +1,4 @@ -ARG BASEIMAGE=onnxruntimebuildcache.azurecr.io/internal/azureml/onnxruntime/build/cpu_x64_almalinux8_gcc14:20250724.1 +ARG BASEIMAGE=onnxruntimebuildcache.azurecr.io/internal/azureml/onnxruntime/build/cpu_x64_almalinux8_gcc14:20251008.2 FROM $BASEIMAGE ENV JAVA_HOME=/usr/lib/jvm/msopenjdk-17 diff --git a/tools/ci_build/github/linux/docker/Dockerfile.package_ubuntu_2004_gpu b/tools/ci_build/github/linux/docker/Dockerfile.package_ubuntu_2004_gpu index 8a84b9b940306..5d98c25b535af 100644 --- a/tools/ci_build/github/linux/docker/Dockerfile.package_ubuntu_2004_gpu +++ b/tools/ci_build/github/linux/docker/Dockerfile.package_ubuntu_2004_gpu @@ -5,7 +5,7 @@ # Dockerfile to run ONNXRuntime with TensorRT integration # Build base image with required system packages -ARG BASEIMAGE=nvidia/cuda:12.2.2-cudnn8-devel-ubuntu20.04 +ARG BASEIMAGE=nvidia/cuda:12.8.1-cudnn-devel-ubuntu20.04 ARG TRT_VERSION=10.9.0.34-1+cuda12.8 ARG LD_LIBRARY_PATH_ARG=/usr/local/lib64:/usr/local/cuda/lib64 FROM $BASEIMAGE AS base diff --git a/tools/ci_build/github/linux/docker/inference/aarch64/default/cpu/Dockerfile b/tools/ci_build/github/linux/docker/inference/aarch64/default/cpu/Dockerfile index 8b2083c2ccfc1..cef2d11780969 100644 --- 
a/tools/ci_build/github/linux/docker/inference/aarch64/default/cpu/Dockerfile +++ b/tools/ci_build/github/linux/docker/inference/aarch64/default/cpu/Dockerfile @@ -2,7 +2,7 @@ # Licensed under the MIT License. # This file is used by Zip-Nuget Packaging NoContribOps Pipeline,Zip-Nuget-Java Packaging Pipeline -ARG BASEIMAGE=onnxruntimebuildcache.azurecr.io/internal/azureml/onnxruntime/build/cpu_aarch64_almalinux8_gcc14_dotnet:20250724.1 +ARG BASEIMAGE=onnxruntimebuildcache.azurecr.io/internal/azureml/onnxruntime/build/cpu_aarch64_almalinux8_gcc14_dotnet:20251008.2 FROM $BASEIMAGE ENV LANG=en_US.UTF-8 diff --git a/tools/ci_build/github/linux/docker/inference/aarch64/python/cpu/Dockerfile b/tools/ci_build/github/linux/docker/inference/aarch64/python/cpu/Dockerfile index f5143d5ac9ab9..79d99d08dcc4e 100644 --- a/tools/ci_build/github/linux/docker/inference/aarch64/python/cpu/Dockerfile +++ b/tools/ci_build/github/linux/docker/inference/aarch64/python/cpu/Dockerfile @@ -1,4 +1,4 @@ -ARG BASEIMAGE=onnxruntimebuildcache.azurecr.io/internal/azureml/onnxruntime/build/cpu_aarch64_almalinux8_gcc14:20250724.1 +ARG BASEIMAGE=onnxruntimebuildcache.azurecr.io/internal/azureml/onnxruntime/build/cpu_aarch64_almalinux8_gcc14:20251008.2 FROM $BASEIMAGE ADD scripts /tmp/scripts diff --git a/tools/ci_build/github/linux/docker/inference/x86_64/default/cpu/Dockerfile b/tools/ci_build/github/linux/docker/inference/x86_64/default/cpu/Dockerfile index cfc2ce7079148..72d98206f9205 100644 --- a/tools/ci_build/github/linux/docker/inference/x86_64/default/cpu/Dockerfile +++ b/tools/ci_build/github/linux/docker/inference/x86_64/default/cpu/Dockerfile @@ -2,7 +2,7 @@ # Licensed under the MIT License. # This file is used by Zip-Nuget Packaging NoContribOps Pipeline,Zip-Nuget-Java Packaging Pipeline -ARG BASEIMAGE=onnxruntimebuildcache.azurecr.io/internal/azureml/onnxruntime/build/cpu_x64_almalinux8_gcc14_dotnet:20250724.1 +ARG BASEIMAGE=onnxruntimebuildcache.azurecr.io/internal/azureml/onnxruntime/build/cpu_x64_almalinux8_gcc14_dotnet:20251008.2 FROM $BASEIMAGE ENV LANG=en_US.UTF-8 diff --git a/tools/ci_build/github/linux/docker/inference/x86_64/default/cuda12/Dockerfile b/tools/ci_build/github/linux/docker/inference/x86_64/default/cuda12/Dockerfile index 8401393a661b1..85f4a074e30bf 100644 --- a/tools/ci_build/github/linux/docker/inference/x86_64/default/cuda12/Dockerfile +++ b/tools/ci_build/github/linux/docker/inference/x86_64/default/cuda12/Dockerfile @@ -2,7 +2,7 @@ # Licensed under the MIT License. 
# This file is used by Zip-Nuget Packaging NoContribOps Pipeline,Zip-Nuget-Java Packaging Pipeline -ARG BASEIMAGE=onnxruntimebuildcache.azurecr.io/internal/azureml/onnxruntime/build/cuda12_x64_almalinux8_gcc12_dotnet:20250724.1 +ARG BASEIMAGE=onnxruntimebuildcache.azurecr.io/internal/azureml/onnxruntime/build/cuda12_x64_almalinux8_gcc12_dotnet:20251008.2 FROM $BASEIMAGE ARG TRT_VERSION diff --git a/tools/ci_build/github/linux/docker/inference/x86_64/python/cpu/Dockerfile b/tools/ci_build/github/linux/docker/inference/x86_64/python/cpu/Dockerfile index b923febc1227f..81ba47f397f91 100644 --- a/tools/ci_build/github/linux/docker/inference/x86_64/python/cpu/Dockerfile +++ b/tools/ci_build/github/linux/docker/inference/x86_64/python/cpu/Dockerfile @@ -1,4 +1,4 @@ -ARG BASEIMAGE=onnxruntimebuildcache.azurecr.io/internal/azureml/onnxruntime/build/cpu_x64_almalinux8_gcc14:20250724.1 +ARG BASEIMAGE=onnxruntimebuildcache.azurecr.io/internal/azureml/onnxruntime/build/cpu_x64_almalinux8_gcc14:20251008.2 FROM $BASEIMAGE ADD scripts /tmp/scripts diff --git a/tools/ci_build/github/linux/docker/inference/x86_64/python/openvino/Dockerfile b/tools/ci_build/github/linux/docker/inference/x86_64/python/openvino/Dockerfile index f3341f32a768d..5ad1023bfb5b2 100644 --- a/tools/ci_build/github/linux/docker/inference/x86_64/python/openvino/Dockerfile +++ b/tools/ci_build/github/linux/docker/inference/x86_64/python/openvino/Dockerfile @@ -1,5 +1,5 @@ # Use the specified UBI8 base image with GCC 14 -ARG BASEIMAGE="onnxruntimebuildcache.azurecr.io/internal/azureml/onnxruntime/build/cpu_x64_almalinux8_gcc14:20250724.1" +ARG BASEIMAGE="onnxruntimebuildcache.azurecr.io/internal/azureml/onnxruntime/build/cpu_x64_almalinux8_gcc14:20251008.2" FROM ${BASEIMAGE} ARG BUILD_UID=1000 diff --git a/tools/ci_build/github/windows/setup_env_cuda.bat b/tools/ci_build/github/windows/setup_env_cuda.bat index f93938e2a9009..f095f58f9920e 100644 --- a/tools/ci_build/github/windows/setup_env_cuda.bat +++ b/tools/ci_build/github/windows/setup_env_cuda.bat @@ -1,13 +1,13 @@ REM Copyright (c) Microsoft Corporation. All rights reserved. REM Licensed under the MIT License. -if exist PATH=%AGENT_TEMPDIRECTORY%\v12.2\ ( -set PATH=%AGENT_TEMPDIRECTORY%\v12.2\bin;%AGENT_TEMPDIRECTORY%\v12.2\extras\CUPTI\lib64;%PATH% +if exist PATH=%AGENT_TEMPDIRECTORY%\v12.8\ ( + set PATH=%AGENT_TEMPDIRECTORY%\v12.8\bin;%AGENT_TEMPDIRECTORY%\v12.8\extras\CUPTI\lib64;%PATH% ) else ( - set PATH=C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v12.2\bin;C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v12.2\extras\CUPTI\lib64;%PATH% + set PATH=C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v12.8\bin;C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v12.8\extras\CUPTI\lib64;%PATH% ) -@REM The default version is still cuda v12.2, because set cuda v11.8 after it +@REM The default version is still cuda v12.8, because set cuda v11.8 after it if exist PATH=%AGENT_TEMPDIRECTORY%\v11.8\ ( set PATH=%PATH%;%AGENT_TEMPDIRECTORY%\v11.8\bin;%AGENT_TEMPDIRECTORY%\v11.8\extras\CUPTI\lib64 ) else ( diff --git a/tools/ci_build/github/windows/setup_env_gpu.bat b/tools/ci_build/github/windows/setup_env_gpu.bat index ecadab5d3f8a3..115a19b6f3a01 100644 --- a/tools/ci_build/github/windows/setup_env_gpu.bat +++ b/tools/ci_build/github/windows/setup_env_gpu.bat @@ -1,14 +1,14 @@ REM Copyright (c) Microsoft Corporation. All rights reserved. REM Licensed under the MIT License. 
-if exist PATH=%AGENT_TEMPDIRECTORY%\v12.2\ (
-  set PATH=%AGENT_TEMPDIRECTORY%\v12.2\bin;%AGENT_TEMPDIRECTORY%\v12.2\extras\CUPTI\lib64;%PATH%
+if exist PATH=%AGENT_TEMPDIRECTORY%\v12.8\ (
+  set PATH=%AGENT_TEMPDIRECTORY%\v12.8\bin;%AGENT_TEMPDIRECTORY%\v12.8\extras\CUPTI\lib64;%PATH%
 ) else (
-  set PATH=C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v12.2\bin;C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v12.2\extras\CUPTI\lib64;%PATH%
+  set PATH=C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v12.8\bin;C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v12.8\extras\CUPTI\lib64;%PATH%
 )
 set PATH=%AGENT_TEMPDIRECTORY%\TensorRT-10.9.0.34.Windows10.x86_64.cuda-12.8\lib;%PATH%

-@REM The default version is still cuda v12.2, because set cuda v11.8 after it
+@REM The default version is still cuda v12.8, because set cuda v11.8 after it
 set PATH=%PATH%;%AGENT_TEMPDIRECTORY%\TensorRT-10.9.0.34.Windows10.x86_64.cuda-11.8\lib
 if exist PATH=%AGENT_TEMPDIRECTORY%\v11.8\ (
   set PATH=%PATH%;%AGENT_TEMPDIRECTORY%\v11.8\bin;%AGENT_TEMPDIRECTORY%\v11.8\extras\CUPTI\lib64
diff --git a/tools/ci_build/github/windows/setup_env_trt.bat b/tools/ci_build/github/windows/setup_env_trt.bat
index 45e0d970fb541..6110249a9cde6 100644
--- a/tools/ci_build/github/windows/setup_env_trt.bat
+++ b/tools/ci_build/github/windows/setup_env_trt.bat
@@ -1,10 +1,10 @@
 REM Copyright (c) Microsoft Corporation. All rights reserved.
 REM Licensed under the MIT License.

-if exist PATH=%AGENT_TEMPDIRECTORY%\v12.2\ (
-  set PATH=%PATH%;%AGENT_TEMPDIRECTORY%\v12.2\bin;%AGENT_TEMPDIRECTORY%\v12.2\extras\CUPTI\lib64
+if exist PATH=%AGENT_TEMPDIRECTORY%\v12.8\ (
+  set PATH=%PATH%;%AGENT_TEMPDIRECTORY%\v12.8\bin;%AGENT_TEMPDIRECTORY%\v12.8\extras\CUPTI\lib64
 ) else (
-  set PATH=%PATH%;C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v12.2\bin;C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v12.2\extras\CUPTI\lib64
+  set PATH=%PATH%;C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v12.8\bin;C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v12.8\extras\CUPTI\lib64
 )
 set PATH=%AGENT_TEMPDIRECTORY%\TensorRT-10.9.0.34.Windows10.x86_64.cuda-12.8\lib;%PATH%
 set GRADLE_OPTS=-Dorg.gradle.daemon=false

From 24d7aee69dda788f0e7d78aaee8b19e579a83d20 Mon Sep 17 00:00:00 2001
From: Ted Themistokleous <107195283+TedThemistokleous@users.noreply.github.com>
Date: Mon, 13 Oct 2025 19:50:22 -0400
Subject: [PATCH 10/19] [MIGraphX EP] Link FP4 types between OnnxRT and MIGraphX APIs (#26231)

Do this so that MIGraphX can take in fp4 types from input/output tensors
and then use that to perform an inference via the MIGraphX API.

### Description
Mirrored changes going into the ROCm 7.1 build. Cherry-picked mainline
OnnxRT changes to get fp4 tensor support before adding this on top.
Moving this to mainline OnnxRt to enable the MIGraphX EP to allow for fp4 input/output tensors https://github.com/ROCm/onnxruntime/pull/176 ### Motivation and Context Add fp4 support to MIGraphX EP --- .../core/providers/migraphx/migraphx_execution_provider.cc | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/onnxruntime/core/providers/migraphx/migraphx_execution_provider.cc b/onnxruntime/core/providers/migraphx/migraphx_execution_provider.cc index a59347841be95..239a5054801bc 100644 --- a/onnxruntime/core/providers/migraphx/migraphx_execution_provider.cc +++ b/onnxruntime/core/providers/migraphx/migraphx_execution_provider.cc @@ -268,6 +268,7 @@ static bool IsTypeSupported(const NodeArg* node_arg) { case ONNX_NAMESPACE::TensorProto_DataType::TensorProto_DataType_FLOAT16: case ONNX_NAMESPACE::TensorProto_DataType::TensorProto_DataType_BFLOAT16: case ONNX_NAMESPACE::TensorProto_DataType::TensorProto_DataType_FLOAT: + case ONNX_NAMESPACE::TensorProto_DataType::TensorProto_DataType_FLOAT4E2M1: case ONNX_NAMESPACE::TensorProto_DataType::TensorProto_DataType_FLOAT8E4M3FN: case ONNX_NAMESPACE::TensorProto_DataType::TensorProto_DataType_FLOAT8E4M3FNUZ: case ONNX_NAMESPACE::TensorProto_DataType::TensorProto_DataType_FLOAT8E5M2: @@ -318,6 +319,9 @@ static bool getMIGraphXType(ONNXTensorElementDataType type, case ONNX_TENSOR_ELEMENT_DATA_TYPE_FLOAT8E5M2FNUZ: mgx_type = migraphx_shape_fp8e5m2fnuz_type; break; + case ONNX_TENSOR_ELEMENT_DATA_TYPE_FLOAT4E2M1: + mgx_type = migraphx_shape_fp4x2_type; + break; case ONNX_TENSOR_ELEMENT_DATA_TYPE_INT4: mgx_type = migraphx_shape_int8_type; break; From 4c7fc43941ab181e4a06deb7a99d0bf483d66318 Mon Sep 17 00:00:00 2001 From: XXXXRT666 <157766680+XXXXRT666@users.noreply.github.com> Date: Tue, 14 Oct 2025 00:50:38 +0100 Subject: [PATCH 11/19] Fix build.sh --parallel 1 incorrectly triggering parallel build (#26264) ### Description This PR fixes an issue where running ```bash bash build.sh ...... --parallel 1 ...... ``` still triggers a parallel build. The previous logic only added -j when num_parallel_jobs != 1, which caused Ninja/Make/Xcode to use all CPU cores by default. ### Motivation and Context When building ONNX Runtime, using parallel 4 caused an out-of-memory (OOM) error in my computer. However, changing it to parallel 1 still triggered parallel compilation and caused OOM again. 
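The corrected check is easier to see in isolation. The snippet below is a minimal, hypothetical sketch of the intended flag handling (`job_args` is illustrative only; the real build.py forwards these arguments to the native build tool and has additional Windows/wasm handling): an explicit `--parallel` value, including 1, must always be forwarded, and only 0 ("auto") leaves the decision to the build tool.

```python
# Hypothetical helper, not the actual build.py code: maps --parallel to the
# arguments handed to the native build tool.
def job_args(num_parallel_jobs: int) -> list[str]:
    if num_parallel_jobs == 0:
        # 0 means "auto": let Ninja/Make/Xcode fall back to their own default,
        # which is typically all CPU cores.
        return []
    # Any explicit value, including 1, is forwarded so the tool does not
    # silently fan out across every core.
    return ["-j", str(num_parallel_jobs)]


assert job_args(1) == ["-j", "1"]  # truly serial build; avoids the OOM described above
assert job_args(0) == []           # tool default: full parallelism
assert job_args(4) == ["-j", "4"]
```

With the old `num_parallel_jobs != 1` condition, `--parallel 1` fell into the "no flag" branch, which is exactly the behavior this patch removes.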
--- tools/ci_build/build.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tools/ci_build/build.py b/tools/ci_build/build.py index 54dd23b07a363..8a72ab70cc67d 100644 --- a/tools/ci_build/build.py +++ b/tools/ci_build/build.py @@ -1333,7 +1333,7 @@ def build_targets(args, cmake_path, build_dir, configs, num_parallel_jobs, targe cmd_args.extend(["--target", *targets]) build_tool_args = [] - if num_parallel_jobs != 1: + if num_parallel_jobs != 0: if is_windows() and args.cmake_generator != "Ninja" and not args.build_wasm: # https://github.com/Microsoft/checkedc-clang/wiki/Parallel-builds-of-clang-on-Windows suggests # not maxing out CL_MPCount From 94de31fa124c29e79c9583e3273a1a60a7e8f7b0 Mon Sep 17 00:00:00 2001 From: Ti-Tai Wang Date: Mon, 13 Oct 2025 16:52:41 -0700 Subject: [PATCH 12/19] Bump onnx to 1.19.1 (#26202) ~~Test rel-1.19.1~~ Bump to ONNX==1.19.1 --- cmake/deps.txt | 2 +- .../external/onnxruntime_external_deps.cmake | 8 +--- .../onnx/avoid_regenerating_proto_files.patch | 46 ------------------- cmake/patches/onnx/onnx.patch | 12 ++--- .../onnx/avoid_regenerating_proto_files.patch | 46 ------------------- cmake/vcpkg-ports/onnx/binskim.patch | 12 ++--- cmake/vcpkg-ports/onnx/portfile.cmake | 5 +- cmake/vcpkg-ports/onnx/vcpkg.json | 2 +- docs/How_To_Update_ONNX_Dev_Notes.md | 2 +- onnxruntime/test/onnx/TestCase.cc | 17 ++++++- onnxruntime/test/onnx/main.cc | 18 -------- .../onnx_backend_test_series_filters.jsonc | 29 +++--------- .../python/cpu/scripts/requirements.txt | 2 +- .../docker/scripts/lort/requirements.txt | 2 +- .../docker/scripts/manylinux/requirements.txt | 2 +- .../linux/docker/scripts/requirements.txt | 2 +- .../github/linux/python/requirements.txt | 2 +- .../github/windows/python/requirements.txt | 2 +- 18 files changed, 45 insertions(+), 166 deletions(-) delete mode 100644 cmake/patches/onnx/avoid_regenerating_proto_files.patch delete mode 100644 cmake/vcpkg-ports/onnx/avoid_regenerating_proto_files.patch diff --git a/cmake/deps.txt b/cmake/deps.txt index 7b243ff15cd80..bf76753c1b3c0 100644 --- a/cmake/deps.txt +++ b/cmake/deps.txt @@ -34,7 +34,7 @@ microsoft_gsl;https://github.com/microsoft/GSL/archive/refs/tags/v4.0.0.zip;cf36 microsoft_wil;https://github.com/microsoft/wil/archive/refs/tags/v1.0.230629.1.zip;e4a542a323c070376f7c2d1973d0f7ddbc1d2fa5 mimalloc;https://github.com/microsoft/mimalloc/archive/refs/tags/v2.1.1.zip;d5ee7d34223d0567892db5179849939c8769dc41 mp11;https://github.com/boostorg/mp11/archive/refs/tags/boost-1.82.0.zip;9bc9e01dffb64d9e0773b2e44d2f22c51aace063 -onnx;https://github.com/onnx/onnx/archive/refs/tags/v1.19.0.zip;4c798b73e131438c196e6dcb9f3393968a8936f1 +onnx;https://github.com/onnx/onnx/archive/refs/tags/v1.19.1.zip;c5215b5697dcdfd71799f001b8c4054a6bba6b09 # Use the latest commit of 10.9-GA onnx_tensorrt;https://github.com/onnx/onnx-tensorrt/archive/d5dce67db7c2e64b07e055571f5ec06f7f254de2.zip;01114d3b67650857281fa50faa2e412130a63b69 protobuf;https://github.com/protocolbuffers/protobuf/archive/refs/tags/v21.12.zip;7cf2733949036c7d52fda017badcab093fe73bfa diff --git a/cmake/external/onnxruntime_external_deps.cmake b/cmake/external/onnxruntime_external_deps.cmake index f32350ca755ea..b6a741d8b0fe7 100644 --- a/cmake/external/onnxruntime_external_deps.cmake +++ b/cmake/external/onnxruntime_external_deps.cmake @@ -498,13 +498,7 @@ else() endif() if(Patch_FOUND) - set(ONNXRUNTIME_ONNX_PATCH_COMMAND - ${Patch_EXECUTABLE} --binary --ignore-whitespace -p1 < ${PROJECT_SOURCE_DIR}/patches/onnx/onnx.patch && - # Patch changes 
from https://github.com/onnx/onnx/pull/7253 to avoid unnecessary rebuilding. - # This change should be included in ONNX 1.19.1. - ${Patch_EXECUTABLE} --binary --ignore-whitespace -p1 < - ${PROJECT_SOURCE_DIR}/patches/onnx/avoid_regenerating_proto_files.patch - ) + set(ONNXRUNTIME_ONNX_PATCH_COMMAND ${Patch_EXECUTABLE} --binary --ignore-whitespace -p1 < ${PROJECT_SOURCE_DIR}/patches/onnx/onnx.patch) else() set(ONNXRUNTIME_ONNX_PATCH_COMMAND "") endif() diff --git a/cmake/patches/onnx/avoid_regenerating_proto_files.patch b/cmake/patches/onnx/avoid_regenerating_proto_files.patch deleted file mode 100644 index 804dfeb8f59c2..0000000000000 --- a/cmake/patches/onnx/avoid_regenerating_proto_files.patch +++ /dev/null @@ -1,46 +0,0 @@ -diff --git a/CMakeLists.txt b/CMakeLists.txt -index 479955793..cc3ef1400 100644 ---- a/CMakeLists.txt -+++ b/CMakeLists.txt -@@ -321,7 +321,7 @@ function(RELATIVE_PROTOBUF_GENERATE_CPP SRCS) - set(${SRCS}) - - set(GEN_PROTO_PY "${ONNX_ROOT}/onnx/gen_proto.py") -- set(GENERATED_FILE_TARGETS) -+ set(GENERATED_FILES) - foreach(INFILE ${ARGN}) - set(ABS_FILE "${ONNX_ROOT}/${INFILE}") - get_filename_component(FILE_DIR ${ABS_FILE} DIRECTORY) -@@ -371,12 +371,11 @@ function(RELATIVE_PROTOBUF_GENERATE_CPP SRCS) - list(APPEND GEN_PROTO_ARGS "${ONNX_PROTOC_EXECUTABLE}") - endif() - -- add_custom_target("${GENERATED_FILE_WE}_proto_file" -- COMMAND ${ONNX_PYTHON_INTERPRETER} "${GEN_PROTO_PY}" ${GEN_PROTO_ARGS} -- BYPRODUCTS "${GENERATED_PROTO}" -- DEPENDS ${INFILE} -- COMMENT "Running gen_proto.py on ${INFILE}" -- ) -+ # Use add_custom_command to avoid re-generate of PROTO files -+ add_custom_command(OUTPUT "${GENERATED_PROTO}" -+ COMMAND ${ONNX_PYTHON_INTERPRETER} "${GEN_PROTO_PY}" ${GEN_PROTO_ARGS} -+ DEPENDS ${INFILE} -+ COMMENT "Running gen_proto.py on ${INFILE}") - message("Generated: ${GENERATED_PROTO}") - - set(PROTOC_ARGS -@@ -393,11 +392,10 @@ function(RELATIVE_PROTOBUF_GENERATE_CPP SRCS) - list(APPEND PROTOC_ARGS ${CMAKE_CURRENT_BINARY_DIR}) - endif() - endif() -- list(APPEND GENERATED_FILE_TARGETS ${GENERATED_FILE_WE}_proto_file) -- add_custom_target(${GENERATED_FILE_WE}_src -+ list(APPEND GENERATED_FILES "${GENERATED_PROTO}") -+ add_custom_command(OUTPUT "${OUTPUT_PB_SRC}" - COMMAND "${ONNX_PROTOC_EXECUTABLE}" ${PROTOC_ARGS} -- BYPRODUCTS "${OUTPUT_PB_SRC}" -- DEPENDS ${GENERATED_FILE_TARGETS} -+ DEPENDS ${GENERATED_FILES} - COMMENT "Running C++ protocol buffer compiler on ${GENERATED_PROTO}") - endforeach() - diff --git a/cmake/patches/onnx/onnx.patch b/cmake/patches/onnx/onnx.patch index e8ae766062d08..047cb527bb4da 100644 --- a/cmake/patches/onnx/onnx.patch +++ b/cmake/patches/onnx/onnx.patch @@ -1,5 +1,5 @@ diff --git a/CMakeLists.txt b/CMakeLists.txt -index 47995579..6cc439f6 100644 +index cc3ef140..f70312ba 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -57,6 +57,7 @@ option(ONNX_USE_LITE_PROTO "Use lite protobuf instead of full." 
OFF) @@ -10,7 +10,7 @@ index 47995579..6cc439f6 100644 if(WIN32) option(ONNX_USE_MSVC_STATIC_RUNTIME "Build with MSVC static runtime" OFF) endif() -@@ -411,14 +412,28 @@ relative_protobuf_generate_cpp(ONNX_PROTO_SRCS +@@ -409,14 +410,28 @@ relative_protobuf_generate_cpp(ONNX_PROTO_SRCS add_library(onnx_proto ${ONNX_PROTO_SRCS}) @@ -47,7 +47,7 @@ index 47995579..6cc439f6 100644 # Hide all symbols we don't need set_target_properties(onnx_proto PROPERTIES CXX_VISIBILITY_PRESET hidden) -@@ -440,19 +455,6 @@ add_onnx_global_defines(onnx_proto) +@@ -438,19 +453,6 @@ add_onnx_global_defines(onnx_proto) target_include_directories(onnx_proto PUBLIC $ $) @@ -68,10 +68,10 @@ index 47995579..6cc439f6 100644 if(CMAKE_SYSTEM_NAME STREQUAL "AIX") # whole-archive linker option not available on AIX. diff --git a/onnx/defs/nn/old.cc b/onnx/defs/nn/old.cc -index 40635f97..44770774 100644 +index ad6dd0c1..50259f32 100644 --- a/onnx/defs/nn/old.cc +++ b/onnx/defs/nn/old.cc -@@ -4090,7 +4090,6 @@ ONNX_OPERATOR_SET_SCHEMA( +@@ -4091,7 +4091,6 @@ ONNX_OPERATOR_SET_SCHEMA( GroupNormalization, 18, OpSchema() @@ -80,7 +80,7 @@ index 40635f97..44770774 100644 .Attr("epsilon", "The epsilon value to use to avoid division by zero.", AttributeProto::FLOAT, 1e-5f) .Attr( diff --git a/onnx/defs/schema.h b/onnx/defs/schema.h -index ddd95454..34647987 100644 +index 7e9bc27f..4b87c5a5 100644 --- a/onnx/defs/schema.h +++ b/onnx/defs/schema.h @@ -999,7 +999,7 @@ class OpSchemaRegistry final : public ISchemaRegistry { diff --git a/cmake/vcpkg-ports/onnx/avoid_regenerating_proto_files.patch b/cmake/vcpkg-ports/onnx/avoid_regenerating_proto_files.patch deleted file mode 100644 index 804dfeb8f59c2..0000000000000 --- a/cmake/vcpkg-ports/onnx/avoid_regenerating_proto_files.patch +++ /dev/null @@ -1,46 +0,0 @@ -diff --git a/CMakeLists.txt b/CMakeLists.txt -index 479955793..cc3ef1400 100644 ---- a/CMakeLists.txt -+++ b/CMakeLists.txt -@@ -321,7 +321,7 @@ function(RELATIVE_PROTOBUF_GENERATE_CPP SRCS) - set(${SRCS}) - - set(GEN_PROTO_PY "${ONNX_ROOT}/onnx/gen_proto.py") -- set(GENERATED_FILE_TARGETS) -+ set(GENERATED_FILES) - foreach(INFILE ${ARGN}) - set(ABS_FILE "${ONNX_ROOT}/${INFILE}") - get_filename_component(FILE_DIR ${ABS_FILE} DIRECTORY) -@@ -371,12 +371,11 @@ function(RELATIVE_PROTOBUF_GENERATE_CPP SRCS) - list(APPEND GEN_PROTO_ARGS "${ONNX_PROTOC_EXECUTABLE}") - endif() - -- add_custom_target("${GENERATED_FILE_WE}_proto_file" -- COMMAND ${ONNX_PYTHON_INTERPRETER} "${GEN_PROTO_PY}" ${GEN_PROTO_ARGS} -- BYPRODUCTS "${GENERATED_PROTO}" -- DEPENDS ${INFILE} -- COMMENT "Running gen_proto.py on ${INFILE}" -- ) -+ # Use add_custom_command to avoid re-generate of PROTO files -+ add_custom_command(OUTPUT "${GENERATED_PROTO}" -+ COMMAND ${ONNX_PYTHON_INTERPRETER} "${GEN_PROTO_PY}" ${GEN_PROTO_ARGS} -+ DEPENDS ${INFILE} -+ COMMENT "Running gen_proto.py on ${INFILE}") - message("Generated: ${GENERATED_PROTO}") - - set(PROTOC_ARGS -@@ -393,11 +392,10 @@ function(RELATIVE_PROTOBUF_GENERATE_CPP SRCS) - list(APPEND PROTOC_ARGS ${CMAKE_CURRENT_BINARY_DIR}) - endif() - endif() -- list(APPEND GENERATED_FILE_TARGETS ${GENERATED_FILE_WE}_proto_file) -- add_custom_target(${GENERATED_FILE_WE}_src -+ list(APPEND GENERATED_FILES "${GENERATED_PROTO}") -+ add_custom_command(OUTPUT "${OUTPUT_PB_SRC}" - COMMAND "${ONNX_PROTOC_EXECUTABLE}" ${PROTOC_ARGS} -- BYPRODUCTS "${OUTPUT_PB_SRC}" -- DEPENDS ${GENERATED_FILE_TARGETS} -+ DEPENDS ${GENERATED_FILES} - COMMENT "Running C++ protocol buffer compiler on ${GENERATED_PROTO}") - endforeach() - diff --git 
a/cmake/vcpkg-ports/onnx/binskim.patch b/cmake/vcpkg-ports/onnx/binskim.patch index e8ae766062d08..047cb527bb4da 100644 --- a/cmake/vcpkg-ports/onnx/binskim.patch +++ b/cmake/vcpkg-ports/onnx/binskim.patch @@ -1,5 +1,5 @@ diff --git a/CMakeLists.txt b/CMakeLists.txt -index 47995579..6cc439f6 100644 +index cc3ef140..f70312ba 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -57,6 +57,7 @@ option(ONNX_USE_LITE_PROTO "Use lite protobuf instead of full." OFF) @@ -10,7 +10,7 @@ index 47995579..6cc439f6 100644 if(WIN32) option(ONNX_USE_MSVC_STATIC_RUNTIME "Build with MSVC static runtime" OFF) endif() -@@ -411,14 +412,28 @@ relative_protobuf_generate_cpp(ONNX_PROTO_SRCS +@@ -409,14 +410,28 @@ relative_protobuf_generate_cpp(ONNX_PROTO_SRCS add_library(onnx_proto ${ONNX_PROTO_SRCS}) @@ -47,7 +47,7 @@ index 47995579..6cc439f6 100644 # Hide all symbols we don't need set_target_properties(onnx_proto PROPERTIES CXX_VISIBILITY_PRESET hidden) -@@ -440,19 +455,6 @@ add_onnx_global_defines(onnx_proto) +@@ -438,19 +453,6 @@ add_onnx_global_defines(onnx_proto) target_include_directories(onnx_proto PUBLIC $ $) @@ -68,10 +68,10 @@ index 47995579..6cc439f6 100644 if(CMAKE_SYSTEM_NAME STREQUAL "AIX") # whole-archive linker option not available on AIX. diff --git a/onnx/defs/nn/old.cc b/onnx/defs/nn/old.cc -index 40635f97..44770774 100644 +index ad6dd0c1..50259f32 100644 --- a/onnx/defs/nn/old.cc +++ b/onnx/defs/nn/old.cc -@@ -4090,7 +4090,6 @@ ONNX_OPERATOR_SET_SCHEMA( +@@ -4091,7 +4091,6 @@ ONNX_OPERATOR_SET_SCHEMA( GroupNormalization, 18, OpSchema() @@ -80,7 +80,7 @@ index 40635f97..44770774 100644 .Attr("epsilon", "The epsilon value to use to avoid division by zero.", AttributeProto::FLOAT, 1e-5f) .Attr( diff --git a/onnx/defs/schema.h b/onnx/defs/schema.h -index ddd95454..34647987 100644 +index 7e9bc27f..4b87c5a5 100644 --- a/onnx/defs/schema.h +++ b/onnx/defs/schema.h @@ -999,7 +999,7 @@ class OpSchemaRegistry final : public ISchemaRegistry { diff --git a/cmake/vcpkg-ports/onnx/portfile.cmake b/cmake/vcpkg-ports/onnx/portfile.cmake index 27f5ea5fadd79..882850963a0c0 100644 --- a/cmake/vcpkg-ports/onnx/portfile.cmake +++ b/cmake/vcpkg-ports/onnx/portfile.cmake @@ -4,12 +4,9 @@ vcpkg_from_github( OUT_SOURCE_PATH SOURCE_PATH REPO onnx/onnx REF "v${VERSION}" - SHA512 e6f7b5782a43a91783607549e4d0f0a9cbd46dfb67a602f81aaffc7bcdd8f450fe9c225f0bc314704f2923e396f0df5b03ea91af4a7887203c0b8372bc2749d0 + SHA512 cf6ff4c0bb6cc16ce5f4d6267480d35f3c7a5fde94d10e1358928ff6e4ec6d756a7c5d34a500e60bbd8eb1912c8af21aa763719321b330f56a0eb6b9b810ef60 PATCHES fix-cmakelists.patch - # Patch changes from https://github.com/onnx/onnx/pull/7253 to avoid unnecessary rebuilding. - # This change should be included in ONNX 1.19.1. - avoid_regenerating_proto_files.patch fix-dependency-protobuf.patch binskim.patch ) diff --git a/cmake/vcpkg-ports/onnx/vcpkg.json b/cmake/vcpkg-ports/onnx/vcpkg.json index 350db2e35061a..ad0d1aaf15f51 100644 --- a/cmake/vcpkg-ports/onnx/vcpkg.json +++ b/cmake/vcpkg-ports/onnx/vcpkg.json @@ -1,6 +1,6 @@ { "name": "onnx", - "version-semver": "1.19.0", + "version-semver": "1.19.1", "port-version": 1, "description": "Open standard for machine learning interoperability", "homepage": "https://onnx.ai", diff --git a/docs/How_To_Update_ONNX_Dev_Notes.md b/docs/How_To_Update_ONNX_Dev_Notes.md index 8da19ddc51cb7..8c1280431c384 100644 --- a/docs/How_To_Update_ONNX_Dev_Notes.md +++ b/docs/How_To_Update_ONNX_Dev_Notes.md @@ -35,7 +35,7 @@ git add onnx 1. 
Modify [cmake/vcpkg-ports/onnx/binskim.patch](/cmake/vcpkg-ports/onnx/binskim.patch) to be the same as [cmake/patches/onnx/onnx.patch](/cmake/patches/onnx/onnx.patch). 2. The other patches are required/created by vcpkg repository to build ONNX. We just need to re-run diff to makes sure the patches can be applied in the updated ONNX version. 3. Update [cmake/vcpkg-ports/onnx/portfile.cmake](/cmake/vcpkg-ports/onnx/portfile.cmake) with the correct commit id and SHA512. (alternatively, build it with the wrong SHA and ORT should tell you the expected one.) -4. Upload your package: [Follow the instructions](https://microsoft.sharepoint.com/teams/ONNX2/_layouts/15/Doc.aspx?sourcedoc={170774be-e1c6-4f8b-a3ae-984f211fe410}&action=edit&wd=target%28Development.)one%7C63d3ab47-51d1-4a62-9965-66882234bd44%2FAdd%20or%20Update%20a%20C%2B%2B%20dependency%7Cb6ae6a97-94fc-4436-8fc6-08c21ae895da%2F%29&wdorigin=NavigationUrl +4. Upload your package: [Follow the instructions](https://microsoft.sharepoint.com/:o:/r/teams/ONNX2/_layouts/15/Doc.aspx?sourcedoc=%7B170774BE-E1C6-4F8B-A3AE-984F211FE410%7D&wd=target(Development.one%7C63D3AB47-51D1-4A62-9965-66882234BD44%2FUpdate%20a%20VCPKG%20package%7CB6AE6A97-94FC-4436-8FC6-08C21AE895DA%2F)&wdpartid=%7BB5CF19CC-40FE-0EC7-32B6-8119B427B32A%7D%7B1%7D&wdsectionfileid=%7B9DD25660-A195-48EA-B9E0-DF8B902AFDD7%7D&ovuser=72f988bf-86f1-41af-91ab-2d7cd011db47%2Ctitaiwang%40microsoft.com&clickparams=eyJBcHBOYW1lIjoiVGVhbXMtRGVza3RvcCIsIkFwcFZlcnNpb24iOiI0OS8yNTA5MTExNjAxNiIsIkhhc0ZlZGVyYXRlZFVzZXIiOmZhbHNlfQ%3D%3D&CID=fb9dcaa1-c0b5-1000-5597-c19e3adf468c&cidOR=SPO)one%7C63d3ab47-51d1-4a62-9965-66882234bd44%2FAdd%20or%20Update%20a%20C%2B%2B%20dependency%7Cb6ae6a97-94fc-4436-8fc6-08c21ae895da%2F%29&wdorigin=NavigationUrl Alternatively, directly run Terrapin to upload ONNX package (need SHA512): diff --git a/onnxruntime/test/onnx/TestCase.cc b/onnxruntime/test/onnx/TestCase.cc index 6df98ff505fa1..cbb25bb9b629e 100644 --- a/onnxruntime/test/onnx/TestCase.cc +++ b/onnxruntime/test/onnx/TestCase.cc @@ -1435,9 +1435,22 @@ std::unique_ptr> GetBrokenTests(const std::string& provider broken_tests->insert({"scatter_elements_with_negative_indices", "unknown version"}); // Fails since ONNX==1.19.0 broken_tests->insert({"l2normalization_axis_0", "unknown version"}); + broken_tests->insert({"attention_3d_gqa", "unknown version"}); + broken_tests->insert({"attention_3d_gqa_attn_mask", "unknown version"}); + broken_tests->insert({"attention_3d_gqa_causal", "unknown version"}); + broken_tests->insert({"attention_3d_gqa_scaled", "unknown version"}); + broken_tests->insert({"attention_3d_gqa_softcap", "unknown version"}); + broken_tests->insert({"attention_3d_gqa_with_past_and_present", "unknown version"}); + broken_tests->insert({"attention_4d_gqa", "unknown version"}); + broken_tests->insert({"attention_4d_gqa_attn_mask", "unknown version"}); + broken_tests->insert({"attention_4d_gqa_causal", "unknown version"}); + broken_tests->insert({"attention_4d_gqa_scaled", "unknown version"}); + broken_tests->insert({"attention_4d_gqa_softcap", "unknown version"}); + broken_tests->insert({"attention_4d_gqa_with_past_and_present", "unknown version"}); + broken_tests->insert({"attention_4d_gqa_with_past_and_present_fp16", "unknown version"}); + broken_tests->insert({"attention_4d_with_past_and_present_qk_matmul_bias_3d_mask_causal", "unknown version"}); + broken_tests->insert({"attention_4d_with_past_and_present_qk_matmul_bias_4d_mask_causal", "unknown version"}); 
broken_tests->insert({"attention_4d_diff_heads_mask4d_padded_kv", "need nonpad_kv_seqlen "}); - broken_tests->insert({"attention_4d_with_past_and_present_qk_matmul_bias_3d_mask_causal", "attention op implementation is wrong"}); - broken_tests->insert({"attention_4d_with_past_and_present_qk_matmul_bias_4d_mask_causal", "attention op implementation is wrong"}); } #ifdef DISABLE_CONTRIB_OPS diff --git a/onnxruntime/test/onnx/main.cc b/onnxruntime/test/onnx/main.cc index b6f2cb2683677..463634b370d4c 100644 --- a/onnxruntime/test/onnx/main.cc +++ b/onnxruntime/test/onnx/main.cc @@ -795,24 +795,6 @@ select from 'TF8', 'TF16', 'UINT8', 'FLOAT', 'ITENSOR'. \n)"); // Please make no more changes to the list static const ORTCHAR_T* immutable_broken_tests[] = { - // pending ONNX update - ORT_TSTR("attention_3d_gqa"), - ORT_TSTR("attention_3d_gqa_attn_mask"), - ORT_TSTR("attention_3d_gqa_causal"), - ORT_TSTR("attention_3d_gqa_scaled"), - ORT_TSTR("attention_3d_gqa_softcap"), - ORT_TSTR("attention_3d_gqa_with_past_and_present"), - ORT_TSTR("attention_4d_gqa"), - ORT_TSTR("attention_4d_gqa_attn_mask"), - ORT_TSTR("attention_4d_gqa_causal"), - ORT_TSTR("attention_4d_gqa_scaled"), - ORT_TSTR("attention_4d_gqa_softcap"), - ORT_TSTR("attention_4d_gqa_with_past_and_present"), - ORT_TSTR("attention_4d_diff_heads_mask4d_padded_kv"), - ORT_TSTR("attention_4d_gqa_with_past_and_present_fp16"), - ORT_TSTR("attention_4d_with_past_and_present_qk_matmul_bias_3d_mask_causal"), - ORT_TSTR("attention_4d_with_past_and_present_qk_matmul_bias_4d_mask_causal"), - // unsupported case ORT_TSTR("AvgPool1d"), ORT_TSTR("AvgPool1d_stride"), ORT_TSTR("AvgPool2d"), diff --git a/onnxruntime/test/testdata/onnx_backend_test_series_filters.jsonc b/onnxruntime/test/testdata/onnx_backend_test_series_filters.jsonc index f5f6a3ae3bc39..0558d008a2275 100644 --- a/onnxruntime/test/testdata/onnx_backend_test_series_filters.jsonc +++ b/onnxruntime/test/testdata/onnx_backend_test_series_filters.jsonc @@ -31,23 +31,12 @@ "current_failing_tests": [ "^test_adagrad", "^test_adagrad_multiple", - "^test_attention_4d_diff_heads_mask4d_padded_kv*", // pending onnx update - "^test_attention_3d_gqa*", // pending onnx update - "^test_attention_3d_gqa_causal", // pending onnx update - "^test_attention_3d_gqa_scaled", // pending onnx update - "^test_attention_3d_gqa_softcap", // pending onnx update - "^test_attention_3d_gqa_with_past_and_present", // pending onnx update - "^test_attention_4d_gqa*", // pending onnx update - "^test_attention_4d_gqa_causal", // pending onnx update - "^test_attention_4d_gqa_scaled", // pending onnx update - "^test_attention_4d_gqa_softcap", // pending onnx update - "^test_attention_4d_gqa_with_past_and_present", // pending onnx update - "^test_attention_*causal*", // pending onnx update - "^test_attention_4d_with_past_and_present_qk_matmul_bias_3d_mask_causal*", // pending onnx update - "^test_attention_4d_with_past_and_present_qk_matmul_bias_4d_mask_causal*", // pending onnx update - "^test_attention_4d_attn_mask_3d_causal_expanded*", // pending onnx update "^test_attention_4d_fp16*", // precision issue: 1 / 192 mismatched elements "^test_attention_4d_fp16_expanded*", // precision issue: 3 / 192 mismatched elements + "^test_attention_4d_gqa_with_past_and_present_fp16_expanded*", // webgpu mismatched elements 38 / 576 + "^test_attention_4d_with_past_and_present_qk_matmul_bias_3d_mask_causal_expanded*", // webgpu + "^test_attention_4d_attn_mask_3d_causal_expanded*", // webgpu + "^test_attention_4d_diff_heads_mask4d_padded_kv*", 
// Need nonpad_kv_seqlen "^test_l2normalization*", // LpNormalization(22) not implemented "^test_l1normalization*", // LpNormalization(22) not implemented "^test_lpnormalization*", // LpNormalization(22) not implemented @@ -123,13 +112,9 @@ "^test_if_opt", "^test_loop16_seq_none", "^test_identity_opt", - // rotary dim should be fixed in onnx==1.19.1 - "^test_rotary_embedding_no_position_ids_rotary_dim", - "^test_rotary_embedding_with_interleaved_rotary_dim", - "^test_rotary_embedding_with_rotary_dim", - "^test_rotary_embedding_3d_input_expanded", - "^test_rotary_embedding_interleaved_expanded", - "^test_rotary_embedding_no_position_ids_interleaved_expanded", + "^test_rotary_embedding_3d_input_expanded", // win cuda fail + "^test_rotary_embedding_interleaved_expanded", // win cuda fail + "^test_rotary_embedding_no_position_ids_interleaved_expanded", // win cuda fail "^test_rotary_embedding_expanded", //webgpu "^test_rotary_embedding_no_position_ids_expanded", //webgpu // Following tests are for opset 16 ops and are not yet implemented in ORT diff --git a/tools/ci_build/github/linux/docker/inference/aarch64/python/cpu/scripts/requirements.txt b/tools/ci_build/github/linux/docker/inference/aarch64/python/cpu/scripts/requirements.txt index bae6f4cb51816..1b1dadeaf8db2 100644 --- a/tools/ci_build/github/linux/docker/inference/aarch64/python/cpu/scripts/requirements.txt +++ b/tools/ci_build/github/linux/docker/inference/aarch64/python/cpu/scripts/requirements.txt @@ -3,7 +3,7 @@ mypy pytest setuptools>=68.2.2 wheel -onnx==1.19.0 +onnx==1.19.1 protobuf==4.25.8 sympy==1.14 flatbuffers diff --git a/tools/ci_build/github/linux/docker/scripts/lort/requirements.txt b/tools/ci_build/github/linux/docker/scripts/lort/requirements.txt index 2871f5cab2ea2..dc394ff50f4f9 100644 --- a/tools/ci_build/github/linux/docker/scripts/lort/requirements.txt +++ b/tools/ci_build/github/linux/docker/scripts/lort/requirements.txt @@ -3,7 +3,7 @@ beartype==0.15.0 flatbuffers cerberus h5py -onnx==1.19.0 +onnx==1.19.1 # Python dependencies required for pytorch development astunparse expecttest!=0.2.0 diff --git a/tools/ci_build/github/linux/docker/scripts/manylinux/requirements.txt b/tools/ci_build/github/linux/docker/scripts/manylinux/requirements.txt index 381d42831e715..2d89aece56340 100644 --- a/tools/ci_build/github/linux/docker/scripts/manylinux/requirements.txt +++ b/tools/ci_build/github/linux/docker/scripts/manylinux/requirements.txt @@ -3,7 +3,7 @@ mypy pytest setuptools>=68.2.2 wheel -onnx==1.19.0 +onnx==1.19.1 protobuf==4.25.1 sympy==1.14 flatbuffers diff --git a/tools/ci_build/github/linux/docker/scripts/requirements.txt b/tools/ci_build/github/linux/docker/scripts/requirements.txt index 4cc94f9148656..2fc034d9c5ca2 100644 --- a/tools/ci_build/github/linux/docker/scripts/requirements.txt +++ b/tools/ci_build/github/linux/docker/scripts/requirements.txt @@ -4,7 +4,7 @@ mypy pytest setuptools==78.1.1 wheel==0.45.1 -onnx==1.19.0 +onnx==1.19.1 argparse sympy==1.14 flatbuffers diff --git a/tools/ci_build/github/linux/python/requirements.txt b/tools/ci_build/github/linux/python/requirements.txt index d48fb66194f2a..293aa49823d48 100644 --- a/tools/ci_build/github/linux/python/requirements.txt +++ b/tools/ci_build/github/linux/python/requirements.txt @@ -3,7 +3,7 @@ mypy pytest setuptools>=68.2.2 wheel -onnx==1.19.0 +onnx==1.19.1 protobuf==4.25.1 sympy==1.14 flatbuffers diff --git a/tools/ci_build/github/windows/python/requirements.txt b/tools/ci_build/github/windows/python/requirements.txt index 
6ab2ab2b7b61f..b48f6c3c2784d 100644 --- a/tools/ci_build/github/windows/python/requirements.txt +++ b/tools/ci_build/github/windows/python/requirements.txt @@ -3,7 +3,7 @@ mypy pytest setuptools>=68.2.2 wheel -onnx==1.19.0 +onnx==1.19.1 protobuf==4.25.1 sympy==1.14 flatbuffers From cd4ac4943465fbbadbbc622dcfd11f453a27a2c1 Mon Sep 17 00:00:00 2001 From: Jiajia Qin Date: Tue, 14 Oct 2025 15:02:34 +0800 Subject: [PATCH 13/19] [webgpu] Enable indirect dispatch for flash attention (#26207) This pull request introduces support for indirect dispatch in the WebGPU FlashAttention implementation, enabling more dynamic and efficient kernel launches based on runtime sequence lengths. The changes add new logic and parameters to propagate sequence length information and indirect dispatch buffers through the attention pipeline, with conditional code paths to maintain compatibility with the existing direct dispatch approach. It's part of the work to enable graph capture in phi4 https://github.com/microsoft/onnxruntime/pull/25868 --- .../webgpu/bert/flash_attention.cc | 175 ++++++++++++++---- .../contrib_ops/webgpu/bert/flash_attention.h | 31 ++-- .../flash_attention_decode_qkt.wgsl.template | 14 +- ...sh_attention_decode_split_vx.wgsl.template | 18 +- ...h_attention_decode_vx_reduce.wgsl.template | 12 +- .../webgpu/bert/group_query_attention.cc | 2 +- .../core/providers/webgpu/compute_context.h | 5 +- 7 files changed, 193 insertions(+), 64 deletions(-) diff --git a/onnxruntime/contrib_ops/webgpu/bert/flash_attention.cc b/onnxruntime/contrib_ops/webgpu/bert/flash_attention.cc index b5c1f73d1678d..a9bd4afc5cd09 100644 --- a/onnxruntime/contrib_ops/webgpu/bert/flash_attention.cc +++ b/onnxruntime/contrib_ops/webgpu/bert/flash_attention.cc @@ -31,6 +31,11 @@ Status CopyKVCacheProgram::GenerateShaderCode(ShaderHelper& shader) const { const auto& present_key = shader.AddOutput("present_key", ShaderUsage::UseUniform | ShaderUsage::UseIndicesTypeAlias); const auto& present_value = shader.AddOutput("present_value", ShaderUsage::UseUniform); const auto& copy_kv_shape = shader.AddIndices("copy_kv_shape"); + // If prepare_indirect_dispatch is enabled, add seqlen_k input and indirect_buffer output + if (prepare_indirect_dispatch_) { + shader.AddInput("seqlen_k", ShaderUsage::None); + shader.AddOutput("indirect_buffer", ShaderUsage::None); + } shader.MainFunctionBody() << shader.GuardAgainstOutOfBoundsWorkgroupSizes("uniforms.copy_size") << " let output_indices = " << copy_kv_shape.OffsetToIndices("global_idx") << ";\n" @@ -38,8 +43,26 @@ Status CopyKVCacheProgram::GenerateShaderCode(ShaderHelper& shader) const { " let sequence_id = output_indices[2];\n" " let num_head_id = output_indices[1];\n" " let batch = output_indices[0];\n"; + if (prepare_indirect_dispatch_) { + shader.MainFunctionBody() << " let total_seq_length = u32(seqlen_k[0u]) + 1u;\n"; + } else { + shader.MainFunctionBody() << " let total_seq_length = uniforms.total_sequence_length;\n"; + } + + // Add indirect dispatch logic for thread 0 + if (prepare_indirect_dispatch_) { + // TODO: Add NormalizeDispatchGroupSize logic here to avoid exceeding max dispatch size. 
+ shader.MainFunctionBody() << " // Prepare indirect dispatch buffer for thread 0\n" + << " if (global_idx == 0u) {\n" + << " let num_total_seq_length_tile = (total_seq_length + uniforms.tile_size - 1u) / uniforms.tile_size;\n" + << " indirect_buffer[0] = num_total_seq_length_tile;\n" + << " indirect_buffer[1] = uniforms.num_heads;\n" + << " indirect_buffer[2] = 1u;\n" + << " }\n\n"; + } + if (has_past_) { - shader.MainFunctionBody() << "let past_sequence_length = uniforms.past_sequence_length;\n"; + shader.MainFunctionBody() << "let past_sequence_length = total_seq_length - uniforms.kv_sequence_length;\n"; if (past_present_share_buffer_) { shader.MainFunctionBody() << " let present_offset = " << present_key.IndicesToOffset("present_key_indices_t(batch, num_head_id, past_sequence_length + sequence_id, head_size_id)") << ";\n" << " let offset = " << key.IndicesToOffset(kv_BNSH_ ? "key_indices_t(batch, num_head_id, sequence_id, head_size_id)" : "key_indices_t(batch, sequence_id, num_head_id, head_size_id)") << ";\n" @@ -70,10 +93,12 @@ Status CopyKVCacheProgram::GenerateShaderCode(ShaderHelper& shader) const { Status CopyKVCache(onnxruntime::webgpu::ComputeContext& context, const WebgpuAttentionParameters& parameters, const Tensor* K, const Tensor* past_key, Tensor* present_key, - const Tensor* V, const Tensor* past_value, Tensor* present_value) { + const Tensor* V, const Tensor* past_value, Tensor* present_value, + uint32_t tile_size, const Tensor* seqlen_k, Tensor* indirect_buffer) { // CopyKVCache takes past key/value and current key/value and copies them to present key and value. // This makes it so that FlashAttention only needs to look at present key and value, and saves // number of input buffers in the shader, which we run out of (<=8) without this optimization. + // If indirect_buffer is provided, also prepare indirect dispatch buffer for flash attention. const int components = parameters.head_size_ % 4 == 0 ? 4 : (parameters.head_size_ % 2 == 0 ? 2 : 1); bool has_past = (parameters.total_sequence_length_ - parameters.kv_sequence_length_) > 0; // parameters.total_sequence_length_ is past_sequence_length + kv_sequence_length. @@ -83,7 +108,12 @@ Status CopyKVCache(onnxruntime::webgpu::ComputeContext& context, const WebgpuAtt int copy_sequence_length = has_past && parameters.past_present_share_buffer_ ? 
parameters.kv_sequence_length_ : parameters.total_sequence_length_; TensorShape copy_kv_shape{parameters.batch_size_, num_heads, copy_sequence_length, parameters.head_size_ / components}; int64_t copy_size = copy_kv_shape.Size(); - CopyKVCacheProgram program{"CopyKVCache", has_past, parameters.qkv_format_ == Q_K_V_BSNH_BNSH_BNSH, parameters.past_present_share_buffer_}; + + // Determine if we need to prepare indirect dispatch + bool prepare_indirect_dispatch = (indirect_buffer != nullptr); + + CopyKVCacheProgram program{"CopyKVCache", has_past, parameters.qkv_format_ == Q_K_V_BSNH_BNSH_BNSH, parameters.past_present_share_buffer_, + prepare_indirect_dispatch}; if (parameters.qkv_format_ == Q_K_V_BSNH_BNSH_BNSH) { program.AddInputs({{K, ProgramTensorMetadataDependency::TypeAndRank, components}, {V, ProgramTensorMetadataDependency::TypeAndRank, components}}); @@ -94,20 +124,31 @@ Status CopyKVCache(onnxruntime::webgpu::ComputeContext& context, const WebgpuAtt program.AddInputs({{K, ProgramTensorMetadataDependency::TypeAndRank, reshaped_KV_shape, components}, {V, ProgramTensorMetadataDependency::TypeAndRank, reshaped_KV_shape, components}}); } + + if (prepare_indirect_dispatch) { + program.AddInput({seqlen_k, ProgramTensorMetadataDependency::None}); + } + if (has_past && !parameters.past_present_share_buffer_) { program.AddInputs({{past_key, ProgramTensorMetadataDependency::TypeAndRank, components}, {past_value, ProgramTensorMetadataDependency::TypeAndRank, components}}); } program.AddOutputs({{present_key, ProgramTensorMetadataDependency::Rank, components}, - {present_value, ProgramTensorMetadataDependency::Rank, components}}) - .AddIndices(std::move(copy_kv_shape)); + {present_value, ProgramTensorMetadataDependency::Rank, components}}); + + if (prepare_indirect_dispatch) { + program.AddOutput({indirect_buffer, ProgramTensorMetadataDependency::None}); + } + + program.AddIndices(std::move(copy_kv_shape)); program.SetDispatchGroupSize(static_cast((copy_size + 63) / 64)) .SetWorkgroupSize(64) - .CacheHint(has_past, parameters.qkv_format_, parameters.past_present_share_buffer_) + .CacheHint(has_past, parameters.qkv_format_, parameters.past_present_share_buffer_, prepare_indirect_dispatch) .AddUniformVariables({{static_cast(copy_size)}, - // Note that when parameters.past_present_share_buffer_ is true, parameters.past_sequence_length_ will become to - // max_sequence_length. To get a valid past_sequence_length, we use total_sequence_length - kv_sequence_length. 
- {static_cast(parameters.total_sequence_length_ - parameters.kv_sequence_length_)}}); + {static_cast(parameters.total_sequence_length_)}, + {static_cast(parameters.kv_sequence_length_)}, + {tile_size}, + {static_cast(parameters.num_heads_)}}); return context.RunProgram(program); } @@ -147,6 +188,9 @@ Status FlashAttentionProgram::GenerateShaderCode(ShaderHelper& shader) const { Status FlashAttentionDecodeQKTProgram::GenerateShaderCode(ShaderHelper& shader) const { shader.AddInput("q", ShaderUsage::UseUniform | ShaderUsage::UseValueTypeAlias | ShaderUsage::UseElementTypeAlias); shader.AddInput("present_key", ShaderUsage::UseUniform | ShaderUsage::UseValueTypeAlias); + if (use_indirect_dispatch_) { + shader.AddInput("seqlens_k", ShaderUsage::None); + } if (has_attention_bias_) { shader.AddInput("attention_bias", ShaderUsage::UseUniform); } @@ -159,23 +203,25 @@ Status FlashAttentionDecodeQKTProgram::GenerateShaderCode(ShaderHelper& shader) WGSL_TEMPLATE_PARAMETER(has_attention_bias, has_attention_bias_), WGSL_TEMPLATE_PARAMETER(sub_tile_count, sub_tile_count), WGSL_TEMPLATE_PARAMETER(tile_size, tile_size_), - WGSL_TEMPLATE_PARAMETER(tile_size_k_vec, tile_size_k_vec)); + WGSL_TEMPLATE_PARAMETER(tile_size_k_vec, tile_size_k_vec), + WGSL_TEMPLATE_PARAMETER(use_indirect_dispatch, use_indirect_dispatch_)); } Status ComputeFlashAttentionDecodeQKT(onnxruntime::webgpu::ComputeContext& context, const Tensor* Q, - const Tensor* attention_bias, Tensor* output, Tensor* present_key, Tensor* metadata, - const WebgpuAttentionParameters& parameters, uint32_t num_total_seq_length_tile, - uint32_t num_present_sequence_length_tile, uint32_t tile_size, - uint32_t present_sequence_length) { + const Tensor* attention_bias, Tensor* output, Tensor* present_key, Tensor* metadata, const Tensor* seqlen_k, + const WebgpuAttentionParameters& parameters, const Tensor* indirect_buffer, uint32_t num_total_seq_length_tile, uint32_t num_present_sequence_length_tile, uint32_t tile_size, bool use_indirect_dispatch, uint32_t present_sequence_length) { const float alpha = parameters.scale_ == 0.0f ? 
1.f / sqrt(static_cast(parameters.head_size_)) : parameters.scale_; const bool has_attention_bias = attention_bias != nullptr; const int components = 4; - FlashAttentionDecodeQKTProgram program{"FlashAttentionDecodeQKT", has_attention_bias, tile_size}; + FlashAttentionDecodeQKTProgram program{"FlashAttentionDecodeQKT", has_attention_bias, tile_size, use_indirect_dispatch}; program.AddInputs({{Q, ProgramTensorMetadataDependency::TypeAndRank, components}, {present_key, ProgramTensorMetadataDependency::TypeAndRank, components}}); + if (use_indirect_dispatch) { + program.AddInput({seqlen_k, ProgramTensorMetadataDependency::None}); + } if (has_attention_bias) { program.AddInput({attention_bias, ProgramTensorMetadataDependency::TypeAndRank}); } @@ -183,15 +229,18 @@ Status ComputeFlashAttentionDecodeQKT(onnxruntime::webgpu::ComputeContext& conte {metadata, ProgramTensorMetadataDependency::Rank, 2}}); const uint32_t vectorized_head_size = parameters.head_size_ / components; - program.SetDispatchGroupSize(parameters.num_heads_ * num_total_seq_length_tile) - .SetWorkgroupSize(64) - .CacheHint(tile_size, has_attention_bias) + if (use_indirect_dispatch) { + program.SetIndirectDispatchTensor(indirect_buffer); + } else { + program.SetDispatchGroupSize(parameters.num_heads_ * num_total_seq_length_tile); + } + program.SetWorkgroupSize(64) + .CacheHint(tile_size, has_attention_bias, use_indirect_dispatch) .AddUniformVariables({{static_cast(vectorized_head_size)}, {static_cast(parameters.total_sequence_length_)}, {static_cast(alpha)}, present_sequence_length, {static_cast(parameters.n_reps)}, - {num_total_seq_length_tile}, {num_present_sequence_length_tile}, {static_cast(parameters.num_heads_)}}); @@ -202,6 +251,9 @@ Status FlashAttentionDecodeSplitVxProgram::GenerateShaderCode(ShaderHelper& shad shader.AddInput("metadata", ShaderUsage::UseUniform); shader.AddInput("qk", ShaderUsage::UseUniform | ShaderUsage::UseValueTypeAlias); shader.AddInput("present_value", ShaderUsage::UseUniform | ShaderUsage::UseValueTypeAlias | ShaderUsage::UseElementTypeAlias); + if (use_indirect_dispatch_) { + shader.AddInput("seqlens_k", ShaderUsage::None); + } shader.AddOutput("out_split_vx", ShaderUsage::UseUniform); const uint32_t tile_size_k_vec = 8u; @@ -210,7 +262,8 @@ Status FlashAttentionDecodeSplitVxProgram::GenerateShaderCode(ShaderHelper& shad WGSL_TEMPLATE_PARAMETER(head_size_vec, head_size_vec_), WGSL_TEMPLATE_PARAMETER(sub_tile_count, WorkgroupSizeX() / tile_size_k_vec), WGSL_TEMPLATE_PARAMETER(tile_size, tile_size_), - WGSL_TEMPLATE_PARAMETER(tile_size_k_vec, tile_size_k_vec)); + WGSL_TEMPLATE_PARAMETER(tile_size_k_vec, tile_size_k_vec), + WGSL_TEMPLATE_PARAMETER(use_indirect_dispatch, use_indirect_dispatch_)); } Status ComputeFlashAttentionDecodeSplitVxScore(onnxruntime::webgpu::ComputeContext& context, @@ -218,26 +271,33 @@ Status ComputeFlashAttentionDecodeSplitVxScore(onnxruntime::webgpu::ComputeConte const Tensor* qk, Tensor* out_split_vx, Tensor* present_value, + const Tensor* seqlen_k, const WebgpuAttentionParameters& parameters, + const Tensor* indirect_buffer, uint32_t num_total_seq_length_tile, uint32_t num_present_sequence_length_tile, uint32_t tile_size, + bool use_indirect_dispatch, uint32_t present_sequence_length) { const int components = 4; int head_size_vec = parameters.v_head_size_ / components; - FlashAttentionDecodeSplitVxProgram program{"FlashAttentionDecodeSplitVx", tile_size, head_size_vec}; + FlashAttentionDecodeSplitVxProgram program{"FlashAttentionDecodeSplitVx", tile_size, head_size_vec, 
use_indirect_dispatch}; program.AddInputs({{metadata, ProgramTensorMetadataDependency::TypeAndRank, 2}, {qk, ProgramTensorMetadataDependency::TypeAndRank}, {present_value, ProgramTensorMetadataDependency::TypeAndRank, components}}); program.AddOutputs({{out_split_vx, ProgramTensorMetadataDependency::TypeAndRank, components}}); // [B, N, split_k, head_size] - program.SetDispatchGroupSize(parameters.num_heads_ * num_total_seq_length_tile) - .CacheHint(tile_size, head_size_vec) + if (use_indirect_dispatch) { + program.AddInput({seqlen_k, ProgramTensorMetadataDependency::None}) + .SetIndirectDispatchTensor(indirect_buffer); + } else { + program.SetDispatchGroupSize(parameters.num_heads_ * num_total_seq_length_tile); + } + program.CacheHint(tile_size, head_size_vec, use_indirect_dispatch) .SetWorkgroupSize(64) .AddUniformVariables({{static_cast(parameters.total_sequence_length_)}, {static_cast(head_size_vec)}, present_sequence_length, {static_cast(parameters.n_reps)}, - num_total_seq_length_tile, num_present_sequence_length_tile, {static_cast(parameters.num_heads_)}}); @@ -246,27 +306,38 @@ Status ComputeFlashAttentionDecodeSplitVxScore(onnxruntime::webgpu::ComputeConte Status FlashAttentionDecodeVxReduceProgram::GenerateShaderCode(ShaderHelper& shader) const { shader.AddInput("input", ShaderUsage::UseUniform); + if (use_indirect_dispatch_) { + shader.AddInput("seqlens_k", ShaderUsage::None); + } shader.AddOutput("output", ShaderUsage::UseUniform | ShaderUsage::UseValueTypeAlias); return WGSL_TEMPLATE_APPLY(shader, "bert/flash_attention_decode_vx_reduce.wgsl.template", - WGSL_TEMPLATE_PARAMETER(tile_size, tile_size_)); + WGSL_TEMPLATE_PARAMETER(seq_tile_size, seq_tile_size_), + WGSL_TEMPLATE_PARAMETER(tile_size, tile_size_), + WGSL_TEMPLATE_PARAMETER(use_indirect_dispatch, use_indirect_dispatch_)); } Status ComputeFlashAttentionDecodeVxReduce(onnxruntime::webgpu::ComputeContext& context, const Tensor* out_split_vx, Tensor* output, + const Tensor* seqlen_k, const WebgpuAttentionParameters& parameters, uint32_t num_total_seq_length_tile, - uint32_t num_present_sequence_length_tile) { + uint32_t num_present_sequence_length_tile, + uint32_t seq_tile_size, + bool use_indirect_dispatch) { const int components = 4; constexpr int tile_size = 8; int tile_head_size = tile_size * components; - FlashAttentionDecodeVxReduceProgram program{"FlashAttentionDecodeVxReduce", tile_size}; + FlashAttentionDecodeVxReduceProgram program{"FlashAttentionDecodeVxReduce", tile_size, seq_tile_size, use_indirect_dispatch}; program.AddInputs({{out_split_vx, ProgramTensorMetadataDependency::TypeAndRank, components}}); + if (use_indirect_dispatch) { + program.AddInput({seqlen_k, ProgramTensorMetadataDependency::None}); + } program.AddOutputs({{output, ProgramTensorMetadataDependency::TypeAndRank, components}}); const uint32_t num_head_size_tile = static_cast((parameters.v_head_size_ + tile_head_size - 1) / tile_head_size); program.SetDispatchGroupSize(parameters.num_heads_ * num_head_size_tile) - .CacheHint(tile_size) + .CacheHint(tile_size, seq_tile_size, use_indirect_dispatch) .SetWorkgroupSize(tile_size * tile_size) .AddUniformVariables({{static_cast(parameters.v_head_size_ / components)}, num_total_seq_length_tile, @@ -279,14 +350,15 @@ Status ComputeFlashAttentionDecodeVxReduce(onnxruntime::webgpu::ComputeContext& Status ApplyFlashAttention(const Tensor* Q, const Tensor* K, const Tensor* V, const Tensor* attention_bias, Tensor* output, const Tensor* past_key, Tensor* present_key, const Tensor* past_value, Tensor* 
present_value, - const WebgpuAttentionParameters& parameters, onnxruntime::webgpu::ComputeContext& context) { - ORT_RETURN_IF_ERROR(CopyKVCache(context, parameters, K, past_key, present_key, V, past_value, present_value)); - + const WebgpuAttentionParameters& parameters, onnxruntime::webgpu::ComputeContext& context, const Tensor* seqlen_k) { // Extract present_sequence_length directly from present_key tensor shape: // (batch_size, num_heads, total_sequence_length/max_sequence_length, head_size) const uint32_t present_sequence_length = static_cast(present_key->Shape()[2]); + if (parameters.sequence_length_ > 1) { const uint32_t tile_size = 64; + // For encode path, use the original CopyKVCache without indirect dispatch preparation + ORT_RETURN_IF_ERROR(CopyKVCache(context, parameters, K, past_key, present_key, V, past_value, present_value, tile_size, seqlen_k, nullptr)); bool has_attention_bias = attention_bias != nullptr; bool is_qualcomm = context.AdapterInfo().vendor == std::string_view{"qualcomm"}; bool is_nvidia = context.AdapterInfo().vendor == std::string_view{"nvidia"}; @@ -323,7 +395,7 @@ Status ApplyFlashAttention(const Tensor* Q, const Tensor* K, const Tensor* V, co return context.RunProgram(program); } - // Use present_sequence_length instead of total_sequence_length to make sure the |qk| buffer is static when static qv cache is enabled. + // For decode path (sequence_length == 1) const TensorShapeVector qk_dims({parameters.batch_size_, parameters.num_heads_, parameters.sequence_length_, present_sequence_length}); const TensorShape qk_shape(qk_dims); @@ -331,21 +403,48 @@ Status ApplyFlashAttention(const Tensor* Q, const Tensor* K, const Tensor* V, co constexpr uint32_t tile_size = 64; const uint32_t num_total_seq_length_tile = (parameters.total_sequence_length_ + tile_size - 1) / tile_size; const uint32_t num_present_sequence_length_tile = (present_sequence_length + tile_size - 1) / tile_size; + + // Determine if we should use indirect dispatch + const bool use_indirect_dispatch = parameters.past_present_share_buffer_ && + seqlen_k != nullptr && + context.IsGraphCaptureEnabled(); + + // Create indirect dispatch buffer if using indirect dispatch + Tensor* indirect_buffer_ptr = nullptr; + Tensor indirect_buffer; + if (use_indirect_dispatch) { + const TensorShape indirect_buffer_shape{3}; // 3 uint32 values for dispatch dimensions + indirect_buffer = context.CreateGPUTensor(DataTypeImpl::GetType(), indirect_buffer_shape); + indirect_buffer_ptr = &indirect_buffer; + // Use the fused CopyKVCache that also prepares the indirect dispatch buffer + ORT_RETURN_IF_ERROR(CopyKVCache(context, parameters, K, past_key, present_key, V, past_value, present_value, tile_size, seqlen_k, indirect_buffer_ptr)); + } else { + // Use the original CopyKVCache without indirect dispatch preparation + ORT_RETURN_IF_ERROR(CopyKVCache(context, parameters, K, past_key, present_key, V, past_value, present_value, tile_size, seqlen_k, nullptr)); + } + // The metadata is used to store the max and sum of each tile. 
const TensorShapeVector metadata_dims({parameters.batch_size_, parameters.num_heads_, num_present_sequence_length_tile, 2}); const TensorShape metadata_shape(metadata_dims); Tensor metadata = context.CreateGPUTensor(DataTypeImpl::GetType(), metadata_shape); - ORT_RETURN_IF_ERROR(ComputeFlashAttentionDecodeQKT(context, Q, attention_bias, &qk, present_key, &metadata, - parameters, num_total_seq_length_tile, num_present_sequence_length_tile, tile_size, + ORT_RETURN_IF_ERROR(ComputeFlashAttentionDecodeQKT(context, Q, attention_bias, &qk, present_key, &metadata, seqlen_k, + parameters, indirect_buffer_ptr, num_total_seq_length_tile, + num_present_sequence_length_tile, tile_size, use_indirect_dispatch, present_sequence_length)); - const TensorShapeVector out_split_vx_dims({parameters.batch_size_, parameters.num_heads_, num_present_sequence_length_tile, parameters.head_size_}); + const TensorShapeVector out_split_vx_dims({parameters.batch_size_, parameters.num_heads_, + num_present_sequence_length_tile, parameters.head_size_}); const TensorShape out_split_vx_shape(out_split_vx_dims); Tensor out_split_vx = context.CreateGPUTensor(Q->DataType(), out_split_vx_shape); - ORT_RETURN_IF_ERROR(ComputeFlashAttentionDecodeSplitVxScore(context, &metadata, &qk, &out_split_vx, present_value, parameters, - num_total_seq_length_tile, num_present_sequence_length_tile, tile_size, present_sequence_length)); - ORT_RETURN_IF_ERROR(ComputeFlashAttentionDecodeVxReduce(context, &out_split_vx, output, parameters, num_total_seq_length_tile, num_present_sequence_length_tile)); + ORT_RETURN_IF_ERROR(ComputeFlashAttentionDecodeSplitVxScore(context, &metadata, &qk, &out_split_vx, present_value, + seqlen_k, parameters, indirect_buffer_ptr, + num_total_seq_length_tile, + num_present_sequence_length_tile, tile_size, + use_indirect_dispatch, present_sequence_length)); + ORT_RETURN_IF_ERROR(ComputeFlashAttentionDecodeVxReduce(context, &out_split_vx, output, seqlen_k, parameters, + num_total_seq_length_tile, + num_present_sequence_length_tile, tile_size, use_indirect_dispatch)); return Status::OK(); } diff --git a/onnxruntime/contrib_ops/webgpu/bert/flash_attention.h b/onnxruntime/contrib_ops/webgpu/bert/flash_attention.h index c75494df253c1..7d71dc0f4d42d 100644 --- a/onnxruntime/contrib_ops/webgpu/bert/flash_attention.h +++ b/onnxruntime/contrib_ops/webgpu/bert/flash_attention.h @@ -17,19 +17,24 @@ using namespace onnxruntime::webgpu; class CopyKVCacheProgram final : public Program { public: - CopyKVCacheProgram(const std::string& kernel_name, bool has_past, bool kv_BNSH, bool past_present_share_buffer) - : Program{kernel_name}, has_past_(has_past), kv_BNSH_(kv_BNSH), past_present_share_buffer_(past_present_share_buffer) { + CopyKVCacheProgram(const std::string& kernel_name, bool has_past, bool kv_BNSH, bool past_present_share_buffer, + bool prepare_indirect_dispatch = false) + : Program{kernel_name}, has_past_(has_past), kv_BNSH_(kv_BNSH), past_present_share_buffer_(past_present_share_buffer), prepare_indirect_dispatch_(prepare_indirect_dispatch) { } Status GenerateShaderCode(ShaderHelper& sh) const override; WEBGPU_PROGRAM_DEFINE_UNIFORM_VARIABLES({"copy_size", ProgramUniformVariableDataType::Uint32}, - {"past_sequence_length", ProgramUniformVariableDataType::Uint32}); + {"total_sequence_length", ProgramUniformVariableDataType::Uint32}, + {"kv_sequence_length", ProgramUniformVariableDataType::Uint32}, + {"tile_size", ProgramUniformVariableDataType::Uint32}, + {"num_heads", ProgramUniformVariableDataType::Uint32}); private: 
bool has_past_; bool kv_BNSH_; bool past_present_share_buffer_; + bool prepare_indirect_dispatch_; }; class FlashAttentionProgram final : public Program { @@ -75,8 +80,8 @@ class FlashAttentionProgram final : public Program { class FlashAttentionDecodeQKTProgram final : public Program { public: FlashAttentionDecodeQKTProgram(const std::string& kernel_name, - bool has_attention_bias, uint32_t tile_size) - : Program{kernel_name}, has_attention_bias_(has_attention_bias), tile_size_(tile_size) { + bool has_attention_bias, uint32_t tile_size, bool use_indirect_dispatch) + : Program{kernel_name}, has_attention_bias_(has_attention_bias), tile_size_(tile_size), use_indirect_dispatch_(use_indirect_dispatch) { } Status GenerateShaderCode(ShaderHelper& sh) const override; @@ -86,19 +91,19 @@ class FlashAttentionDecodeQKTProgram final : public Program { public: - FlashAttentionDecodeSplitVxProgram(const std::string& kernel_name, uint32_t tile_size, int head_size_vec) - : Program{kernel_name}, tile_size_(tile_size), head_size_vec_(head_size_vec) { + FlashAttentionDecodeSplitVxProgram(const std::string& kernel_name, uint32_t tile_size, int head_size_vec, bool use_indirect_dispatch) + : Program{kernel_name}, tile_size_(tile_size), head_size_vec_(head_size_vec), use_indirect_dispatch_(use_indirect_dispatch) { } Status GenerateShaderCode(ShaderHelper& sh) const override; @@ -107,19 +112,19 @@ class FlashAttentionDecodeSplitVxProgram final : public Program { public: - FlashAttentionDecodeVxReduceProgram(const std::string& kernel_name, uint32_t tile_size) - : Program{kernel_name}, tile_size_(tile_size) { + FlashAttentionDecodeVxReduceProgram(const std::string& kernel_name, uint32_t tile_size, uint32_t seq_tile_size, bool use_indirect_dispatch) + : Program{kernel_name}, tile_size_(tile_size), seq_tile_size_(seq_tile_size), use_indirect_dispatch_(use_indirect_dispatch) { } Status GenerateShaderCode(ShaderHelper& sh) const override; @@ -132,11 +137,13 @@ class FlashAttentionDecodeVxReduceProgram final : public Program tile_qk: array; $MAIN { let local_row = u32(local_idx / tile_size_k_vec); let local_col = local_idx % tile_size_k_vec; - let total_seq_offset = (workgroup_idx % uniforms.num_total_seq_length_tile) * tile_size; - let head_idx = u32(workgroup_idx / uniforms.num_total_seq_length_tile); +#if use_indirect_dispatch + let total_sequence_length = u32(seqlens_k[0]) + 1u; +#else + let total_sequence_length = uniforms.total_sequence_length; +#endif + let num_total_seq_length_tile = (total_sequence_length + tile_size - 1) / tile_size; + let total_seq_offset = (workgroup_idx % num_total_seq_length_tile) * tile_size; + let head_idx = u32(workgroup_idx / num_total_seq_length_tile); let q_offset = head_idx * uniforms.head_size_vec; - var total_sequence_length = uniforms.total_sequence_length; let present_offset = u32(head_idx / uniforms.n_reps) * uniforms.present_sequence_length * uniforms.head_size_vec; for (var k: u32 = 0u; k < uniforms.head_size_vec; k += tile_size_k_vec) { if (local_idx < tile_size_k_vec && k + local_idx < uniforms.head_size_vec) { @@ -95,7 +101,7 @@ $MAIN { for (var i = 0u; i < tile_size && (total_seq_offset + i) < total_sequence_length; i++) { l_sum += exp(f32(tile_qk[i]) - l_max); } - let meta_offset = head_idx * uniforms.num_present_sequence_length_tile + workgroup_idx % uniforms.num_total_seq_length_tile; + let meta_offset = head_idx * uniforms.num_present_sequence_length_tile + workgroup_idx % num_total_seq_length_tile; metadata[meta_offset] = metadata_value_t(l_max, l_sum); } } diff 
--git a/onnxruntime/contrib_ops/webgpu/bert/flash_attention_decode_split_vx.wgsl.template b/onnxruntime/contrib_ops/webgpu/bert/flash_attention_decode_split_vx.wgsl.template index c7593af311ce2..37cf7e8f11b1f 100644 --- a/onnxruntime/contrib_ops/webgpu/bert/flash_attention_decode_split_vx.wgsl.template +++ b/onnxruntime/contrib_ops/webgpu/bert/flash_attention_decode_split_vx.wgsl.template @@ -5,6 +5,7 @@ #param head_size_vec #param tile_size_k_vec #param sub_tile_count +#param use_indirect_dispatch // Note that this shader adopts similar algorithm with dp4a generation shader. // @@ -40,9 +41,14 @@ var qkv_values: array, $MAIN { let local_row = u32(local_idx / tile_size_k_vec); let local_col = local_idx % tile_size_k_vec; - let total_seq_offset = (workgroup_idx % uniforms.num_total_seq_length_tile) * tile_size; - let head_idx = u32(workgroup_idx / uniforms.num_total_seq_length_tile); - var total_sequence_length = uniforms.total_sequence_length; + #if use_indirect_dispatch + let total_sequence_length = u32(seqlens_k[0]) + 1u; + #else + let total_sequence_length = uniforms.total_sequence_length; + #endif + let num_total_seq_length_tile = (total_sequence_length + tile_size - 1) / tile_size; + let total_seq_offset = (workgroup_idx % num_total_seq_length_tile) * tile_size; + let head_idx = u32(workgroup_idx / num_total_seq_length_tile); let present_offset = u32(head_idx / uniforms.n_reps) * head_size_vec * uniforms.present_sequence_length; // Calculate the global max and sum in qk. @@ -50,12 +56,12 @@ $MAIN { { var g_max = f32(-3.402823e+38f); var g_sum = f32(0); - for (var i = 0u; i < uniforms.num_total_seq_length_tile; i++) + for (var i = 0u; i < num_total_seq_length_tile; i++) { let meta_offset = head_idx * uniforms.num_present_sequence_length_tile + i; g_max = max(g_max, metadata[meta_offset].x); } - for (var i = 0u; i < uniforms.num_total_seq_length_tile; i++) + for (var i = 0u; i < num_total_seq_length_tile; i++) { let meta_offset = head_idx * uniforms.num_present_sequence_length_tile + i; let m_value = metadata[meta_offset]; @@ -95,7 +101,7 @@ $MAIN { } for (var i = local_idx; i < head_size_vec; i += workgroup_size_x) { - let out_offset = head_idx * uniforms.num_present_sequence_length_tile * head_size_vec + (workgroup_idx % uniforms.num_total_seq_length_tile) * head_size_vec + i; + let out_offset = head_idx * uniforms.num_present_sequence_length_tile * head_size_vec + (workgroup_idx % num_total_seq_length_tile) * head_size_vec + i; out_split_vx[out_offset] = tile_output[i]; } } diff --git a/onnxruntime/contrib_ops/webgpu/bert/flash_attention_decode_vx_reduce.wgsl.template b/onnxruntime/contrib_ops/webgpu/bert/flash_attention_decode_vx_reduce.wgsl.template index a4381baa638ce..22f18655307de 100644 --- a/onnxruntime/contrib_ops/webgpu/bert/flash_attention_decode_vx_reduce.wgsl.template +++ b/onnxruntime/contrib_ops/webgpu/bert/flash_attention_decode_vx_reduce.wgsl.template @@ -1,7 +1,9 @@ // Copyright (c) Microsoft Corporation. All rights reserved. // Licensed under the MIT License. +#param seq_tile_size #param tile_size +#param use_indirect_dispatch // Inputs are splits of the GQA output, split into num_total_seq_length_tiles // rows. 
This shader needs to add these splits across the row dimension to @@ -23,10 +25,16 @@ $MAIN { var value = output_value_t(0); let local_row = u32(local_idx / tile_size); let local_col = local_idx % tile_size; + #if use_indirect_dispatch + let total_sequence_length = u32(seqlens_k[0]) + 1u; + let num_total_seq_length_tile = (total_sequence_length + seq_tile_size - 1) / seq_tile_size; + #else + let num_total_seq_length_tile = uniforms.num_total_seq_length_tile; + #endif if (head_size_offset + local_col < uniforms.head_size_vec) { - for (var r = 0u; r < uniforms.num_total_seq_length_tile; r += tile_size) { - if (r + local_row < uniforms.num_total_seq_length_tile) { + for (var r = 0u; r < num_total_seq_length_tile; r += tile_size) { + if (r + local_row < num_total_seq_length_tile) { value += input[in_offset + (r + local_row) * uniforms.head_size_vec + head_size_offset + local_col]; } } diff --git a/onnxruntime/contrib_ops/webgpu/bert/group_query_attention.cc b/onnxruntime/contrib_ops/webgpu/bert/group_query_attention.cc index 8b7b257dd2852..cb845061404f3 100644 --- a/onnxruntime/contrib_ops/webgpu/bert/group_query_attention.cc +++ b/onnxruntime/contrib_ops/webgpu/bert/group_query_attention.cc @@ -206,7 +206,7 @@ Status GroupQueryAttention::ComputeInternal(onnxruntime::webgpu::ComputeContext& !use_sliding_window && CanApplyFlashAttention(attention_bias, present_key, present_value, parameters, context)) { return ApplyFlashAttention(query, key, value, attention_bias, output, past_key, present_key, past_value, - present_value, parameters, context); + present_value, parameters, context, seqlen_k); } Tensor qSplit; diff --git a/onnxruntime/core/providers/webgpu/compute_context.h b/onnxruntime/core/providers/webgpu/compute_context.h index 315115390ff23..c4a88754deffe 100644 --- a/onnxruntime/core/providers/webgpu/compute_context.h +++ b/onnxruntime/core/providers/webgpu/compute_context.h @@ -8,6 +8,7 @@ #include #include "core/framework/execution_provider.h" +#include "core/providers/webgpu/webgpu_execution_provider.h" #include "core/providers/webgpu/program.h" #include "core/providers/webgpu/webgpu_context.h" @@ -16,7 +17,6 @@ namespace onnxruntime { class Tensor; -class WebGpuExecutionProvider; namespace webgpu { @@ -42,6 +42,9 @@ class ComputeContext { inline bool HasFeature(wgpu::FeatureName feature) const { return webgpu_context_.DeviceHasFeature(feature); } + inline bool IsGraphCaptureEnabled() const { + return ep_.IsGraphCaptureEnabled(); + } #if !defined(__wasm__) inline const wgpu::AdapterPropertiesSubgroupMatrixConfigs& SubgroupMatrixConfigs() const { return webgpu_context_.SubgroupMatrixConfigs(); From 5d786dfd39c4a2b561139a1cc6509d0f5d27eeef Mon Sep 17 00:00:00 2001 From: Ted Themistokleous <107195283+TedThemistokleous@users.noreply.github.com> Date: Tue, 14 Oct 2025 13:20:00 -0400 Subject: [PATCH 14/19] [MIGraphX EP ] Add support for QLinearAveragePool and QLinearGlobalAveragePool (#26162) ### Description Add support for MIgraphX EP operators QLinearGlobalAveragePool and QLinaerAveragePool ops ### Motivation and Context Want support for these operators through MIGraphX EP and MIGraphX --- .../core/providers/migraphx/migraphx_execution_provider.cc | 2 ++ 1 file changed, 2 insertions(+) diff --git a/onnxruntime/core/providers/migraphx/migraphx_execution_provider.cc b/onnxruntime/core/providers/migraphx/migraphx_execution_provider.cc index 239a5054801bc..55f901164bdac 100644 --- a/onnxruntime/core/providers/migraphx/migraphx_execution_provider.cc +++ 
b/onnxruntime/core/providers/migraphx/migraphx_execution_provider.cc @@ -953,6 +953,8 @@ GetUnsupportedNodeIndices(const GraphViewer& graph_viewer, "QLinearAdd", "QLinearConv", "QLinearMatMul", + "QLinearAveragePool", + "QLinearGlobalAveragePool", "QuantizeLinear", "QuickGelu", "DynamicQuantizeLinear", From 04ed484f739b0807b47ad3abed67da63218ae9f0 Mon Sep 17 00:00:00 2001 From: Wanming Lin Date: Wed, 15 Oct 2025 01:21:24 +0800 Subject: [PATCH 15/19] [WebNN] Fix some issues in reduction ops (#26289) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - Allow empty axes input - When axes is empty and ‘noop_with_empty_axes’ is true, WebNN should set axes to [] - Simplify the code --- js/web/test/suite-test-list.jsonc | 110 +++++++++--------- .../core/providers/webnn/builders/helper.h | 2 +- .../builders/impl/reduction_op_builder.cc | 102 ++++++++-------- 3 files changed, 105 insertions(+), 109 deletions(-) diff --git a/js/web/test/suite-test-list.jsonc b/js/web/test/suite-test-list.jsonc index 3f1face2a043c..80991a3ebbb5f 100644 --- a/js/web/test/suite-test-list.jsonc +++ b/js/web/test/suite-test-list.jsonc @@ -2147,66 +2147,66 @@ "test_reduce_log_sum_default", "test_reduce_log_sum_desc_axes", // tests "test_reduce_log_sum_exp_*" on opset17/opset18 are excluded because they use float64. - // "opset{7,8,9}/test_reduce_log_sum_exp_default_axes_keepdims_example", - // "opset{7,8,9}/test_reduce_log_sum_exp_default_axes_keepdims_random", - // "opset{7,8,9}/test_reduce_log_sum_exp_do_not_keepdims_example", - // "opset{7,8,9}/test_reduce_log_sum_exp_do_not_keepdims_random", - // "opset{7,8,9}/test_reduce_log_sum_exp_keepdims_example", - // "opset{7,8,9}/test_reduce_log_sum_exp_keepdims_random", - // "opset11/test_reduce_log_sum_exp_negative_axes_keepdims_example", - // "opset11/test_reduce_log_sum_exp_negative_axes_keepdims_random", + "opset{7,8,9}/test_reduce_log_sum_exp_default_axes_keepdims_example", + "opset{7,8,9}/test_reduce_log_sum_exp_default_axes_keepdims_random", + "opset{7,8,9}/test_reduce_log_sum_exp_do_not_keepdims_example", + "opset{7,8,9}/test_reduce_log_sum_exp_do_not_keepdims_random", + "opset{7,8,9}/test_reduce_log_sum_exp_keepdims_example", + "opset{7,8,9}/test_reduce_log_sum_exp_keepdims_random", + "opset11/test_reduce_log_sum_exp_negative_axes_keepdims_example", + "opset11/test_reduce_log_sum_exp_negative_axes_keepdims_random", "test_reduce_log_sum_negative_axes", "test_reduce_log_sum", "test_reduce_max_default_axes_keepdim_example", - // "test_reduce_max_default_axes_keepdims_random", - // "test_reduce_max_do_not_keepdims_example", - // "test_reduce_max_do_not_keepdims_random", - // "test_reduce_max_keepdims_example", - // "test_reduce_max_keepdims_random", - // "test_reduce_max_negative_axes_keepdims_example", - // "test_reduce_max_negative_axes_keepdims_random", - // "test_reduce_mean_default_axes_keepdims_example", - // "test_reduce_mean_default_axes_keepdims_random", - // "test_reduce_mean_do_not_keepdims_example", - // "test_reduce_mean_do_not_keepdims_random", - // "test_reduce_mean_keepdims_example", - // "test_reduce_mean_keepdims_random", - // "test_reduce_mean_negative_axes_keepdims_example", - // "test_reduce_mean_negative_axes_keepdims_random", - // "test_reduce_min_default_axes_keepdims_example", - // "test_reduce_min_default_axes_keepdims_random", - // "test_reduce_min_do_not_keepdims_example", - // "test_reduce_min_do_not_keepdims_random", - // "test_reduce_min_keepdims_example", - // "test_reduce_min_keepdims_random", - // 
"test_reduce_min_negative_axes_keepdims_example", - // "test_reduce_min_negative_axes_keepdims_random", - // "test_reduce_prod_default_axes_keepdims_example", - // "test_reduce_prod_default_axes_keepdims_random", - // "test_reduce_prod_do_not_keepdims_example", - // "test_reduce_prod_do_not_keepdims_random", - // "test_reduce_prod_keepdims_example", - // "test_reduce_prod_keepdims_random", - // "test_reduce_prod_negative_axes_keepdims_example", - // "test_reduce_prod_negative_axes_keepdims_random", - // "test_reduce_sum_default_axes_keepdims_example", - // "test_reduce_sum_default_axes_keepdims_random", - // "test_reduce_sum_do_not_keepdims_example", - // "test_reduce_sum_do_not_keepdims_random", + "test_reduce_max_default_axes_keepdims_random", + "test_reduce_max_do_not_keepdims_example", + "test_reduce_max_do_not_keepdims_random", + "test_reduce_max_keepdims_example", + "test_reduce_max_keepdims_random", + "test_reduce_max_negative_axes_keepdims_example", + "test_reduce_max_negative_axes_keepdims_random", + "test_reduce_mean_default_axes_keepdims_example", + "test_reduce_mean_default_axes_keepdims_random", + "test_reduce_mean_do_not_keepdims_example", + "test_reduce_mean_do_not_keepdims_random", + "test_reduce_mean_keepdims_example", + "test_reduce_mean_keepdims_random", + "test_reduce_mean_negative_axes_keepdims_example", + "test_reduce_mean_negative_axes_keepdims_random", + "test_reduce_min_default_axes_keepdims_example", + "test_reduce_min_default_axes_keepdims_random", + "test_reduce_min_do_not_keepdims_example", + "test_reduce_min_do_not_keepdims_random", + "test_reduce_min_keepdims_example", + "test_reduce_min_keepdims_random", + "test_reduce_min_negative_axes_keepdims_example", + "test_reduce_min_negative_axes_keepdims_random", + "test_reduce_prod_default_axes_keepdims_example", + "test_reduce_prod_default_axes_keepdims_random", + "test_reduce_prod_do_not_keepdims_example", + "test_reduce_prod_do_not_keepdims_random", + "test_reduce_prod_keepdims_example", + "test_reduce_prod_keepdims_random", + "test_reduce_prod_negative_axes_keepdims_example", + "test_reduce_prod_negative_axes_keepdims_random", + "test_reduce_sum_default_axes_keepdims_example", + "test_reduce_sum_default_axes_keepdims_random", + "test_reduce_sum_do_not_keepdims_example", + "test_reduce_sum_do_not_keepdims_random", "test_reduce_sum_empty_axes_input_noop_example", "test_reduce_sum_empty_axes_input_noop_random", - // "test_reduce_sum_keepdims_example", - // "test_reduce_sum_keepdims_random", - // "test_reduce_sum_negative_axes_keepdims_example", - // "test_reduce_sum_negative_axes_keepdims_random", - // "test_reduce_sum_square_default_axes_keepdims_example", - // "test_reduce_sum_square_default_axes_keepdims_random", - // "test_reduce_sum_square_do_not_keepdims_example", - // "test_reduce_sum_square_do_not_keepdims_random", - // "test_reduce_sum_square_keepdims_example", - // "test_reduce_sum_square_keepdims_random", - // "test_reduce_sum_square_negative_axes_keepdims_example", - // "test_reduce_sum_square_negative_axes_keepdims_random", + "test_reduce_sum_keepdims_example", + "test_reduce_sum_keepdims_random", + "test_reduce_sum_negative_axes_keepdims_example", + "test_reduce_sum_negative_axes_keepdims_random", + "test_reduce_sum_square_default_axes_keepdims_example", + "test_reduce_sum_square_default_axes_keepdims_random", + "test_reduce_sum_square_do_not_keepdims_example", + "test_reduce_sum_square_do_not_keepdims_random", + "test_reduce_sum_square_keepdims_example", + "test_reduce_sum_square_keepdims_random", + 
"test_reduce_sum_square_negative_axes_keepdims_example", + "test_reduce_sum_square_negative_axes_keepdims_random", // "test_reflect_pad", "test_relu", "test_reshape_allowzero_reordered", diff --git a/onnxruntime/core/providers/webnn/builders/helper.h b/onnxruntime/core/providers/webnn/builders/helper.h index baedb98a34c28..fbabc23504636 100644 --- a/onnxruntime/core/providers/webnn/builders/helper.h +++ b/onnxruntime/core/providers/webnn/builders/helper.h @@ -38,7 +38,7 @@ WebnnDeviceType DeviceTypeFromString(const std::string_view& device_type); // Collects all the initializer tensors in the subGraph and its ancestor graphs. InitializedTensorSet CollectAllInitializedTensors(const GraphViewer& graph_viewer); -inline std::vector HandleNegativeAxes(const std::vector& axes, size_t input_size) { +inline std::vector HandleNegativeAxes(const gsl::span axes, size_t input_size) { std::vector new_axes(axes.size()); for (size_t i = 0; i < axes.size(); ++i) { new_axes[i] = HandleNegativeAxis(axes[i], input_size); diff --git a/onnxruntime/core/providers/webnn/builders/impl/reduction_op_builder.cc b/onnxruntime/core/providers/webnn/builders/impl/reduction_op_builder.cc index 6ea9b0a440d93..d07e636d578b1 100644 --- a/onnxruntime/core/providers/webnn/builders/impl/reduction_op_builder.cc +++ b/onnxruntime/core/providers/webnn/builders/impl/reduction_op_builder.cc @@ -19,6 +19,8 @@ namespace webnn { class ReductionOpBuilder : public BaseOpBuilder { // Add operator related. public: + // Allow axes potentially being empty inputs that are ignored during processing. + ReductionOpBuilder() : BaseOpBuilder(/*allow empty inputs*/ true) {} void AddInitializersToSkip(ModelBuilder& model_builder, const Node& node) const override; // Add operator related. @@ -37,6 +39,7 @@ void ReductionOpBuilder::AddInitializersToSkip(ModelBuilder& model_builder, cons const auto& input_defs = node.InputDefs(); if (input_defs.size() > 1) { model_builder.AddInitializerToSkip(input_defs[1]->Name()); // axes + model_builder.AddInputToSkip(input_defs[1]->Name()); // axes } } @@ -53,71 +56,50 @@ Status ReductionOpBuilder::AddToModelBuilderImpl(ModelBuilder& model_builder, NodeAttrHelper helper(node); const auto keep_dims = helper.Get("keepdims", 1); + emscripten::val options = emscripten::val::object(); options.set("label", node.Name()); options.set("keepDimensions", keep_dims == 1); - std::vector axes_data; - - emscripten::val output = emscripten::val::object(); + std::vector axes_data; const auto opset = node.SinceVersion(); const auto& op_type = node.OpType(); if (opset >= 18 || (op_type == "ReduceSum" && opset >= 13)) { // 'axes' is an optional input. - const auto noop_with_empty_axes = helper.Get("noop_with_empty_axes", 0); - if (!GetTensorName(input_defs, 1).empty()) { - // Optional input axes is provided, use axes initializer data. - const auto& initializers(model_builder.GetInitializerTensors()); - const auto& axes_tensor = *initializers.at(input_defs[1]->Name()); - Initializer axes_initializer(axes_tensor); - const auto axes_data_span = axes_initializer.DataAsSpan(); - std::transform( - axes_data_span.begin(), axes_data_span.end(), std::back_inserter(axes_data), - [input_rank](int64_t axis) -> int32_t { return SafeInt(HandleNegativeAxis(axis, input_rank)); }); - } else { - if (noop_with_empty_axes) { - // When axes is empty and this attribute is set to true, input tensor will not be reduced. 
- output = input; - model_builder.AddOperand(node.OutputDefs()[0]->Name(), std::move(output)); - return Status::OK(); + std::vector axes_shape; + if (TensorExists(input_defs, 1)) { + ORT_RETURN_IF_NOT(GetShape(*input_defs[1], axes_shape, logger), "Cannot get shape of input axes"); + if (axes_shape[0] != 0) { + // Optional input axes is provided and we already ensure it is an initializer. + // Use that initializer data. + const auto& initializers(model_builder.GetInitializerTensors()); + const auto& axes_tensor = *initializers.at(input_defs[1]->Name()); + Initializer axes_initializer(axes_tensor); + const auto axes_data_span = axes_initializer.DataAsSpan(); + axes_data = HandleNegativeAxes(axes_data_span, input_rank); } } } else { if (helper.HasAttr("axes")) { - auto axes = helper.Get("axes", std::vector{}); - std::transform( - axes.begin(), axes.end(), std::back_inserter(axes_data), - [input_rank](int64_t axis) -> int32_t { return SafeInt(HandleNegativeAxis(axis, input_rank)); }); + axes_data = GetResolvedAxes(helper, input_rank); } } - if (axes_data.size() > 0) { - options.set("axes", emscripten::val::array(axes_data)); - } - if (op_type == "ReduceL1") { - output = model_builder.GetBuilder().call("reduceL1", input, options); - } else if (op_type == "ReduceL2") { - output = model_builder.GetBuilder().call("reduceL2", input, options); - } else if (op_type == "ReduceLogSum") { - output = model_builder.GetBuilder().call("reduceLogSum", input, options); - } else if (op_type == "ReduceLogSumExp") { - output = model_builder.GetBuilder().call("reduceLogSumExp", input, options); - } else if (op_type == "ReduceMax") { - output = model_builder.GetBuilder().call("reduceMax", input, options); - } else if (op_type == "ReduceMean") { - output = model_builder.GetBuilder().call("reduceMean", input, options); - } else if (op_type == "ReduceMin") { - output = model_builder.GetBuilder().call("reduceMin", input, options); - } else if (op_type == "ReduceProd") { - output = model_builder.GetBuilder().call("reduceProduct", input, options); - } else if (op_type == "ReduceSum") { - output = model_builder.GetBuilder().call("reduceSum", input, options); - } else if (op_type == "ReduceSumSquare") { - output = model_builder.GetBuilder().call("reduceSumSquare", input, options); - } else { - return ORT_MAKE_STATUS(ONNXRUNTIME, INVALID_ARGUMENT, "ReductionOpBuilder, unknown op: ", op_type); + // When axes is not provided or is empty, check the 'noop_with_empty_axes' attribute: + // - If it is false, perform reduction over all dimensions. + // (In WebNN, this means the 'axes' option is not set.) + // - If it is true, no reduction is applied, but other operations are still performed. + // (In WebNN, this requires setting 'axes' to an empty array.) 
+ if (!axes_data.empty() || helper.Get("noop_with_empty_axes", 0) == 1) { + options.set("axes", emscripten::val::array(GetNarrowedIntFromInt64(axes_data))); } + const std::string_view webnn_op_type = GetWebNNOpType(op_type); + ORT_RETURN_IF(webnn_op_type.empty(), "Cannot get WebNN op type"); + + emscripten::val output = model_builder.GetBuilder().call( + std::string(webnn_op_type).c_str(), input, options); + model_builder.AddOperand(node.OutputDefs()[0]->Name(), std::move(output)); return Status::OK(); } @@ -128,11 +110,25 @@ bool ReductionOpBuilder::IsOpSupportedImpl(const GraphViewer& graph_viewer, const WebnnDeviceType /* device_type */, const logging::Logger& logger) const { const auto& input_defs = node.InputDefs(); - const std::string axes_name = GetTensorName(input_defs, 1); - // If the optional input 'axes' is provided, it must be an initializer. - if (!axes_name.empty() && !graph_viewer.GetConstantInitializer(axes_name)) { - LOGS(logger, VERBOSE) << "Input axes of " << node.OpType() << " must be a constant"; - return false; + + if (TensorExists(input_defs, 1)) { + std::vector axes_shape; + if (!GetShape(*input_defs[1], axes_shape, logger)) { + LOGS(logger, VERBOSE) << "Cannot get shape of input axes"; + return false; + } + + if (axes_shape.size() != 1) { + LOGS(logger, VERBOSE) << "Input axes of " << node.OpType() << " must be 1D"; + return false; + } + + const std::string axes_name = GetTensorName(input_defs, 1); + // If the optional input 'axes' is provided and not empty, it must be an initializer. + if (axes_shape[0] != 0 && !graph_viewer.GetConstantInitializer(axes_name)) { + LOGS(logger, VERBOSE) << "Input axes of " << node.OpType() << " must be a constant"; + return false; + } } return true; From aafdb3a517d03b270b1bd9d648615d8bbfd7d619 Mon Sep 17 00:00:00 2001 From: Changming Sun Date: Tue, 14 Oct 2025 11:10:57 -0700 Subject: [PATCH 16/19] Fix shape inference failure with in-memory external data (#26263) ## Description Fixes #26261 This PR resolves a regression introduced in v1.23.0 where models with Constant nodes containing tensors larger than 127 bytes fail to load with a shape inference error. ### Root Cause Commit 3b97d79b3c (PR #25320) introduced an optimization to convert large Constant node tensors (> 127 bytes) into OrtValues with in-memory external data references for better memory management. However, ONNX shape inference cannot distinguish between in-memory and file-based external data, and rejects any TensorProto with `data_location = EXTERNAL`. ### The Fix Modified `InferenceContextImpl::getInputData()` to: 1. Detect tensors with in-memory external data using `utils::HasExternalDataInMemory()` 2. Retrieve the corresponding OrtValue 3. Create a temporary TensorProto with embedded data (not external reference) 4. Provide this temporary proto to ONNX shape inference This allows ONNX shape inference to access the actual tensor data without rejecting it as external. ### Memory Impact This fix introduces a minor and temporary increase in memory usage during the model loading phase. - **When:** The additional memory is allocated only when the shape inference engine needs to access the data of a constant tensor that is larger than 127 bytes. This is a one-time event during the initial analysis of the model. - **What:** The fix creates a temporary in-memory copy of the tensor data. - **Duration:** This temporary copy is released as soon as shape inference is complete. The impact on the overall peak memory usage of the application is expected to be negligible. 
The memory usage during inference is not affected. While it is theoretically possible for the temporary tensor to be large if a multi-gigabyte constant tensor is used for shape inference, this is a highly unlikely scenario in practice for well-designed models. ### Testing - Tested with the problematic model from issue #26261 - All optimization levels now work correctly (DISABLE_ALL, BASIC, EXTENDED, ALL) - Unit tests to be added ### Changes - **onnxruntime/core/graph/graph.cc**: - Modified `getInputData()` method in `InferenceContextImpl` class - Added `temp_tensor_protos_` member to store temporary TensorProtos during shape inference ## TODO - [ ] Add unit tests - [ ] Run full test suite --------- Co-authored-by: Dmitri Smirnov --- onnxruntime/core/graph/graph.cc | 26 +++ onnxruntime/test/ir/graph_test.cc | 258 ++++++++++++++++++++++++++++++ 2 files changed, 284 insertions(+) diff --git a/onnxruntime/core/graph/graph.cc b/onnxruntime/core/graph/graph.cc index 3f6443aa73d4c..8b599dc86d997 100644 --- a/onnxruntime/core/graph/graph.cc +++ b/onnxruntime/core/graph/graph.cc @@ -2678,6 +2678,27 @@ class InferenceContextImpl : public ONNX_NAMESPACE::InferenceContext { // only return data if it's for a constant initializer. checks for outer scope initializers // if this is a subgraph and the name isn't found locally. const TensorProto* initializer = graph_.GetConstantInitializer(def->Name(), true); + if (initializer != nullptr) { + // Check if this is in-memory external data (data stored in OrtValue) + // ONNX shape inference cannot handle external data, so we need to materialize it + if (utils::HasExternalDataInMemory(*initializer)) { + // Try to get the OrtValue for this initializer + OrtValue ort_value; + if (graph_.GetOrtValueInitializer(def->Name(), ort_value, true)) { + // Create a temporary TensorProto with the actual data from the OrtValue + // This allows ONNX shape inference to access the data + const Tensor& tensor = ort_value.Get(); + auto temp_tensor_proto = utils::TensorToTensorProto(tensor, initializer->name(), /*use_tensor_buffer=*/false); + // Store the temporary proto so it outlives this call, maintain pointers steady + temp_tensor_protos_.push_back(std::make_unique(std::move(temp_tensor_proto))); + return temp_tensor_protos_.back().get(); + } else { + // If we can't get the OrtValue, it is a bug + ORT_THROW("Initializer ", def->Name(), + " has in-memory external data but cannot get OrtValue during shape inference"); + } + } + } return initializer; } @@ -2717,6 +2738,11 @@ class InferenceContextImpl : public ONNX_NAMESPACE::InferenceContext { std::vector> graph_inferencers_; const Graph& graph_; const Graph::ResolveOptions& options_; + // Temporary TensorProtos created for in-memory external data during shape inference + // These need to outlive the shape inference call, so we store them here + // Inference is per node and the instance of this context is on the stack, + // so this is safe. + mutable InlinedVector> temp_tensor_protos_; }; Status Graph::InferAndVerifySubgraphTypes(const Node& node, Graph& subgraph, diff --git a/onnxruntime/test/ir/graph_test.cc b/onnxruntime/test/ir/graph_test.cc index 4fd9830440846..7371ad5cf0ded 100644 --- a/onnxruntime/test/ir/graph_test.cc +++ b/onnxruntime/test/ir/graph_test.cc @@ -2,13 +2,17 @@ // Licensed under the MIT License. 
#include +#include #include "core/common/inlined_containers.h" #include "core/common/span_utils.h" #include "core/framework/tensorprotoutils.h" #include "core/graph/graph_viewer.h" #include "core/graph/model.h" #include "core/graph/op.h" +#include "core/session/inference_session.h" +#include "core/session/environment.h" #include "test/providers/provider_test_utils.h" +#include "test/test_environment.h" #include "gtest/gtest.h" #include "gmock/gmock.h" #include "onnx/defs/function.h" @@ -2573,5 +2577,259 @@ TEST_F(GraphTest, GraphConstruction_MemoryEfficientTopologicalSort_SubgraphGener #endif +// Test for shape inference with in-memory external data (issue #26261) +// This tests the fix for a regression where Constant nodes with large tensors (>127 bytes) +// stored as in-memory external data would cause shape inference to fail +TEST_F(GraphTest, ShapeInferenceWithInMemoryExternalData) { + // Create a model with a Constant node that produces a tensor larger than kSmallTensorExternalDataThreshold (127 bytes) + // This will trigger the in-memory externalization path + ModelProto model_proto; + model_proto.set_ir_version(ONNX_NAMESPACE::Version::IR_VERSION); + auto* opset = model_proto.add_opset_import(); + opset->set_version(17); + + auto* graph_proto = model_proto.mutable_graph(); + graph_proto->set_name("test_graph"); + + // Create a Constant node with a tensor of 16 INT64 values (128 bytes, just over the 127 threshold) + auto* constant_node = graph_proto->add_node(); + constant_node->set_op_type("Constant"); + constant_node->set_name("const_node"); + constant_node->add_output("const_output"); + + // Add the value attribute with a tensor + auto* attr = constant_node->add_attribute(); + attr->set_name("value"); + attr->set_type(ONNX_NAMESPACE::AttributeProto_AttributeType_TENSOR); + auto* tensor = attr->mutable_t(); + tensor->set_data_type(ONNX_NAMESPACE::TensorProto_DataType_INT64); + tensor->add_dims(16); // 16 elements * 8 bytes = 128 bytes + // Each split will be size 1, totaling 16 + for (int64_t i = 0; i < 16; ++i) { + tensor->add_int64_data(1); + } + + // Create a Split node that uses the constant as input + // Split requires constant input for the 'split' parameter, which triggers shape inference + auto* split_node = graph_proto->add_node(); + split_node->set_op_type("Split"); + split_node->set_name("split_node"); + split_node->add_input("input_data"); + split_node->add_input("const_output"); // Use constant as split sizes + for (int i = 0; i < 16; ++i) { + split_node->add_output("split_output_" + std::to_string(i)); + } + + // Add axis attribute + auto* axis_attr = split_node->add_attribute(); + axis_attr->set_name("axis"); + axis_attr->set_type(ONNX_NAMESPACE::AttributeProto_AttributeType_INT); + axis_attr->set_i(0); + + // Add graph input + auto* input = graph_proto->add_input(); + input->set_name("input_data"); + auto* input_type = input->mutable_type()->mutable_tensor_type(); + input_type->set_elem_type(ONNX_NAMESPACE::TensorProto_DataType_FLOAT); + input_type->mutable_shape()->add_dim()->set_dim_value(16); + input_type->mutable_shape()->add_dim()->set_dim_value(10); + + // Add graph outputs + for (int i = 0; i < 16; ++i) { + auto* output = graph_proto->add_output(); + output->set_name("split_output_" + std::to_string(i)); + } + + // Load the model - this should succeed with the fix + // Before the fix, this would fail with: + // "Cannot parse data from external tensors. 
Please load external data into raw data for tensor" + std::shared_ptr model; + ASSERT_STATUS_OK(Model::Load(std::move(model_proto), model, nullptr, *logger_)); + + // Verify the graph was properly constructed + Graph& graph = model->MainGraph(); + ASSERT_STATUS_OK(graph.Resolve()); + + // Verify the constant node was converted to an initializer + const ONNX_NAMESPACE::TensorProto* initializer = nullptr; + ASSERT_TRUE(graph.GetInitializedTensor("const_output", initializer)); + ASSERT_NE(initializer, nullptr); + + // Verify the Split node can access the constant data during shape inference + const Node* split_node_ptr = nullptr; + for (const auto& node : graph.Nodes()) { + if (node.Name() == "split_node") { + split_node_ptr = &node; + break; + } + } + ASSERT_NE(split_node_ptr, nullptr); + + // Verify outputs are properly shaped + ASSERT_EQ(split_node_ptr->OutputDefs().size(), 16u); +} + +// Test for shape inference with in-memory external data using InferenceSession +// This test more accurately reproduces the issue by going through the full session initialization +// which includes graph optimizations that trigger the in-memory externalization +TEST_F(GraphTest, ShapeInferenceWithInMemoryExternalDataViaSession) { + // Create the same model as above + ModelProto model_proto; + model_proto.set_ir_version(ONNX_NAMESPACE::Version::IR_VERSION); + auto* opset = model_proto.add_opset_import(); + opset->set_version(17); + + auto* graph_proto = model_proto.mutable_graph(); + graph_proto->set_name("test_graph"); + + // Create a Constant node with a tensor of 16 INT64 values (128 bytes) + auto* constant_node = graph_proto->add_node(); + constant_node->set_op_type("Constant"); + constant_node->set_name("const_node"); + constant_node->add_output("const_output"); + + auto* attr = constant_node->add_attribute(); + attr->set_name("value"); + attr->set_type(ONNX_NAMESPACE::AttributeProto_AttributeType_TENSOR); + auto* tensor = attr->mutable_t(); + tensor->set_data_type(ONNX_NAMESPACE::TensorProto_DataType_INT64); + tensor->add_dims(16); + for (int64_t i = 0; i < 16; ++i) { + tensor->add_int64_data(1); + } + + // Create a Split node + auto* split_node = graph_proto->add_node(); + split_node->set_op_type("Split"); + split_node->set_name("split_node"); + split_node->add_input("input_data"); + split_node->add_input("const_output"); + for (int i = 0; i < 16; ++i) { + split_node->add_output("split_output_" + std::to_string(i)); + } + + auto* axis_attr = split_node->add_attribute(); + axis_attr->set_name("axis"); + axis_attr->set_type(ONNX_NAMESPACE::AttributeProto_AttributeType_INT); + axis_attr->set_i(0); + + // Add graph input + auto* input = graph_proto->add_input(); + input->set_name("input_data"); + auto* input_type = input->mutable_type()->mutable_tensor_type(); + input_type->set_elem_type(ONNX_NAMESPACE::TensorProto_DataType_FLOAT); + input_type->mutable_shape()->add_dim()->set_dim_value(16); + input_type->mutable_shape()->add_dim()->set_dim_value(10); + + // Add graph outputs + for (int i = 0; i < 16; ++i) { + auto* output = graph_proto->add_output(); + output->set_name("split_output_" + std::to_string(i)); + } + + // Save to a temporary file + const std::string model_path = "test_in_memory_external_data.onnx"; + { + std::ofstream file(model_path, std::ios::binary); + ASSERT_TRUE(file.is_open()); + ASSERT_TRUE(model_proto.SerializeToOstream(&file)); + } + + // Test with ORT_DISABLE_ALL optimization which should trigger the bug without the fix + SessionOptions so; + so.graph_optimization_level = 
TransformerLevel::Default; // This triggers the issue + so.session_logid = "GraphTest.ShapeInferenceWithInMemoryExternalDataViaSession"; + + InferenceSession session_object{so, GetEnvironment()}; + + // This should succeed with the fix, fail without it + ASSERT_STATUS_OK(session_object.Load(model_path)); + ASSERT_STATUS_OK(session_object.Initialize()); + + // Clean up + std::remove(model_path.c_str()); +} + +// Test that explicitly triggers the in-memory externalization and then shape inference +// This test directly reproduces the bug scenario +TEST_F(GraphTest, ShapeInferenceAfterInitializerExternalization) { + // Create a model with a Split node that depends on a constant initializer + ModelProto model_proto; + model_proto.set_ir_version(ONNX_NAMESPACE::Version::IR_VERSION); + auto* opset = model_proto.add_opset_import(); + opset->set_version(17); + + auto* graph_proto = model_proto.mutable_graph(); + graph_proto->set_name("test_graph"); + + // Create initializer directly (not as Constant node) with 128 bytes + auto* initializer = graph_proto->add_initializer(); + initializer->set_name("split_sizes"); + initializer->set_data_type(ONNX_NAMESPACE::TensorProto_DataType_INT64); + initializer->add_dims(16); // 16 * 8 = 128 bytes + for (int64_t i = 0; i < 16; ++i) { + initializer->add_int64_data(1); + } + + // Create a Split node that uses this initializer + auto* split_node = graph_proto->add_node(); + split_node->set_op_type("Split"); + split_node->set_name("split_node"); + split_node->add_input("input_data"); + split_node->add_input("split_sizes"); // Uses the large initializer + for (int i = 0; i < 16; ++i) { + split_node->add_output("split_output_" + std::to_string(i)); + } + + auto* axis_attr = split_node->add_attribute(); + axis_attr->set_name("axis"); + axis_attr->set_type(ONNX_NAMESPACE::AttributeProto_AttributeType_INT); + axis_attr->set_i(0); + + // Add graph input + auto* input = graph_proto->add_input(); + input->set_name("input_data"); + auto* input_type = input->mutable_type()->mutable_tensor_type(); + input_type->set_elem_type(ONNX_NAMESPACE::TensorProto_DataType_FLOAT); + input_type->mutable_shape()->add_dim()->set_dim_value(16); + input_type->mutable_shape()->add_dim()->set_dim_value(10); + + // Add graph outputs + for (int i = 0; i < 16; ++i) { + auto* output = graph_proto->add_output(); + output->set_name("split_output_" + std::to_string(i)); + } + + // Load model + std::shared_ptr model; + ASSERT_STATUS_OK(Model::Load(std::move(model_proto), model, nullptr, *logger_)); + + Graph& graph = model->MainGraph(); + // First resolve should succeed + ASSERT_STATUS_OK(graph.Resolve()); + + // Now trigger the in-memory externalization + // This converts initializers > 127 bytes to OrtValues with external data references + Status convert_status = graph.ConvertInitializersIntoOrtValues(); + ASSERT_TRUE(convert_status.IsOK()) << "ConvertInitializersIntoOrtValues failed: " << convert_status.ErrorMessage(); + + // Check if the initializer was actually externalized + const ONNX_NAMESPACE::TensorProto* initializer_after = nullptr; + ASSERT_TRUE(graph.GetInitializedTensor("split_sizes", initializer_after)); + ASSERT_NE(initializer_after, nullptr); + // Debug: verify it was externalized + ASSERT_TRUE(utils::HasExternalDataInMemory(*initializer_after)) + << "Initializer was not externalized to in-memory external data"; + + // Mark the graph as needing resolve to force shape inference to run again + graph.SetGraphResolveNeeded(); + + // Resolve again - this should trigger shape inference with 
the externalized initializer + // Without the fix, this will fail with "Cannot parse data from external tensors" + // With the fix, getInputData() materializes the external data for shape inference + Status second_resolve = graph.Resolve(); + ASSERT_TRUE(second_resolve.IsOK()) << "Second resolve failed: " << second_resolve.ErrorMessage(); +} + } // namespace test } // namespace onnxruntime From 654137fab69f80265e5195deba209a635b4edb97 Mon Sep 17 00:00:00 2001 From: Yateng Hong Date: Wed, 15 Oct 2025 04:41:01 +0800 Subject: [PATCH 17/19] [TensorRT] Fix DDS output bug during engine update (#26272) ### Description Fix a bug in the TRT Execution Provider where the DDS output tensor was not bound after an engine update. ### Motivation and Context The `dds_output_allocator_map` is not cleared on engine update, so that it will mis-recognized as a known DDS and will not bind the output allocation. Script to reproduce the issue: ```:python # create an onnx model with: # inputs: data -> NonZeros(data) -> GatherND -> output # then run the model with onnxruntime def create_model(): import onnx from onnx import helper, TensorProto input = helper.make_tensor_value_info("data", TensorProto.FLOAT, ["d1", "d2"]) output = helper.make_tensor_value_info("output", TensorProto.FLOAT, ["nzr"]) nonzeros_node = helper.make_node("NonZero", ["data"], ["nonzeros"], "nonzeros_node") transpose_node = helper.make_node( "Transpose", ["nonzeros"], ["nonzeros_t"], "transpose_node" ) gathernd_node = helper.make_node( "GatherND", ["data", "nonzeros_t"], ["output"], "gathernd_node" ) value_info = [ helper.make_tensor_value_info("nonzeros", TensorProto.INT64, [2, "nzr"]), helper.make_tensor_value_info("nonzeros_t", TensorProto.INT64, ["nzr", 2]), ] graph = helper.make_graph( [nonzeros_node, transpose_node, gathernd_node], "test_graph", [input], [output], value_info=value_info, ) model = helper.make_model(graph) onnx.save(model, "model_dds.onnx") def run_model(): import onnxruntime as ort import numpy as np sess = ort.InferenceSession("model_dds.onnx", providers=["TensorrtExecutionProvider", "CUDAExecutionProvider", "CPUExecutionProvider"]) print("Running with data shape (3,4)") data = np.random.randn(3, 4).astype(np.float32) sess.run(None, {"data": data}) print("Running with data shape (5,6)") data = np.random.randn(5, 6).astype(np.float32) sess.run(None, {"data": data}) create_model() run_model() ``` Before the change: > IExecutionContext::enqueueV3: Error Code 3: API Usage Error (Parameter check failed, condition: mContext.profileObliviousBindings.at(profileObliviousIndex) || getPtrOrNull(mOutputAllocators, profileObliviousIndex). Neither address or allocator is set for output tensor scores. Call setOutputTensorAddress, setTensorAddress or setOutputAllocator before enqueue/execute.) ... Status Message: TensorRT EP execution context enqueue failed. 
--- .../tensorrt/tensorrt_execution_provider.cc | 4 ++ .../providers/tensorrt/tensorrt_basic_test.cc | 46 +++++++++++++++++++ .../test/testdata/ort_github_issue_26272.py | 26 +++++++++++ .../testdata/ort_github_issue_26272_dds.onnx | 28 +++++++++++ 4 files changed, 104 insertions(+) create mode 100644 onnxruntime/test/testdata/ort_github_issue_26272.py create mode 100644 onnxruntime/test/testdata/ort_github_issue_26272_dds.onnx diff --git a/onnxruntime/core/providers/tensorrt/tensorrt_execution_provider.cc b/onnxruntime/core/providers/tensorrt/tensorrt_execution_provider.cc index 508d932459bf9..cd0c0e4bffdb5 100644 --- a/onnxruntime/core/providers/tensorrt/tensorrt_execution_provider.cc +++ b/onnxruntime/core/providers/tensorrt/tensorrt_execution_provider.cc @@ -3976,6 +3976,10 @@ Status TensorrtExecutionProvider::CreateNodeComputeInfoFromGraph(const GraphView // Destroy the IExecutionContext objects before destroying an engine object, otherwise it will lead to undefined behavior. trt_state->context->reset(); trt_state->engine->reset(); + + // Clear dds output allocator map since the engine and context will be recreated. + dds_output_allocator_map.clear(); + auto trt_config = std::unique_ptr(trt_builder->createBuilderConfig()); if (max_workspace_size_ > 0) { trt_config->setMemoryPoolLimit(nvinfer1::MemoryPoolType::kWORKSPACE, max_workspace_size_); diff --git a/onnxruntime/test/providers/tensorrt/tensorrt_basic_test.cc b/onnxruntime/test/providers/tensorrt/tensorrt_basic_test.cc index 327dfab96c2d1..a746493d779f8 100644 --- a/onnxruntime/test/providers/tensorrt/tensorrt_basic_test.cc +++ b/onnxruntime/test/providers/tensorrt/tensorrt_basic_test.cc @@ -713,6 +713,52 @@ TEST(TensorrtExecutionProviderTest, TRTPluginsCustomOpTest) { ASSERT_TRUE(status.IsOK()); } +TEST(TensorrtExecutionProviderTest, DDSOutputTest) { + PathString model_name = ORT_TSTR("testdata/ort_github_issue_26272_dds.onnx"); + SessionOptions so; + so.session_logid = "TensorrtExecutionProviderRunWithDDSOutput"; + RunOptions run_options; + run_options.run_tag = so.session_logid; + InferenceSession session_object{so, GetEnvironment()}; + auto cuda_provider = DefaultCudaExecutionProvider(); + auto cuda_allocator = cuda_provider->CreatePreferredAllocators()[1]; + std::vector dims_op_x = {3, 4}; + std::vector values_op_x(12, 0.f); // 12=3*4 + OrtValue ml_value_x; + CreateMLValue(cuda_allocator, dims_op_x, values_op_x, &ml_value_x); + + NameMLValMap feeds; + feeds.insert(std::make_pair("data", ml_value_x)); + + // prepare outputs + std::vector output_names; + output_names.push_back("output"); + std::vector fetches; + + OrtTensorRTProviderOptionsV2 params; + std::unique_ptr execution_provider = TensorrtExecutionProviderWithOptions(¶ms); + EXPECT_TRUE(session_object.RegisterExecutionProvider(std::move(execution_provider)).IsOK()); + auto status = session_object.Load(model_name); + ASSERT_TRUE(status.IsOK()); + status = session_object.Initialize(); + ASSERT_TRUE(status.IsOK()); + + // First pass run + status = session_object.Run(run_options, feeds, output_names, &fetches); + ASSERT_TRUE(status.IsOK()); + + // Second pass run with new shape + dims_op_x = {6, 4}; + values_op_x.resize(24, 0.f); // 24=6*4 + CreateMLValue(cuda_allocator, dims_op_x, values_op_x, &ml_value_x); + feeds.clear(); + + feeds.insert(std::make_pair("data", ml_value_x)); + + status = session_object.Run(run_options, feeds, output_names, &fetches); + ASSERT_TRUE(status.IsOK()); +} + TEST_P(TensorrtExecutionProviderCacheTest, Run) { // GetParam() returns the parameter 
of following format: // ##cache type##_##input shape type## diff --git a/onnxruntime/test/testdata/ort_github_issue_26272.py b/onnxruntime/test/testdata/ort_github_issue_26272.py new file mode 100644 index 0000000000000..fa381e5df1094 --- /dev/null +++ b/onnxruntime/test/testdata/ort_github_issue_26272.py @@ -0,0 +1,26 @@ +import onnx +from onnx import TensorProto, helper + +# Create a simple ONNX model with DDS output +input = helper.make_tensor_value_info("data", TensorProto.FLOAT, ["d1", "d2"]) +output = helper.make_tensor_value_info("output", TensorProto.FLOAT, ["nzr"]) + +nonzeros_node = helper.make_node("NonZero", ["data"], ["nonzeros"], "nonzeros_node") +transpose_node = helper.make_node("Transpose", ["nonzeros"], ["nonzeros_t"], "transpose_node") +gathernd_node = helper.make_node("GatherND", ["data", "nonzeros_t"], ["output"], "gathernd_node") + +value_info = [ + helper.make_tensor_value_info("nonzeros", TensorProto.INT64, [2, "nzr"]), + helper.make_tensor_value_info("nonzeros_t", TensorProto.INT64, ["nzr", 2]), +] + +graph = helper.make_graph( + [nonzeros_node, transpose_node, gathernd_node], + "test_graph", + [input], + [output], + value_info=value_info, +) + +model = helper.make_model(graph) +onnx.save(model, "ort_github_issue_26272_dds.onnx") diff --git a/onnxruntime/test/testdata/ort_github_issue_26272_dds.onnx b/onnxruntime/test/testdata/ort_github_issue_26272_dds.onnx new file mode 100644 index 0000000000000..371f99c537898 --- /dev/null +++ b/onnxruntime/test/testdata/ort_github_issue_26272_dds.onnx @@ -0,0 +1,28 @@ + +:“ +( +datanonzeros nonzeros_node"NonZero +1 +nonzeros +nonzeros_ttranspose_node" Transpose +3 +data + +nonzeros_toutput gathernd_node"GatherND +test_graphZ +data + +d1 +d2b +output +  +nzrj +nonzeros + + +nzrj + +nonzeros_t + +nzr +B \ No newline at end of file From f0015b956723ec98efa8292a0628f99a5b6e8b3d Mon Sep 17 00:00:00 2001 From: Jiajia Qin Date: Wed, 15 Oct 2025 16:52:09 +0800 Subject: [PATCH 18/19] [webgpu] And int64 to cast (#25610) This pull request extends the WebGPU execution provider to support int64 data type casting in the `Cast` operator, with conditional support based on whether graph capture is enabled. It refactors kernel registration to allow toggling int64 support and updates the shader code and kernel logic to handle int64 tensors efficiently. 
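WGSL has no native 64-bit integer type, so the WebGPU EP stores each int64 element as a pair of 32-bit words: the low word carries the value bits and the high word is the sign extension, which is what the `Int64` case of `SetByOffsetImpl` in `shader_variable.cc` emits. A minimal sketch of that encoding, in Python for illustration only (the helper names are not part of the ORT codebase):

```python
# Illustrative only: mirrors the low-word / sign-extended-high-word layout the
# WebGPU EP uses for int64 values. Not ORT code.
def pack_int64(v: int) -> tuple[int, int]:
    bits = v & 0xFFFFFFFFFFFFFFFF           # two's-complement view of the 64-bit value
    return bits & 0xFFFFFFFF, bits >> 32    # (low word, high word)

def cast_int64_to_int32(low: int, high: int) -> int:
    # Truncating conversion back to a signed 32-bit value (standard two's-complement narrowing).
    return low - 0x100000000 if low & 0x80000000 else low

assert pack_int64(-1) == (0xFFFFFFFF, 0xFFFFFFFF)  # negative values sign-extend the high word
assert cast_int64_to_int32(*pack_int64(-1)) == -1
```
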
It's part of the work to enable graph capture in phi4 https://github.com/microsoft/onnxruntime/pull/25868 --- .../core/providers/webgpu/shader_variable.cc | 2 +- .../core/providers/webgpu/tensor/cast.cc | 154 ++++++++++-------- .../core/providers/webgpu/tensor/cast.h | 9 +- .../webgpu/webgpu_execution_provider.cc | 28 ++-- 4 files changed, 114 insertions(+), 79 deletions(-) diff --git a/onnxruntime/core/providers/webgpu/shader_variable.cc b/onnxruntime/core/providers/webgpu/shader_variable.cc index 5998c22a0d2ca..aa1f6c9a0ec0b 100644 --- a/onnxruntime/core/providers/webgpu/shader_variable.cc +++ b/onnxruntime/core/providers/webgpu/shader_variable.cc @@ -378,7 +378,7 @@ std::string ShaderVariableHelper::SetByOffsetImpl(std::string_view offset, std:: ORT_THROW("Invalid type"); break; case onnxruntime::webgpu::ProgramVariableDataType::Int64: - ss << name_ << "[" << offset << "]=vec2(u32(" << value << "), select(0u, 0xFFFFFFFFu, " << value << " < 0));"; + ss << name_ << "[" << offset << "]=vec2(u32(" << value << "), select(0u, 0xFFFFFFFFu, i32(" << value << ") < 0));"; break; case onnxruntime::webgpu::ProgramVariableDataType::Uint64: ss << name_ << "[" << offset << "]=vec2(u32(" << value << "), 0u);"; diff --git a/onnxruntime/core/providers/webgpu/tensor/cast.cc b/onnxruntime/core/providers/webgpu/tensor/cast.cc index 313a96ba25509..daf4aa323c12e 100644 --- a/onnxruntime/core/providers/webgpu/tensor/cast.cc +++ b/onnxruntime/core/providers/webgpu/tensor/cast.cc @@ -11,75 +11,29 @@ namespace onnxruntime { namespace webgpu { namespace { -const std::vector& CastOpTypeConstraints() { - // currently support boolean, integer and float types that explicitly allowed in WGSL: +const std::vector& CastOpTypeConstraints(bool enable_graph_capture) { + // Base types that are always supported - boolean, integer and float types that explicitly allowed in WGSL: // https://gpuweb.github.io/gpuweb/wgsl/#plain-types-section - // - static std::vector types{ + static std::vector base_types{ DataTypeImpl::GetTensorType(), DataTypeImpl::GetTensorType(), DataTypeImpl::GetTensorType(), DataTypeImpl::GetTensorType(), DataTypeImpl::GetTensorType()}; - return types; + + if (enable_graph_capture) { + static std::vector types_with_int64 = []() { + auto types = base_types; + types.push_back(DataTypeImpl::GetTensorType()); + return types; + }(); + return types_with_int64; + } else { + return base_types; + } } } // namespace -ONNX_OPERATOR_VERSIONED_KERNEL_EX( - Cast, - kOnnxDomain, - 6, 8, - kWebGpuExecutionProvider, - (*KernelDefBuilder::Create()) - .TypeConstraint("T1", CastOpTypeConstraints()) - .TypeConstraint("T2", CastOpTypeConstraints()), - Cast); -ONNX_OPERATOR_VERSIONED_KERNEL_EX( - Cast, - kOnnxDomain, - 9, 12, - kWebGpuExecutionProvider, - (*KernelDefBuilder::Create()) - .TypeConstraint("T1", CastOpTypeConstraints()) - .TypeConstraint("T2", CastOpTypeConstraints()), - Cast); -ONNX_OPERATOR_VERSIONED_KERNEL_EX( - Cast, - kOnnxDomain, - 13, 18, - kWebGpuExecutionProvider, - (*KernelDefBuilder::Create()) - .TypeConstraint("T1", CastOpTypeConstraints()) - .TypeConstraint("T2", CastOpTypeConstraints()), - Cast); -ONNX_OPERATOR_VERSIONED_KERNEL_EX( - Cast, - kOnnxDomain, - 19, 20, - kWebGpuExecutionProvider, - (*KernelDefBuilder::Create()) - .TypeConstraint("T1", CastOpTypeConstraints()) - .TypeConstraint("T2", CastOpTypeConstraints()), - Cast); -ONNX_OPERATOR_VERSIONED_KERNEL_EX( - Cast, - kOnnxDomain, - 21, 22, - kWebGpuExecutionProvider, - (*KernelDefBuilder::Create()) - .TypeConstraint("T1", CastOpTypeConstraints()) 
- .TypeConstraint("T2", CastOpTypeConstraints()), - Cast); -ONNX_OPERATOR_KERNEL_EX( - Cast, - kOnnxDomain, - 23, - kWebGpuExecutionProvider, - (*KernelDefBuilder::Create()) - .TypeConstraint("T1", CastOpTypeConstraints()) - .TypeConstraint("T2", CastOpTypeConstraints()), - Cast); - Status Cast::ComputeInternal(ComputeContext& context) const { const auto* input_tensor = context.Input(0); auto* output_tensor = context.Output(0, input_tensor->Shape()); @@ -87,12 +41,17 @@ Status Cast::ComputeInternal(ComputeContext& context) const { if (size == 0) { return Status::OK(); } + bool is_from_int64 = input_tensor->DataType() == DataTypeImpl::GetType(); + const int in_components = is_from_int64 ? 1 : 4; + const int out_components = to_ == ONNX_NAMESPACE::TensorProto_DataType_INT64 ? 1 : 4; uint32_t vec_size = onnxruntime::narrow((size + 3) / 4); + uint32_t in_vec_size = onnxruntime::narrow(in_components == 1 ? size : vec_size); + uint32_t out_vec_size = onnxruntime::narrow(out_components == 1 ? size : vec_size); - CastProgram program{to_}; + CastProgram program{to_, is_from_int64}; program - .AddInput({input_tensor, ProgramTensorMetadataDependency::Type, {vec_size}, 4}) - .AddOutput({output_tensor, ProgramTensorMetadataDependency::None, {vec_size}, 4}) + .AddInput({input_tensor, ProgramTensorMetadataDependency::Type, {in_vec_size}, in_components}) + .AddOutput({output_tensor, ProgramTensorMetadataDependency::None, {out_vec_size}, out_components}) .SetDispatchGroupSize((vec_size + WORKGROUP_SIZE - 1) / WORKGROUP_SIZE) .AddUniformVariables({ {static_cast(vec_size)}, @@ -121,15 +80,78 @@ Status CastProgram::GenerateShaderCode(ShaderHelper& sh) const { case ONNX_NAMESPACE::TensorProto_DataType_BOOL: expression = "vec4(a)"; break; + case ONNX_NAMESPACE::TensorProto_DataType_INT64: + expression = "int32(a)"; + break; default: ORT_NOT_IMPLEMENTED("Cast to type ", to_, " is not supported."); } - sh.MainFunctionBody() << sh.GuardAgainstOutOfBoundsWorkgroupSizes("uniforms.vec_size") - << " let a = " << input.GetByOffset("global_idx") << ";\n " - << output.SetByOffset("global_idx", expression); + + sh.MainFunctionBody() << sh.GuardAgainstOutOfBoundsWorkgroupSizes("uniforms.vec_size"); + if (is_from_int64_) { + sh.MainFunctionBody() << " let a0 = " << input.GetByOffset("global_idx * 4") << ";\n" + << " let a1 = " << input.GetByOffset("global_idx * 4 + 1") << ";\n" + << " let a2 = " << input.GetByOffset("global_idx * 4 + 2") << ";\n" + << " let a3 = " << input.GetByOffset("global_idx * 4 + 3") << ";\n" + << " let a = vec4(a0, a1, a2, a3);\n"; + } else { + sh.MainFunctionBody() << " let a = " << input.GetByOffset("global_idx") << ";\n"; + } + if (to_ == ONNX_NAMESPACE::TensorProto_DataType_INT64) { + sh.MainFunctionBody() << output.SetByOffset("global_idx * 4", "a.x") << "\n" + << output.SetByOffset("global_idx * 4 + 1", "a.y") << "\n" + << output.SetByOffset("global_idx * 4 + 2", "a.z") << "\n" + << output.SetByOffset("global_idx * 4 + 3", "a.w") << "\n"; + } else { + sh.MainFunctionBody() << output.SetByOffset("global_idx", expression); + } return Status::OK(); } +template +KernelCreateInfo CreateCastKernelInfo(bool enable_graph_capture) { + const auto& type_constraints = CastOpTypeConstraints(enable_graph_capture); + + KernelCreateFn kernel_create_fn = [](FuncManager&, const OpKernelInfo& info, std::unique_ptr& out) -> Status { + out = std::make_unique(info); + return Status::OK(); + }; + + if constexpr (StartVersion == EndVersion) { + // Non-versioned kernel + return { + KernelDefBuilder() + .SetName("Cast") 
+ .SetDomain(kOnnxDomain) + .SinceVersion(StartVersion) + .Provider(kWebGpuExecutionProvider) + .TypeConstraint("T1", type_constraints) + .TypeConstraint("T2", type_constraints) + .Build(), + kernel_create_fn}; + } else { + // Versioned kernel + return { + KernelDefBuilder() + .SetName("Cast") + .SetDomain(kOnnxDomain) + .SinceVersion(StartVersion, EndVersion) + .Provider(kWebGpuExecutionProvider) + .TypeConstraint("T1", type_constraints) + .TypeConstraint("T2", type_constraints) + .Build(), + kernel_create_fn}; + } +} + +// Explicit template instantiations +template KernelCreateInfo CreateCastKernelInfo<6, 8>(bool); +template KernelCreateInfo CreateCastKernelInfo<9, 12>(bool); +template KernelCreateInfo CreateCastKernelInfo<13, 18>(bool); +template KernelCreateInfo CreateCastKernelInfo<19, 20>(bool); +template KernelCreateInfo CreateCastKernelInfo<21, 22>(bool); +template KernelCreateInfo CreateCastKernelInfo<23>(bool); + } // namespace webgpu } // namespace onnxruntime diff --git a/onnxruntime/core/providers/webgpu/tensor/cast.h b/onnxruntime/core/providers/webgpu/tensor/cast.h index 925cd200f0aba..7dfb50e3241c8 100644 --- a/onnxruntime/core/providers/webgpu/tensor/cast.h +++ b/onnxruntime/core/providers/webgpu/tensor/cast.h @@ -3,6 +3,8 @@ #pragma once +#include "core/framework/kernel_registry.h" +#include "core/framework/op_kernel.h" #include "core/providers/webgpu/webgpu_kernel.h" namespace onnxruntime { @@ -10,7 +12,7 @@ namespace webgpu { class CastProgram final : public Program { public: - CastProgram(int32_t to) : Program{"Cast"}, to_{to} {} + CastProgram(int32_t to, bool is_from_int64) : Program{"Cast"}, to_{to}, is_from_int64_{is_from_int64} {} Status GenerateShaderCode(ShaderHelper& sh) const override; @@ -18,6 +20,7 @@ class CastProgram final : public Program { private: int32_t to_; + bool is_from_int64_; }; class Cast final : public WebGpuKernel { @@ -37,5 +40,9 @@ class Cast final : public WebGpuKernel { int32_t to_; }; +// Create Cast kernel info with appropriate type constraints based on graph capture support +template +KernelCreateInfo CreateCastKernelInfo(bool enable_graph_capture); + } // namespace webgpu } // namespace onnxruntime diff --git a/onnxruntime/core/providers/webgpu/webgpu_execution_provider.cc b/onnxruntime/core/providers/webgpu/webgpu_execution_provider.cc index bbb3fbdd221d3..0f7607ac1dbfe 100644 --- a/onnxruntime/core/providers/webgpu/webgpu_execution_provider.cc +++ b/onnxruntime/core/providers/webgpu/webgpu_execution_provider.cc @@ -28,6 +28,7 @@ #include "core/providers/webgpu/data_transfer.h" #include "core/providers/webgpu/external_data_loader.h" #include "core/providers/webgpu/webgpu_profiler.h" +#include "core/providers/webgpu/tensor/cast.h" namespace onnxruntime { @@ -417,7 +418,7 @@ class ONNX_OPERATOR_VERSIONED_KERNEL_CLASS_NAME(kWebGpuExecutionProvider, kOnnxD class ONNX_OPERATOR_VERSIONED_KERNEL_CLASS_NAME(kWebGpuExecutionProvider, kOnnxDomain, 16, 17, ScatterND); class ONNX_OPERATOR_KERNEL_CLASS_NAME(kWebGpuExecutionProvider, kOnnxDomain, 18, ScatterND); -std::unique_ptr RegisterKernels() { +std::unique_ptr RegisterKernels(bool enable_graph_capture = false) { auto kernel_registry = std::make_unique(); static const BuildKernelCreateInfoFn function_table[] = { @@ -464,13 +465,6 @@ std::unique_ptr RegisterKernels() { KERNEL_CREATE_INFO(13, Tanh), KERNEL_CREATE_INFO(1, Not), - KERNEL_CREATE_INFO_VERSIONED(6, 8, Cast), - KERNEL_CREATE_INFO_VERSIONED(9, 12, Cast), - KERNEL_CREATE_INFO_VERSIONED(13, 18, Cast), - KERNEL_CREATE_INFO_VERSIONED(19, 20, 
Cast), - KERNEL_CREATE_INFO_VERSIONED(21, 22, Cast), - KERNEL_CREATE_INFO(23, Cast), - // // activations BuildKernelCreateInfo, BuildKernelCreateInfo, @@ -771,6 +765,14 @@ std::unique_ptr RegisterKernels() { } } + // Register Cast kernels with conditional int64 support based on graph capture + ORT_THROW_IF_ERROR(kernel_registry->Register(CreateCastKernelInfo<6, 8>(enable_graph_capture))); + ORT_THROW_IF_ERROR(kernel_registry->Register(CreateCastKernelInfo<9, 12>(enable_graph_capture))); + ORT_THROW_IF_ERROR(kernel_registry->Register(CreateCastKernelInfo<13, 18>(enable_graph_capture))); + ORT_THROW_IF_ERROR(kernel_registry->Register(CreateCastKernelInfo<19, 20>(enable_graph_capture))); + ORT_THROW_IF_ERROR(kernel_registry->Register(CreateCastKernelInfo<21, 22>(enable_graph_capture))); + ORT_THROW_IF_ERROR(kernel_registry->Register(CreateCastKernelInfo<23>(enable_graph_capture))); + #ifndef DISABLE_CONTRIB_OPS Status status = ::onnxruntime::contrib::webgpu::RegisterWebGpuContribKernels(*kernel_registry); ORT_ENFORCE(status.IsOK(), "Failed to register WebGPU contrib kernels: " + status.ErrorMessage()); @@ -869,9 +871,13 @@ std::vector> WebGpuExecutionProvider::GetCapa } std::shared_ptr WebGpuExecutionProvider::GetKernelRegistry() const { - static std::shared_ptr registry = webgpu::RegisterKernels(); - - return registry; + if (enable_graph_capture_) { + static std::shared_ptr registry = webgpu::RegisterKernels(true); + return registry; + } else { + static std::shared_ptr registry = webgpu::RegisterKernels(false); + return registry; + } } std::unique_ptr WebGpuExecutionProvider::GetDataTransfer() const { From 036fde03d0fff715091a430d3fa44e4383defea3 Mon Sep 17 00:00:00 2001 From: Tianlei Wu Date: Wed, 15 Oct 2025 14:40:48 -0700 Subject: [PATCH 19/19] [Build] Lock torch, onnxscript and onnx-ir versions to latest (#26315) To fix build pipeline error `ModuleNotFoundError: No module named 'onnxscript._framework_apis.torch_2_9'` after recent torch 2.9 release. This locks torch version to 2.8, and also updates onnxscript and onnx-ir to latest versions. I locked torchvision version since it is usually installed with torch together. If torch and torchvision are not compatible, there might be errors in transformers script. --- tools/ci_build/build.py | 2 +- .../ci_build/github/linux/docker/scripts/requirements.txt | 4 ++-- tools/ci_build/github/linux/python/requirements.txt | 4 ++-- tools/ci_build/github/windows/python/requirements.txt | 4 ++-- .../requirements/transformers-test/requirements.txt | 7 ++++--- 5 files changed, 11 insertions(+), 10 deletions(-) diff --git a/tools/ci_build/build.py b/tools/ci_build/build.py index 8a72ab70cc67d..591be538ac873 100644 --- a/tools/ci_build/build.py +++ b/tools/ci_build/build.py @@ -1751,7 +1751,7 @@ def run_onnxruntime_tests(args, source_dir, ctest_path, build_dir, configs): # Install cpu only version of torch when cuda is not enabled in Linux. 
extra = [] if args.use_cuda and is_linux() else ["--index-url", "https://download.pytorch.org/whl/cpu"] run_subprocess( - [sys.executable, "-m", "pip", "install", "torch", *extra], + [sys.executable, "-m", "pip", "install", "torch==2.8.0", "torchvision==0.23.0", *extra], cwd=cwd, dll_path=dll_path, python_path=python_path, diff --git a/tools/ci_build/github/linux/docker/scripts/requirements.txt b/tools/ci_build/github/linux/docker/scripts/requirements.txt index 2fc034d9c5ca2..c19c0170291e6 100644 --- a/tools/ci_build/github/linux/docker/scripts/requirements.txt +++ b/tools/ci_build/github/linux/docker/scripts/requirements.txt @@ -10,5 +10,5 @@ sympy==1.14 flatbuffers protobuf==4.25.1 packaging -onnxscript==0.3.2 -onnx-ir +onnxscript==0.5.3 +onnx-ir==0.1.10 diff --git a/tools/ci_build/github/linux/python/requirements.txt b/tools/ci_build/github/linux/python/requirements.txt index 293aa49823d48..3ddce9cc0ec31 100644 --- a/tools/ci_build/github/linux/python/requirements.txt +++ b/tools/ci_build/github/linux/python/requirements.txt @@ -8,7 +8,7 @@ protobuf==4.25.1 sympy==1.14 flatbuffers psutil -onnxscript==0.3.2 -onnx-ir +onnxscript==0.5.3 +onnx-ir==0.1.10 jinja2 markupsafe diff --git a/tools/ci_build/github/windows/python/requirements.txt b/tools/ci_build/github/windows/python/requirements.txt index b48f6c3c2784d..bb307a20d7f18 100644 --- a/tools/ci_build/github/windows/python/requirements.txt +++ b/tools/ci_build/github/windows/python/requirements.txt @@ -8,8 +8,8 @@ protobuf==4.25.1 sympy==1.14 flatbuffers psutil -onnxscript==0.3.2 -onnx-ir +onnxscript==0.5.3 +onnx-ir==0.1.10 jinja2 markupsafe semver diff --git a/tools/ci_build/requirements/transformers-test/requirements.txt b/tools/ci_build/requirements/transformers-test/requirements.txt index bcd5a434c58e8..21894c2ba003d 100644 --- a/tools/ci_build/requirements/transformers-test/requirements.txt +++ b/tools/ci_build/requirements/transformers-test/requirements.txt @@ -3,12 +3,13 @@ packaging # protobuf and numpy is same as tools/ci_build/github/linux/docker/scripts/manylinux/requirements.txt protobuf==4.25.1 numpy==2.2.6 -torch>=2.6.0 +torch==2.8.0 +torchvision==0.23.0 coloredlogs==15.0 transformers==4.52.1 parameterized>=0.8.1 sentencepiece psutil einops -onnxscript==0.3.2 -onnx-ir +onnxscript==0.5.3 +onnx-ir==0.1.10
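
As a quick follow-up check, one could confirm that an environment actually resolved to the versions these requirements files now pin; the snippet below is only an assumed convenience for local verification, not part of the CI changes:

```python
# Hypothetical sanity check: compare installed versions against the pins added above
# (torch 2.8.0, torchvision 0.23.0, onnxscript 0.5.3, onnx-ir 0.1.10).
from importlib.metadata import PackageNotFoundError, version

PINS = {"torch": "2.8.0", "torchvision": "0.23.0", "onnxscript": "0.5.3", "onnx-ir": "0.1.10"}

for dist, expected in PINS.items():
    try:
        installed = version(dist)
    except PackageNotFoundError:
        print(f"{dist}: not installed (expected {expected})")
        continue
    status = "OK" if installed == expected else f"mismatch (expected {expected})"
    print(f"{dist}: {installed} {status}")
```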