From a2e46f4a20e975d29a960712f6388c8671622140 Mon Sep 17 00:00:00 2001 From: Guenther Schmuelling Date: Fri, 10 Oct 2025 00:25:09 -0700 Subject: [PATCH 01/19] fix gather_nd on webgpu ep (#26270) fixes gather_nd on webgpu ep (found by transformers.js for the vision encoder of docling) --- onnxruntime/core/providers/webgpu/tensor/gather_nd.cc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/onnxruntime/core/providers/webgpu/tensor/gather_nd.cc b/onnxruntime/core/providers/webgpu/tensor/gather_nd.cc index 7c3aced3f0295..cab1dc03848b9 100644 --- a/onnxruntime/core/providers/webgpu/tensor/gather_nd.cc +++ b/onnxruntime/core/providers/webgpu/tensor/gather_nd.cc @@ -43,7 +43,7 @@ Status GatherNDProgram::GenerateShaderCode(ShaderHelper& shader) const { data_dim += indices_innerest_dim_; for (uint32_t i = 0; i < static_cast(data.Rank() - data_dim); i++) { - shader.MainFunctionBody() << " " << data.IndicesSet("data_indices", data_dim, output.IndicesGet("output_indices", indices.Rank() - 1 + i)) << "\n"; + shader.MainFunctionBody() << " " << data.IndicesSet("data_indices", data_dim + i, output.IndicesGet("output_indices", indices.Rank() - 1 + i)) << "\n"; } shader.MainFunctionBody() << " " << output.SetByOffset("global_idx", data.GetByIndices("data_indices")); From 8d4c5baf52f0cd974c470da8eb3a7cf64e072cb8 Mon Sep 17 00:00:00 2001 From: qti-hungjuiw Date: Fri, 10 Oct 2025 23:28:51 +0800 Subject: [PATCH 02/19] Make local mirror of cmake dependencies configurable (#26042) ### Description - Added support for the `--cmake_deps_mirror_dir` option to allow users to specify a custom local directory for CMake dependencies. - Improved logging to show the source of `FetchContent` in CMake. ### Motivation and Context - Previously, ONNX Runtime searched for CMake dependencies only in the default `/mirror` directory. - This change enables users to configure an alternative location for storing CMake dependencies, offering greater flexibility in build environments. --- cmake/CMakeLists.txt | 7 +++++-- cmake/external/helper_functions.cmake | 6 +++--- cmake/external/onnxruntime_external_deps.cmake | 2 +- tools/ci_build/build.py | 3 +++ tools/ci_build/build_args.py | 1 + 5 files changed, 13 insertions(+), 6 deletions(-) diff --git a/cmake/CMakeLists.txt b/cmake/CMakeLists.txt index 116d369885a27..8186da507a442 100644 --- a/cmake/CMakeLists.txt +++ b/cmake/CMakeLists.txt @@ -287,9 +287,13 @@ if (onnxruntime_ENABLE_TRAINING_APIS) endif() -# Single output director for all binaries +# Single output directory for all binaries set(RUNTIME_OUTPUT_DIRECTORY ${CMAKE_BINARY_DIR}/bin CACHE PATH "Single output directory for all binaries.") +# Local mirror directory of cmake dependencies +set(REPO_ROOT ${PROJECT_SOURCE_DIR}/..) +set(onnxruntime_CMAKE_DEPS_MIRROR_DIR ${REPO_ROOT}/mirror CACHE PATH "Path to the local mirror of cmake dependencies") + include(FetchContent) @@ -425,7 +429,6 @@ if (onnxruntime_EXTENDED_MINIMAL_BUILD AND NOT onnxruntime_MINIMAL_BUILD) set(onnxruntime_MINIMAL_BUILD ON) endif() -set(REPO_ROOT ${PROJECT_SOURCE_DIR}/..) set(ONNXRUNTIME_ROOT ${PROJECT_SOURCE_DIR}/../onnxruntime) set(ORTTRAINING_ROOT ${PROJECT_SOURCE_DIR}/../orttraining) set(ORTTRAINING_SOURCE_DIR ${ORTTRAINING_ROOT}/orttraining) diff --git a/cmake/external/helper_functions.cmake b/cmake/external/helper_functions.cmake index 55059b9500a8e..e8044411e4201 100644 --- a/cmake/external/helper_functions.cmake +++ b/cmake/external/helper_functions.cmake @@ -4,11 +4,11 @@ # 2. 
Set the cmake property COMPILE_WARNING_AS_ERROR to OFF for these external projects. function(onnxruntime_fetchcontent_declare contentName) + cmake_parse_arguments(PARSE_ARGV 1 ARG "" "URL;SOURCE_SUBDIR" "") + message(STATUS "Fetch ${contentName} from ${ARG_URL}") FetchContent_Declare(${ARGV}) string(TOLOWER ${contentName} contentNameLower) - list(FIND ARGN SOURCE_SUBDIR index_SOURCE_SUBDIR) - if(index_SOURCE_SUBDIR GREATER_EQUAL 0) - cmake_parse_arguments(PARSE_ARGV 1 ARG "" "SOURCE_SUBDIR" "") + if(NOT "${ARG_SOURCE_SUBDIR}" STREQUAL "") set(onnxruntime_${contentNameLower}_cmake_src_dir "${ARG_SOURCE_SUBDIR}" PARENT_SCOPE) endif() endfunction() diff --git a/cmake/external/onnxruntime_external_deps.cmake b/cmake/external/onnxruntime_external_deps.cmake index 8e1a880579b34..f32350ca755ea 100644 --- a/cmake/external/onnxruntime_external_deps.cmake +++ b/cmake/external/onnxruntime_external_deps.cmake @@ -20,7 +20,7 @@ foreach(ONNXRUNTIME_DEP IN LISTS ONNXRUNTIME_DEPS_LIST) if(ONNXRUNTIME_DEP_URL MATCHES "^https://") # Search a local mirror folder - string(REGEX REPLACE "^https://" "${REPO_ROOT}/mirror/" LOCAL_URL "${ONNXRUNTIME_DEP_URL}") + string(REGEX REPLACE "^https://" "${onnxruntime_CMAKE_DEPS_MIRROR_DIR}/" LOCAL_URL "${ONNXRUNTIME_DEP_URL}") if(EXISTS "${LOCAL_URL}") cmake_path(ABSOLUTE_PATH LOCAL_URL) diff --git a/tools/ci_build/build.py b/tools/ci_build/build.py index 327caf83c7850..54dd23b07a363 100644 --- a/tools/ci_build/build.py +++ b/tools/ci_build/build.py @@ -1015,6 +1015,9 @@ def generate_build_tree( if path_to_protoc_exe: cmake_args += [f"-DONNX_CUSTOM_PROTOC_EXECUTABLE={path_to_protoc_exe}"] + if args.cmake_deps_mirror_dir: + cmake_args += [f"-Donnxruntime_CMAKE_DEPS_MIRROR_DIR={args.cmake_deps_mirror_dir}"] + if args.fuzz_testing: if not ( args.build_shared_lib diff --git a/tools/ci_build/build_args.py b/tools/ci_build/build_args.py index c5454903474d1..05d5052067b2e 100644 --- a/tools/ci_build/build_args.py +++ b/tools/ci_build/build_args.py @@ -204,6 +204,7 @@ def add_testing_args(parser: argparse.ArgumentParser) -> None: help="Run onnx_test_runner against test data. Only used in ONNX Runtime's CI pipelines", ) parser.add_argument("--path_to_protoc_exe", help="Path to protoc executable.") + parser.add_argument("--cmake_deps_mirror_dir", help="Path to the local mirror of cmake dependencies.") parser.add_argument("--fuzz_testing", action="store_true", help="Enable Fuzz testing.") parser.add_argument( "--enable_symbolic_shape_infer_tests", From ac7f4b7763289bfb82fc3f5165d82895d6adde42 Mon Sep 17 00:00:00 2001 From: Wanming Lin Date: Sat, 11 Oct 2025 02:37:00 +0800 Subject: [PATCH 03/19] [WebNN] Remove constraints for Gemm's C input (#26273) Now WebNN implementation for gemm's C operand has supported unidirectional broadcasting, which is align with ONNX spec. Removing constraints for Gemm's C input as which should be covered in ORT kernel. --- js/web/docs/webnn-operators.md | 2 +- .../webnn/builders/impl/gemm_op_builder.cc | 23 ------------------- 2 files changed, 1 insertion(+), 24 deletions(-) diff --git a/js/web/docs/webnn-operators.md b/js/web/docs/webnn-operators.md index 295aacc6fffa3..ea88f291e5597 100644 --- a/js/web/docs/webnn-operators.md +++ b/js/web/docs/webnn-operators.md @@ -46,7 +46,7 @@ platforms. 
Check the [WebNN status](https://webmachinelearning.github.io/webnn-s | GatherElements | ai.onnx(11-12, 13+) | gatherElements | | | GatherND | ai.onnx(11, 12, 13+) | gatherND | Only supports 'batch_dims' == 0 | | Gelu | ai.onnx(20+) | gelu | | -| Gemm | ai.onnx(7-8, 9-10, 11-12, 13+) | gemm | Only supports 1-D 'C' input | +| Gemm | ai.onnx(7-8, 9-10, 11-12, 13+) | gemm | | | GlobalAveragePool | ai.onnx(7+) | averagePool2d | Only supports 4-D input | | GlobalMaxPool | ai.onnx(7+) | maxPool2d | Only supports 4-D input | | GlobalLpPool| ai.onnx(7+) | l2Pool2d | Only supports 4-D input, 'p' value is 2 | diff --git a/onnxruntime/core/providers/webnn/builders/impl/gemm_op_builder.cc b/onnxruntime/core/providers/webnn/builders/impl/gemm_op_builder.cc index 0ea927967d989..5a80f01c17236 100644 --- a/onnxruntime/core/providers/webnn/builders/impl/gemm_op_builder.cc +++ b/onnxruntime/core/providers/webnn/builders/impl/gemm_op_builder.cc @@ -250,29 +250,6 @@ bool GemmOpBuilder::IsOpSupportedImpl(const GraphViewer&, std::vector c_shape; if (!GetShape(*input_defs[c_idx], c_shape, logger)) return false; - - size_t c_dim = c_shape.size(); - - if (c_dim > 1) { - // TODO: Supports other shape of C. - // Currently WebNN implementation in Chromium only supports 1-D C. - return false; - } - if (c_dim == 0) { - LOGS(logger, VERBOSE) << "C of Gemm is a scalar"; - } else { - auto c_size = c_shape[c_dim - 1]; - NodeAttrHelper helper(node); - const auto transB = helper.Get("transB", 0); - if (c_size != (transB == 0 ? b_shape[1] : b_shape[0])) { - LOGS(logger, VERBOSE) << "C of Gemm must be a vector of b_shape[" - << (transB == 0 ? "1" : "0") << "]" - << " b_shape: [" << b_shape[0] << ", " << b_shape[1] << "]" - << " c_size: " << c_size; - - return false; - } - } } } From 1442fe00d9169e89ec4a52bd699952810244ba6b Mon Sep 17 00:00:00 2001 From: Xiaofei Han Date: Sat, 11 Oct 2025 11:40:56 +0800 Subject: [PATCH 04/19] fix np.testing argument order (#26128) ### Description The argument order of np.testing was incorrect. ### Motivation and Context Before, the expected result and the actual result are reversed. 
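For reference, a minimal illustration of why the order matters (array values below are made up for the sketch): `np.testing.assert_allclose(actual, desired, ...)` expects the computed result first and the reference value second, and the relative tolerance is taken against `desired`, so swapping the two changes what the check and any failure message refer to.

```python
import numpy as np

actual = np.array([[100.00001]], dtype=np.float32)  # value produced by the code under test
desired = np.array([[100.0]], dtype=np.float32)     # reference/expected value

# Per the numpy docs, the check is |actual - desired| <= atol + rtol * |desired|,
# i.e. the relative tolerance is scaled by the reference value, so the computed
# result must be passed first and the expected value second.
np.testing.assert_allclose(actual, desired, rtol=1e-05, atol=1e-08)
```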
image --- .../python/tools/tensorrt/perf/benchmark.py | 2 +- .../test/python/onnx_backend_test_series.py | 8 ++-- .../test/python/onnxruntime_test_python.py | 40 +++++++++---------- .../python/onnxruntime_test_python_autoep.py | 16 ++++---- .../python/onnxruntime_test_python_backend.py | 2 +- .../onnxruntime_test_python_backend_mlops.py | 8 ++-- .../onnxruntime_test_python_cudagraph.py | 20 +++++----- .../onnxruntime_test_python_dmlgraph.py | 18 ++++----- .../python/onnxruntime_test_python_mlops.py | 14 +++---- ...me_test_python_nv_tensorrt_rtx_ep_tests.py | 6 +-- .../test/python/quantization/test_fusions.py | 2 +- .../quantization/test_qdq_loss_debug.py | 2 +- .../test_quantizeblockwise_bnb4.py | 4 +- .../custom_op_test_local_function.py | 2 +- 14 files changed, 72 insertions(+), 72 deletions(-) diff --git a/onnxruntime/python/tools/tensorrt/perf/benchmark.py b/onnxruntime/python/tools/tensorrt/perf/benchmark.py index d6b39a6b2aeb4..66ab0c44f8814 100644 --- a/onnxruntime/python/tools/tensorrt/perf/benchmark.py +++ b/onnxruntime/python/tools/tensorrt/perf/benchmark.py @@ -613,7 +613,7 @@ def validate(all_ref_outputs, all_outputs, rtol, atol, percent_mismatch): for ref_o, o in zip(ref_output, output, strict=False): # abs(desired-actual) < rtol * abs(desired) + atol try: - np.testing.assert_allclose(ref_o, o, rtol, atol) + np.testing.assert_allclose(o, ref_o, rtol, atol) except Exception as e: if percentage_in_allowed_threshold(e, percent_mismatch): continue diff --git a/onnxruntime/test/python/onnx_backend_test_series.py b/onnxruntime/test/python/onnx_backend_test_series.py index 72c6a5664f395..d2e9557f633b0 100644 --- a/onnxruntime/test/python/onnx_backend_test_series.py +++ b/onnxruntime/test/python/onnx_backend_test_series.py @@ -43,13 +43,13 @@ def assert_similar_outputs(cls, ref_outputs, outputs, rtol, atol, model_dir=None """ def assert_similar_array(ref_output, output): - np.testing.assert_equal(ref_output.dtype, output.dtype) + np.testing.assert_equal(output.dtype, ref_output.dtype) if ref_output.dtype == object: - np.testing.assert_array_equal(ref_output, output) + np.testing.assert_array_equal(output, ref_output) else: - np.testing.assert_allclose(ref_output, output, rtol=rtol, atol=atol) + np.testing.assert_allclose(output, ref_output, rtol=rtol, atol=atol) - np.testing.assert_equal(len(ref_outputs), len(outputs)) + np.testing.assert_equal(len(outputs), len(ref_outputs)) for i in range(len(outputs)): # pylint: disable=consider-using-enumerate if isinstance(outputs[i], list): for j in range(len(outputs[i])): diff --git a/onnxruntime/test/python/onnxruntime_test_python.py b/onnxruntime/test/python/onnxruntime_test_python.py index e44adcdb9827f..7f003453add89 100644 --- a/onnxruntime/test/python/onnxruntime_test_python.py +++ b/onnxruntime/test/python/onnxruntime_test_python.py @@ -54,7 +54,7 @@ def run_model(self, session_object, run_options): input_name = session_object.get_inputs()[0].name res = session_object.run([], {input_name: x}, run_options=run_options) output_expected = np.array([[1.0, 4.0], [9.0, 16.0], [25.0, 36.0]], dtype=np.float32) - np.testing.assert_allclose(output_expected, res[0], rtol=1e-05, atol=1e-08) + np.testing.assert_allclose(res[0], output_expected, rtol=1e-05, atol=1e-08) def run_model_with_input(self, session_object, input_name, input_value, iter_num, queue): for _ in range(iter_num): @@ -714,7 +714,7 @@ def test_run_model(self): res = sess.run([outputs[0].name], {inputs[0].name: x}) output_expected = np.array([[1.0, 4.0], [9.0, 16.0], [25.0, 36.0]], 
dtype=np.float32) - np.testing.assert_allclose(output_expected, res[0], rtol=1e-05, atol=1e-08) + np.testing.assert_allclose(res[0], output_expected, rtol=1e-05, atol=1e-08) def test_run_async(self): event = threading.Event() @@ -733,7 +733,7 @@ def callback(res: np.ndarray, data: MyData, err: str) -> None: self.assertEqual(len(err), 0) self.assertEqual(len(res), 1) self.assertEqual(data.get_id(), 123456) - np.testing.assert_allclose(output_expected, res[0], rtol=1e-05, atol=1e-08) + np.testing.assert_allclose(res[0], output_expected, rtol=1e-05, atol=1e-08) event.set() so = onnxrt.SessionOptions() @@ -762,7 +762,7 @@ def test_run_model_from_bytes(self): self.assertEqual(output_shape, [3, 2]) res = sess.run([output_name], {input_name: x}) output_expected = np.array([[1.0, 4.0], [9.0, 16.0], [25.0, 36.0]], dtype=np.float32) - np.testing.assert_allclose(output_expected, res[0], rtol=1e-05, atol=1e-08) + np.testing.assert_allclose(res[0], output_expected, rtol=1e-05, atol=1e-08) def test_run_model2(self): sess = onnxrt.InferenceSession(get_name("matmul_1.onnx"), providers=onnxrt.get_available_providers()) @@ -777,7 +777,7 @@ def test_run_model2(self): self.assertEqual(output_shape, [3, 1]) res = sess.run([output_name], {input_name: x}) output_expected = np.array([[5.0], [11.0], [17.0]], dtype=np.float32) - np.testing.assert_allclose(output_expected, res[0], rtol=1e-05, atol=1e-08) + np.testing.assert_allclose(res[0], output_expected, rtol=1e-05, atol=1e-08) def test_run_model2_contiguous(self): sess = onnxrt.InferenceSession(get_name("matmul_1.onnx"), providers=onnxrt.get_available_providers()) @@ -792,10 +792,10 @@ def test_run_model2_contiguous(self): self.assertEqual(output_shape, [3, 1]) res = sess.run([output_name], {input_name: x}) output_expected = np.array([[5.0], [11.0], [17.0]], dtype=np.float32) - np.testing.assert_allclose(output_expected, res[0], rtol=1e-05, atol=1e-08) + np.testing.assert_allclose(res[0], output_expected, rtol=1e-05, atol=1e-08) xcontiguous = np.ascontiguousarray(x) rescontiguous = sess.run([output_name], {input_name: xcontiguous}) - np.testing.assert_allclose(output_expected, rescontiguous[0], rtol=1e-05, atol=1e-08) + np.testing.assert_allclose(rescontiguous[0], output_expected, rtol=1e-05, atol=1e-08) def test_run_model_multiple_threads(self): # Skip this test for a "pure" DML onnxruntime python wheel. 
@@ -860,14 +860,14 @@ def test_list_as_input(self): input_name = sess.get_inputs()[0].name res = sess.run([], {input_name: x.tolist()}) output_expected = np.array([[1.0, 4.0], [9.0, 16.0], [25.0, 36.0]], dtype=np.float32) - np.testing.assert_allclose(output_expected, res[0], rtol=1e-05, atol=1e-08) + np.testing.assert_allclose(res[0], output_expected, rtol=1e-05, atol=1e-08) def test_string_list_as_input(self): sess = onnxrt.InferenceSession(get_name("identity_string.onnx"), providers=available_providers_without_tvm) x = np.array(["this", "is", "identity", "test"], dtype=str).reshape((2, 2)) x_name = sess.get_inputs()[0].name res = sess.run([], {x_name: x.tolist()}) - np.testing.assert_equal(x, res[0]) + np.testing.assert_equal(res[0], x) def test_run_device(self): device = onnxrt.get_device() @@ -888,7 +888,7 @@ def test_run_model_symbolic_input(self): self.assertEqual(output_shape, ["None", 1]) res = sess.run([output_name], {input_name: x}) output_expected = np.array([[5.0], [11.0], [17.0]], dtype=np.float32) - np.testing.assert_allclose(output_expected, res[0], rtol=1e-05, atol=1e-08) + np.testing.assert_allclose(res[0], output_expected, rtol=1e-05, atol=1e-08) def test_boolean_inputs(self): sess = onnxrt.InferenceSession(get_name("logicaland.onnx"), providers=available_providers) @@ -920,7 +920,7 @@ def test_boolean_inputs(self): output_expected = np.array([[True, False], [False, False]], dtype=bool) res = sess.run([output_name], {a_name: a, b_name: b}) - np.testing.assert_equal(output_expected, res[0]) + np.testing.assert_equal(res[0], output_expected) def test_string_input1(self): sess = onnxrt.InferenceSession(get_name("identity_string.onnx"), providers=available_providers_without_tvm) @@ -941,7 +941,7 @@ def test_string_input1(self): self.assertEqual(output_type, "tensor(string)") res = sess.run([output_name], {x_name: x}) - np.testing.assert_equal(x, res[0]) + np.testing.assert_equal(res[0], x) def test_string_input2(self): sess = onnxrt.InferenceSession(get_name("identity_string.onnx"), providers=available_providers_without_tvm) @@ -962,7 +962,7 @@ def test_string_input2(self): self.assertEqual(output_type, "tensor(string)") res = sess.run([output_name], {x_name: x}) - np.testing.assert_equal(x, res[0]) + np.testing.assert_equal(res[0], x) def test_input_bytes(self): sess = onnxrt.InferenceSession(get_name("identity_string.onnx"), providers=available_providers_without_tvm) @@ -983,7 +983,7 @@ def test_input_bytes(self): self.assertEqual(output_type, "tensor(string)") res = sess.run([output_name], {x_name: x}) - np.testing.assert_equal(x, res[0].astype("|S8")) + np.testing.assert_equal(res[0].astype("|S8"), x) def test_input_object(self): sess = onnxrt.InferenceSession(get_name("identity_string.onnx"), providers=available_providers_without_tvm) @@ -1004,7 +1004,7 @@ def test_input_object(self): self.assertEqual(output_type, "tensor(string)") res = sess.run([output_name], {x_name: x}) - np.testing.assert_equal(x, res[0]) + np.testing.assert_equal(res[0], x) def test_input_void(self): sess = onnxrt.InferenceSession(get_name("identity_string.onnx"), providers=available_providers_without_tvm) @@ -1029,7 +1029,7 @@ def test_input_void(self): res = sess.run([output_name], {x_name: x}) expr = np.array([["must", "have"], ["same", "size"]], dtype=object) - np.testing.assert_equal(expr, res[0]) + np.testing.assert_equal(res[0], expr) def test_raise_wrong_num_inputs(self): with self.assertRaises(ValueError) as context: @@ -1164,7 +1164,7 @@ def test_sequence_construct(self): }, ) - 
np.testing.assert_array_equal(output_expected, res[0]) + np.testing.assert_array_equal(res[0], output_expected) def test_sequence_insert(self): opt = onnxrt.SessionOptions() @@ -1194,7 +1194,7 @@ def test_sequence_insert(self): "input_seq": [], }, ) - np.testing.assert_array_equal(output_expected, res[0]) + np.testing.assert_array_equal(res[0], output_expected) def test_ort_execution_mode(self): opt = onnxrt.SessionOptions() @@ -1375,7 +1375,7 @@ def test_register_custom_ops_library(self): input_1 = np.zeros((3, 5)).astype(np.float32) res = sess1.run([output_name], {input_name_0: input_0, input_name_1: input_1}) output_expected = np.ones((3, 5)).astype(np.float32) - np.testing.assert_allclose(output_expected, res[0], rtol=1e-05, atol=1e-08) + np.testing.assert_allclose(res[0], output_expected, rtol=1e-05, atol=1e-08) # Create an alias of SessionOptions instance # We will use this alias to construct another InferenceSession @@ -1969,7 +1969,7 @@ def test_adater_export_read(self): self.assertTrue(value.is_tensor()) self.assertEqual(expected_val.element_type(), value.element_type()) self.assertEqual(expected_val.shape(), value.shape()) - np.testing.assert_allclose(expected_val.numpy(), value.numpy()) + np.testing.assert_allclose(value.numpy(), expected_val.numpy()) def test_run_with_adapter(self): model_path = get_name("lora/two_params_lora_model.onnx") diff --git a/onnxruntime/test/python/onnxruntime_test_python_autoep.py b/onnxruntime/test/python/onnxruntime_test_python_autoep.py index d66951bd66f3d..a24269a312e9b 100644 --- a/onnxruntime/test/python/onnxruntime_test_python_autoep.py +++ b/onnxruntime/test/python/onnxruntime_test_python_autoep.py @@ -66,7 +66,7 @@ def test_cuda_ep_register_and_inference(self): input_name = sess.get_inputs()[0].name res = sess.run([], {input_name: x}) output_expected = np.array([[1.0, 4.0], [9.0, 16.0], [25.0, 36.0]], dtype=np.float32) - np.testing.assert_allclose(output_expected, res[0], rtol=1e-05, atol=1e-08) + np.testing.assert_allclose(res[0], output_expected, rtol=1e-05, atol=1e-08) del sess # Delete session before unregistering library self.unregister_execution_provider_library(ep_name) @@ -98,7 +98,7 @@ def test_cuda_prefer_gpu_and_inference(self): input_name = sess.get_inputs()[0].name res = sess.run([], {input_name: x}) output_expected = np.array([[1.0, 4.0], [9.0, 16.0], [25.0, 36.0]], dtype=np.float32) - np.testing.assert_allclose(output_expected, res[0], rtol=1e-05, atol=1e-08) + np.testing.assert_allclose(res[0], output_expected, rtol=1e-05, atol=1e-08) del sess # Delete session before unregistering library self.unregister_execution_provider_library(ep_name) @@ -146,7 +146,7 @@ def my_delegate( input_name = sess.get_inputs()[0].name res = sess.run([], {input_name: x}) output_expected = np.array([[1.0, 4.0], [9.0, 16.0], [25.0, 36.0]], dtype=np.float32) - np.testing.assert_allclose(output_expected, res[0], rtol=1e-05, atol=1e-08) + np.testing.assert_allclose(res[0], output_expected, rtol=1e-05, atol=1e-08) del sess # Delete session before unregistering library self.unregister_execution_provider_library(ep_name) @@ -249,7 +249,7 @@ def test_example_plugin_ep_devices(self): input_name = sess.get_inputs()[0].name res = sess.run([], {input_name: x}) output_expected = np.array([[1.0, 4.0], [9.0, 16.0], [25.0, 36.0]], dtype=np.float32) - np.testing.assert_allclose(output_expected, res[0], rtol=1e-05, atol=1e-08) + np.testing.assert_allclose(res[0], output_expected, rtol=1e-05, atol=1e-08) del sess # Delete session before unregistering library 
self.unregister_execution_provider_library(ep_name) @@ -282,11 +282,11 @@ def test_example_plugin_ep_data_transfer(self): gpu_value = onnxrt.OrtValue.ortvalue_from_numpy(data, "gpu", 0, 0xBE57) # copy back to CPU cpu_data = gpu_value.numpy() - np.testing.assert_equal(data, cpu_data) + np.testing.assert_equal(cpu_data, data) gpu_value.update_inplace(data2) # update the fake GPU data cpu_data_2 = gpu_value.numpy() # copy back to CPU - np.testing.assert_equal(data2, cpu_data_2) + np.testing.assert_equal(cpu_data_2, data2) gpu_value = None # Delete OrtValue before unregistering library as the allocator will be destroyed. @@ -336,8 +336,8 @@ def test_copy_tensors(self): del b_device # Verify the contents - np.testing.assert_array_equal(a, a_cpu_copy.numpy()) - np.testing.assert_array_equal(b, b_cpu_copy.numpy()) + np.testing.assert_array_equal(a_cpu_copy.numpy(), a) + np.testing.assert_array_equal(b_cpu_copy.numpy(), b) self.unregister_execution_provider_library(ep_name) diff --git a/onnxruntime/test/python/onnxruntime_test_python_backend.py b/onnxruntime/test/python/onnxruntime_test_python_backend.py index 6ed7dfe59b1f6..416d9b6edecd1 100644 --- a/onnxruntime/test/python/onnxruntime_test_python_backend.py +++ b/onnxruntime/test/python/onnxruntime_test_python_backend.py @@ -19,7 +19,7 @@ def test_run_model(self): x = np.array([[1.0, 2.0], [3.0, 4.0], [5.0, 6.0]], dtype=np.float32) res = rep.run(x) output_expected = np.array([[1.0, 4.0], [9.0, 16.0], [25.0, 36.0]], dtype=np.float32) - np.testing.assert_allclose(output_expected, res[0], rtol=1e-05, atol=1e-08) + np.testing.assert_allclose(res[0], output_expected, rtol=1e-05, atol=1e-08) def test_allocation_plan_works_with_only_execute_path_to_fetches_option(self): """ diff --git a/onnxruntime/test/python/onnxruntime_test_python_backend_mlops.py b/onnxruntime/test/python/onnxruntime_test_python_backend_mlops.py index c245699e211d4..9e3c1acbc923b 100644 --- a/onnxruntime/test/python/onnxruntime_test_python_backend_mlops.py +++ b/onnxruntime/test/python/onnxruntime_test_python_backend_mlops.py @@ -23,8 +23,8 @@ def check_list_of_map_to_float(testcase, expected_rows, actual_rows): for i in range(num_rows): # use np.testing.assert_allclose so we can specify the tolerance np.testing.assert_allclose( - [expected_rows[i][key] for key in sorted_keys], [actual_rows[i][key] for key in sorted_keys], + [expected_rows[i][key] for key in sorted_keys], rtol=1e-05, atol=1e-07, ) @@ -37,7 +37,7 @@ def test_run_model_non_tensor(self): x = {0: 25.0, 1: 5.13, 2: 0.0, 3: 0.453, 4: 5.966} res = rep.run(x) output_expected = np.array([[49.752754]], dtype=np.float32) - np.testing.assert_allclose(output_expected, res[0], rtol=1e-05, atol=1e-08) + np.testing.assert_allclose(res[0], output_expected, rtol=1e-05, atol=1e-08) def test_run_model_proto(self): name = datasets.get_example("logreg_iris.onnx") @@ -47,7 +47,7 @@ def test_run_model_proto(self): x = np.array([[1.0, 2.0], [3.0, 4.0], [5.0, 6.0]], dtype=np.float32) res = rep.run(x) output_expected = np.array([0, 0, 0], dtype=np.float32) - np.testing.assert_allclose(output_expected, res[0], rtol=1e-05, atol=1e-08) + np.testing.assert_allclose(res[0], output_expected, rtol=1e-05, atol=1e-08) output_expected = [ {0: 0.950599730014801, 1: 0.027834169566631317, 2: 0.02156602405011654}, { @@ -72,7 +72,7 @@ def test_run_model_proto_api(self): outputs = ort_backend.run_model(model, inputs) output_expected = np.array([0, 0, 0], dtype=np.float32) - np.testing.assert_allclose(output_expected, outputs[0], rtol=1e-05, atol=1e-08) + 
np.testing.assert_allclose(outputs[0], output_expected, rtol=1e-05, atol=1e-08) output_expected = [ {0: 0.950599730014801, 1: 0.027834169566631317, 2: 0.02156602405011654}, { diff --git a/onnxruntime/test/python/onnxruntime_test_python_cudagraph.py b/onnxruntime/test/python/onnxruntime_test_python_cudagraph.py index 5ab2fe8939f6a..d6c1dd9cff3f3 100644 --- a/onnxruntime/test/python/onnxruntime_test_python_cudagraph.py +++ b/onnxruntime/test/python/onnxruntime_test_python_cudagraph.py @@ -63,18 +63,18 @@ class TestInferenceSessionWithCudaGraph(unittest.TestCase): def test_ort_value_update_in_place(self): x0 = np.array([[1.0, 2.0], [3.0, 4.0], [5.0, 6.0]], dtype=np.float32) ortvalue_cpu = onnxrt.OrtValue.ortvalue_from_numpy(x0) - np.testing.assert_allclose(x0, ortvalue_cpu.numpy()) + np.testing.assert_allclose(ortvalue_cpu.numpy(), x0) x1 = np.array([[10.0, 20.0], [30.0, 40.0], [50.0, 60.0]], dtype=np.float32) ortvalue_cpu.update_inplace(x1) - np.testing.assert_allclose(x1, ortvalue_cpu.numpy()) + np.testing.assert_allclose(ortvalue_cpu.numpy(), x1) if "CUDAExecutionProvider" in onnxrt.get_available_providers(): ortvalue_gpu = onnxrt.OrtValue.ortvalue_from_numpy(x0, "cuda", 0) - np.testing.assert_allclose(x0, ortvalue_gpu.numpy()) + np.testing.assert_allclose(ortvalue_gpu.numpy(), x0) ortvalue_gpu.update_inplace(x1) - np.testing.assert_allclose(x1, ortvalue_gpu.numpy()) + np.testing.assert_allclose(ortvalue_gpu.numpy(), x1) def test_select_ep_to_run_cuda_graph(self): if "TensorrtExecutionProvider" in onnxrt.get_available_providers(): @@ -105,11 +105,11 @@ def run_model_with_cuda_graph(self, providers): # One regular run for the necessary memory allocation and cuda graph capturing session.run_with_iobinding(io_binding, ro) expected_y = np.array([[5.0], [11.0], [17.0]] * INPUT_SIZE, dtype=np.float32) - np.testing.assert_allclose(expected_y, y_ortvalue.numpy(), rtol=1e-05, atol=1e-05) + np.testing.assert_allclose(y_ortvalue.numpy(), expected_y, rtol=1e-05, atol=1e-05) # After capturing, CUDA graph replay happens from this Run onwards session.run_with_iobinding(io_binding, ro) - np.testing.assert_allclose(expected_y, y_ortvalue.numpy(), rtol=1e-05, atol=1e-05) + np.testing.assert_allclose(y_ortvalue.numpy(), expected_y, rtol=1e-05, atol=1e-05) # Update input and then replay CUDA graph x_ortvalue.update_inplace( @@ -120,8 +120,8 @@ def run_model_with_cuda_graph(self, providers): ) session.run_with_iobinding(io_binding, ro) np.testing.assert_allclose( - np.array([[50.0], [110.0], [170.0]] * INPUT_SIZE, dtype=np.float32), y_ortvalue.numpy(), + np.array([[50.0], [110.0], [170.0]] * INPUT_SIZE, dtype=np.float32), rtol=1e-05, atol=1e-05, ) @@ -162,7 +162,7 @@ def run_model_with_cuda_graph_annotation(self, providers): session.run_with_iobinding(io_bindings[i], ro) io_bindings[i].synchronize_outputs() expected_y = np.array(expected_y_base[: i + 1][:] * INPUT_SIZE, dtype=np.float32) - np.testing.assert_allclose(expected_y, y_ortvalues[i].numpy(), rtol=1e-05, atol=1e-05) + np.testing.assert_allclose(y_ortvalues[i].numpy(), expected_y, rtol=1e-05, atol=1e-05) del ro ro = onnxrt.RunOptions() @@ -176,7 +176,7 @@ def run_model_with_cuda_graph_annotation(self, providers): session.run_with_iobinding(io_bindings[i], ro) io_bindings[i].synchronize_outputs() expected_y = np.array(expected_y_base_mul_10[: i + 1][:] * INPUT_SIZE, dtype=np.float32) - np.testing.assert_allclose(expected_y, y_ortvalues[i].numpy(), rtol=1e-05, atol=1e-05) + np.testing.assert_allclose(y_ortvalues[i].numpy(), expected_y, rtol=1e-05, 
atol=1e-05) def test_arena_with_cuda_graph(self): if "CUDAExecutionProvider" in onnxrt.get_available_providers(): @@ -214,7 +214,7 @@ def test_arena_with_cuda_graph(self): session.run_with_iobinding(io_binding) output = cuda_graph_helper.get_output("softmaxout_1") - np.testing.assert_allclose(expected_output, output, rtol=1e-02, atol=1e-02) + np.testing.assert_allclose(output, expected_output, rtol=1e-02, atol=1e-02) if __name__ == "__main__": diff --git a/onnxruntime/test/python/onnxruntime_test_python_dmlgraph.py b/onnxruntime/test/python/onnxruntime_test_python_dmlgraph.py index 033eae1cb4c8d..4a6aa7b63d9c3 100644 --- a/onnxruntime/test/python/onnxruntime_test_python_dmlgraph.py +++ b/onnxruntime/test/python/onnxruntime_test_python_dmlgraph.py @@ -63,18 +63,18 @@ class TestInferenceSessionWithDmlGraph(unittest.TestCase): def test_ort_value_update_in_place(self): x0 = np.array([[1.0, 2.0], [3.0, 4.0], [5.0, 6.0]], dtype=np.float32) ortvalue_cpu = onnxrt.OrtValue.ortvalue_from_numpy(x0) - np.testing.assert_allclose(x0, ortvalue_cpu.numpy()) + np.testing.assert_allclose(ortvalue_cpu.numpy(), x0) x1 = np.array([[10.0, 20.0], [30.0, 40.0], [50.0, 60.0]], dtype=np.float32) ortvalue_cpu.update_inplace(x1) - np.testing.assert_allclose(x1, ortvalue_cpu.numpy()) + np.testing.assert_allclose(ortvalue_cpu.numpy(), x1) if "DmlExecutionProvider" in onnxrt.get_available_providers(): ortvalue_gpu = onnxrt.OrtValue.ortvalue_from_numpy(x0, "dml", 0) - np.testing.assert_allclose(x0, ortvalue_gpu.numpy()) + np.testing.assert_allclose(ortvalue_gpu.numpy(), x0) ortvalue_gpu.update_inplace(x1) - np.testing.assert_allclose(x1, ortvalue_gpu.numpy()) + np.testing.assert_allclose(ortvalue_gpu.numpy(), x1) def test_select_ep_to_run_dml_graph(self): if "DmlExecutionProvider" in onnxrt.get_available_providers(): @@ -104,11 +104,11 @@ def run_model_with_dml_graph(self, providers): # One regular run for the necessary memory allocation and dml graph capturing session.run_with_iobinding(io_binding, ro) expected_y = np.array([[5.0], [11.0], [17.0]] * INPUT_SIZE, dtype=np.float32) - np.testing.assert_allclose(expected_y, y_ortvalue.numpy(), rtol=1e-05, atol=1e-05) + np.testing.assert_allclose(y_ortvalue.numpy(), expected_y, rtol=1e-05, atol=1e-05) # After capturing, DML graph replay happens from this Run onwards session.run_with_iobinding(io_binding, ro) - np.testing.assert_allclose(expected_y, y_ortvalue.numpy(), rtol=1e-05, atol=1e-05) + np.testing.assert_allclose(y_ortvalue.numpy(), expected_y, rtol=1e-05, atol=1e-05) # Update input and then replay DML graph x_ortvalue.update_inplace( @@ -119,8 +119,8 @@ def run_model_with_dml_graph(self, providers): ) session.run_with_iobinding(io_binding, ro) np.testing.assert_allclose( - np.array([[50.0], [110.0], [170.0]] * INPUT_SIZE, dtype=np.float32), y_ortvalue.numpy(), + np.array([[50.0], [110.0], [170.0]] * INPUT_SIZE, dtype=np.float32), rtol=1e-05, atol=1e-05, ) @@ -163,7 +163,7 @@ def run_model_with_dml_graph_annotation(self, providers): session.run_with_iobinding(io_bindings[i], ro) io_bindings[i].synchronize_outputs() expected_y = np.array(expected_y_base[: i + 1][:] * INPUT_SIZE, dtype=np.float32) - np.testing.assert_allclose(expected_y, y_ortvalues[i].numpy(), rtol=1e-05, atol=1e-05) + np.testing.assert_allclose(y_ortvalues[i].numpy(), expected_y, rtol=1e-05, atol=1e-05) del ro ro = onnxrt.RunOptions() @@ -177,7 +177,7 @@ def run_model_with_dml_graph_annotation(self, providers): session.run_with_iobinding(io_bindings[i], ro) io_bindings[i].synchronize_outputs() 
expected_y = np.array(expected_y_base_mul_10[: i + 1][:] * INPUT_SIZE, dtype=np.float32) - np.testing.assert_allclose(expected_y, y_ortvalues[i].numpy(), rtol=1e-05, atol=1e-05) + np.testing.assert_allclose(y_ortvalues[i].numpy(), expected_y, rtol=1e-05, atol=1e-05) if __name__ == "__main__": diff --git a/onnxruntime/test/python/onnxruntime_test_python_mlops.py b/onnxruntime/test/python/onnxruntime_test_python_mlops.py index 8b6b029c57752..70b8c0fc0b980 100644 --- a/onnxruntime/test/python/onnxruntime_test_python_mlops.py +++ b/onnxruntime/test/python/onnxruntime_test_python_mlops.py @@ -80,7 +80,7 @@ def test_dict_vectorizer(self): x = {0: 25.0, 1: 5.13, 2: 0.0, 3: 0.453, 4: 5.966} res = sess.run([output_name], {input_name: x}) output_expected = np.array([[49.752754]], dtype=np.float32) - np.testing.assert_allclose(output_expected, res[0], rtol=1e-05, atol=1e-08) + np.testing.assert_allclose(res[0], output_expected, rtol=1e-05, atol=1e-08) xwrong = x.copy() xwrong["a"] = 5.6 @@ -96,17 +96,17 @@ def test_dict_vectorizer(self): x = {np.int64(k): np.float32(v) for k, v in x.items()} res = sess.run([output_name], {input_name: x}) output_expected = np.array([[49.752754]], dtype=np.float32) - np.testing.assert_allclose(output_expected, res[0], rtol=1e-05, atol=1e-08) + np.testing.assert_allclose(res[0], output_expected, rtol=1e-05, atol=1e-08) x = {np.int64(k): np.float64(v) for k, v in x.items()} res = sess.run([output_name], {input_name: x}) output_expected = np.array([[49.752754]], dtype=np.float32) - np.testing.assert_allclose(output_expected, res[0], rtol=1e-05, atol=1e-08) + np.testing.assert_allclose(res[0], output_expected, rtol=1e-05, atol=1e-08) x = {np.int32(k): np.float64(v) for k, v in x.items()} res = sess.run([output_name], {input_name: x}) output_expected = np.array([[49.752754]], dtype=np.float32) - np.testing.assert_allclose(output_expected, res[0], rtol=1e-05, atol=1e-08) + np.testing.assert_allclose(res[0], output_expected, rtol=1e-05, atol=1e-08) def test_label_encoder(self): sess = onnxrt.InferenceSession(get_name("LabelEncoder.onnx"), providers=onnxrt.get_available_providers()) @@ -127,18 +127,18 @@ def test_label_encoder(self): x = np.array([["4"]]) res = sess.run([output_name], {input_name: x}) output_expected = np.array([[3]], dtype=np.int64) - np.testing.assert_allclose(output_expected, res[0], rtol=1e-05, atol=1e-08) + np.testing.assert_allclose(res[0], output_expected, rtol=1e-05, atol=1e-08) # Python type x = np.array(["4"], ndmin=2) res = sess.run([output_name], {input_name: x}) output_expected = np.array([3], ndmin=2, dtype=np.int64) - np.testing.assert_allclose(output_expected, res[0], rtol=1e-05, atol=1e-08) + np.testing.assert_allclose(res[0], output_expected, rtol=1e-05, atol=1e-08) x = np.array(["4"], ndmin=2, dtype=object) res = sess.run([output_name], {input_name: x}) output_expected = np.array([3], ndmin=2, dtype=np.int64) - np.testing.assert_allclose(output_expected, res[0], rtol=1e-05, atol=1e-08) + np.testing.assert_allclose(res[0], output_expected, rtol=1e-05, atol=1e-08) def test_run_model_mlnet(self): available_providers = onnxrt.get_available_providers() diff --git a/onnxruntime/test/python/onnxruntime_test_python_nv_tensorrt_rtx_ep_tests.py b/onnxruntime/test/python/onnxruntime_test_python_nv_tensorrt_rtx_ep_tests.py index d5c80a4a1f4ba..034f0288e2508 100644 --- a/onnxruntime/test/python/onnxruntime_test_python_nv_tensorrt_rtx_ep_tests.py +++ b/onnxruntime/test/python/onnxruntime_test_python_nv_tensorrt_rtx_ep_tests.py @@ -99,7 +99,7 @@ def 
test_nv_tensorrt_rtx_ep_register_and_inference(self): input_name = sess.get_inputs()[0].name res = sess.run([], {input_name: x}) output_expected = np.array([[1.0, 4.0], [9.0, 16.0], [25.0, 36.0]], dtype=np.float32) - np.testing.assert_allclose(output_expected, res[0], rtol=1e-05, atol=1e-08) + np.testing.assert_allclose(res[0], output_expected, rtol=1e-05, atol=1e-08) def test_nv_tensorrt_rtx_ep_prefer_gpu_and_inference(self): """ @@ -117,7 +117,7 @@ def test_nv_tensorrt_rtx_ep_prefer_gpu_and_inference(self): input_name = sess.get_inputs()[0].name res = sess.run([], {input_name: x}) output_expected = np.array([[1.0, 4.0], [9.0, 16.0], [25.0, 36.0]], dtype=np.float32) - np.testing.assert_allclose(output_expected, res[0], rtol=1e-05, atol=1e-08) + np.testing.assert_allclose(res[0], output_expected, rtol=1e-05, atol=1e-08) def test_nv_tensorrt_rtx_ep_selection_delegate_and_inference(self): """ @@ -152,7 +152,7 @@ def my_delegate( input_name = sess.get_inputs()[0].name res = sess.run([], {input_name: x}) output_expected = np.array([[1.0, 4.0], [9.0, 16.0], [25.0, 36.0]], dtype=np.float32) - np.testing.assert_allclose(output_expected, res[0], rtol=1e-05, atol=1e-08) + np.testing.assert_allclose(res[0], output_expected, rtol=1e-05, atol=1e-08) def test_bind_input_only(self): """ diff --git a/onnxruntime/test/python/quantization/test_fusions.py b/onnxruntime/test/python/quantization/test_fusions.py index bea110e566fb9..f02f4da4eb0fb 100644 --- a/onnxruntime/test/python/quantization/test_fusions.py +++ b/onnxruntime/test/python/quantization/test_fusions.py @@ -34,8 +34,8 @@ def check_fused_model_correctness(self, orig_model, fused_model, inputs, rtol=1e for idx, expected_output in enumerate(orig_results): actual_output = fused_results[idx] np.testing.assert_allclose( - expected_output, actual_output, + expected_output, rtol=rtol, atol=atol, err_msg=f"Fused model output {idx} differs", diff --git a/onnxruntime/test/python/quantization/test_qdq_loss_debug.py b/onnxruntime/test/python/quantization/test_qdq_loss_debug.py index 5d70641547eae..20b40fc157c16 100644 --- a/onnxruntime/test/python/quantization/test_qdq_loss_debug.py +++ b/onnxruntime/test/python/quantization/test_qdq_loss_debug.py @@ -156,7 +156,7 @@ def test_saved_tensors_match_internal_tensors(self): for expected, actual in zip(model_outputs, test_outputs, strict=False): exp = expected.reshape(-1) act = actual.reshape(-1) - np.testing.assert_equal(exp, act) + np.testing.assert_equal(act, exp) def test_create_activation_matching_present(self): float_model_path = str(Path(self._tmp_model_dir.name) / "float_model2.onnx") diff --git a/onnxruntime/test/python/quantization/test_quantizeblockwise_bnb4.py b/onnxruntime/test/python/quantization/test_quantizeblockwise_bnb4.py index a8f7591186766..906bf7aab8698 100644 --- a/onnxruntime/test/python/quantization/test_quantizeblockwise_bnb4.py +++ b/onnxruntime/test/python/quantization/test_quantizeblockwise_bnb4.py @@ -131,8 +131,8 @@ def test_quantize_blockwise_bnb4(self): matrix_float = np.random.uniform(-1, 1, (k, n)).astype(type) quant_value_ref, absmax_ref = quantize_blockwise_bnb4_ref(matrix_float, block_size, quant_type) quant_value, absmax = quantize_blockwise_bnb4_target(matrix_float, block_size, quant_type) - np.testing.assert_allclose(quant_value_ref, quant_value) - np.testing.assert_allclose(absmax_ref, absmax) + np.testing.assert_allclose(quant_value, quant_value_ref) + np.testing.assert_allclose(absmax, absmax_ref) if __name__ == "__main__": diff --git 
a/onnxruntime/test/testdata/custom_op_local_function/custom_op_test_local_function.py b/onnxruntime/test/testdata/custom_op_local_function/custom_op_test_local_function.py index 7916d93c3e531..1dedc475c9962 100644 --- a/onnxruntime/test/testdata/custom_op_local_function/custom_op_test_local_function.py +++ b/onnxruntime/test/testdata/custom_op_local_function/custom_op_test_local_function.py @@ -40,7 +40,7 @@ def test_basic_all(self): x = np.arange(2**2).reshape((2,) * 2).astype(np.float32) t = np.arange(8).reshape((2, 4)).astype(np.float32) got = sess.run(None, {"X": x})[0] - np.testing.assert_allclose(t, got, atol=1e-5) + np.testing.assert_allclose(got, t, atol=1e-5) if __name__ == "__main__": From dde2fefe9deff01c3f25a57f1a027443af68f734 Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Sat, 11 Oct 2025 10:16:52 +0000 Subject: [PATCH 05/19] Bump vite from 6.3.5 to 6.3.6 in /js/web/test/e2e/exports/testcases/vite-default (#26000) --- .../e2e/exports/testcases/vite-default/package-lock.json | 8 ++++---- .../test/e2e/exports/testcases/vite-default/package.json | 2 +- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/js/web/test/e2e/exports/testcases/vite-default/package-lock.json b/js/web/test/e2e/exports/testcases/vite-default/package-lock.json index 48f0a8f3e9d5c..e880f6bca2ac4 100644 --- a/js/web/test/e2e/exports/testcases/vite-default/package-lock.json +++ b/js/web/test/e2e/exports/testcases/vite-default/package-lock.json @@ -12,7 +12,7 @@ }, "devDependencies": { "@vitejs/plugin-vue": "^5.2.1", - "vite": "^6.3.5" + "vite": "^6.3.6" } }, "node_modules/@babel/helper-string-parser": { @@ -1114,9 +1114,9 @@ } }, "node_modules/vite": { - "version": "6.3.5", - "resolved": "https://registry.npmjs.org/vite/-/vite-6.3.5.tgz", - "integrity": "sha512-cZn6NDFE7wdTpINgs++ZJ4N49W2vRp8LCKrn3Ob1kYNtOo21vfDoaV5GzBfLU4MovSAB8uNRm4jgzVQZ+mBzPQ==", + "version": "6.3.6", + "resolved": "https://registry.npmjs.org/vite/-/vite-6.3.6.tgz", + "integrity": "sha512-0msEVHJEScQbhkbVTb/4iHZdJ6SXp/AvxL2sjwYQFfBqleHtnCqv1J3sa9zbWz/6kW1m9Tfzn92vW+kZ1WV6QA==", "dev": true, "license": "MIT", "dependencies": { diff --git a/js/web/test/e2e/exports/testcases/vite-default/package.json b/js/web/test/e2e/exports/testcases/vite-default/package.json index f7d5751354905..84013e2aecb88 100644 --- a/js/web/test/e2e/exports/testcases/vite-default/package.json +++ b/js/web/test/e2e/exports/testcases/vite-default/package.json @@ -13,6 +13,6 @@ }, "devDependencies": { "@vitejs/plugin-vue": "^5.2.1", - "vite": "^6.3.5" + "vite": "^6.3.6" } } From 41b238edee2cd78f27ef5e5a67b3d380b2c1d2e5 Mon Sep 17 00:00:00 2001 From: David Fan <30608893+jiafatom@users.noreply.github.com> Date: Mon, 13 Oct 2025 10:04:15 -0700 Subject: [PATCH 06/19] Fix Memory Issue sparse_attention Rotary (#26278) ### Description From an internal user, we see that sparse attention has similar memory issue of https://github.com/microsoft/onnxruntime/pull/22290/ So we follow that PR to make the change. ### Motivation and Context SparseAttention memory issue. 
--- .../contrib_ops/cpu/sparse/sparse_attention.cc | 18 +++++++----------- 1 file changed, 7 insertions(+), 11 deletions(-) diff --git a/onnxruntime/contrib_ops/cpu/sparse/sparse_attention.cc b/onnxruntime/contrib_ops/cpu/sparse/sparse_attention.cc index 469084e7b4491..c51fc1cf54815 100644 --- a/onnxruntime/contrib_ops/cpu/sparse/sparse_attention.cc +++ b/onnxruntime/contrib_ops/cpu/sparse/sparse_attention.cc @@ -130,6 +130,11 @@ Status SparseAttention::Compute(OpKernelContext* context) const { allocator, batch_size, kv_num_heads_, sequence_length, head_size, value, V)); } + OrtValue RotaryQKV; + OrtValue RotaryQ; + OrtValue RotaryK; + T* q_rotary = Q.GetMutable()->MutableData(); + T* k_rotary = packed_qkv ? nullptr : K.GetMutable()->MutableData(); if (do_rotary_) { rotary_embedding_helper::RotaryParameters rotary_params = {}; rotary_params.batch_size = batch_size; @@ -167,30 +172,22 @@ Status SparseAttention::Compute(OpKernelContext* context) const { const T* q_input; const T* k_input; - T* q_rotary; - T* k_rotary; if (packed_qkv) { - OrtValue RotaryQKV; TensorShape qkv_shape({batch_size, num_heads_ + 2 * kv_num_heads_, sequence_length, head_size}); Tensor::InitOrtValue(element_type, qkv_shape, allocator, RotaryQKV); q_input = Q.Get().Data(); k_input = q_input + num_heads_ * sequence_length * head_size; q_rotary = RotaryQKV.GetMutable()->MutableData(); k_rotary = q_rotary + num_heads_ * sequence_length * head_size; - Q = RotaryQKV; } else { - OrtValue RotaryQ; TensorShape q_shape({batch_size, num_heads_, sequence_length, head_size}); Tensor::InitOrtValue(element_type, q_shape, allocator, RotaryQ); - OrtValue RotaryK; TensorShape k_shape({batch_size, kv_num_heads_, sequence_length, head_size}); Tensor::InitOrtValue(element_type, k_shape, allocator, RotaryK); q_input = Q.Get().Data(); k_input = K.Get().Data(); q_rotary = RotaryQ.GetMutable()->MutableData(); k_rotary = RotaryK.GetMutable()->MutableData(); - Q = RotaryQ; - K = RotaryK; } ORT_RETURN_IF_ERROR(RunRotaryEmbedding(tp, rotary_params, q_input, @@ -221,9 +218,8 @@ Status SparseAttention::Compute(OpKernelContext* context) const { ORT_RETURN_IF_ERROR(context->GetTempSpaceAllocator(&allocator)); // Compute the attention score and apply the score to V - return ApplyAttention(Q.Get().Data(), packed_qkv ? nullptr : K.Get().Data(), - packed_qkv ? nullptr : V.Get().Data(), past_key, past_value, - output, present_key, present_value, + return ApplyAttention(q_rotary, packed_qkv ? nullptr : k_rotary, packed_qkv ? 
nullptr : V.Get().Data(), + past_key, past_value, output, present_key, present_value, total_key_lengths, block_row_indices, block_col_indices, parameters, allocator, context); } } // namespace contrib From 826a1e624082afc79c39e9e64ffde9e5be4fb62b Mon Sep 17 00:00:00 2001 From: Tianlei Wu Date: Mon, 13 Oct 2025 10:29:39 -0700 Subject: [PATCH 07/19] Add windows server to supported OS (#26275) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Add windows server to supported list to avoid confusing users: Marketing Name | Internal Version | platform.release().lower() | Release Year | Based on -- | -- | -- | -- | -- Windows Server 2025 | 10.0.26100+ | "2025server" | 2024–2025 | Windows 11 (24H2) Windows Server 2022 | 10.0.20348 | "2022server" | 2021 | Windows 10 (21H2) Windows Server 2019 | 10.0.17763 | "2019server" | 2018 | Windows 10 (1809) Windows Server 2016 | 10.0.14393 | "2016server" | 2016 | Windows 10 (1607) --- onnxruntime/python/onnxruntime_validation.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/onnxruntime/python/onnxruntime_validation.py b/onnxruntime/python/onnxruntime_validation.py index 4a72916d3e485..6912d19897d67 100644 --- a/onnxruntime/python/onnxruntime_validation.py +++ b/onnxruntime/python/onnxruntime_validation.py @@ -23,9 +23,9 @@ def check_distro_info(): __my_distro__ = __my_system__ __my_distro_ver__ = platform.release().lower() - if __my_distro_ver__ not in ["10", "11"]: + if __my_distro_ver__ not in ["10", "11", "2016server", "2019server", "2022server", "2025server"]: warnings.warn( - f"Unsupported Windows version ({__my_distro_ver__}). ONNX Runtime supports Windows 10 and above, only." + f"Unsupported Windows version ({__my_distro_ver__}). ONNX Runtime supports Windows 10 and above, or Windows Server 2016 and above." ) elif __my_system__ == "linux": """Although the 'platform' python module for getting Distro information works well on standard OS images From 96f56ba000231e8566b2249367f5d3223108b44f Mon Sep 17 00:00:00 2001 From: Jonathan Clohessy Date: Mon, 13 Oct 2025 19:44:51 +0100 Subject: [PATCH 08/19] Reworked sgemm_kleidi memory allocations to reuse memory buffers (#26166) ### **Key changes** This PR makes changes to KleidiAI integration within the existing sgemm_kleidiai.cpp implementation. It was noted that during internal testing that memory allocation overhead due to repeated allocations of vectors was having a negative impact on performance figures. The changes introduce thread local buffers for reusing memory during inference. Android platforms are particularly sensitive to this, we have observed inference times being significantly impacted due to memory allocation overheads ### Example performance All runs were captured using onnxruntime_perf_test e.g. 
onnxruntime_perf_test -v -e cpu -I -m times -x 1 -y 1 -r 1000 **Android Platform** image In addition to this on M4 we have also observed slight improvements on models, however its the gain is not as significant as the allocation overhead is lower in terms of total time on that platform **Mac Mini M4** image **Onnxruntime Mlas Benchmark** Mlas Benchmark was executed on a Mac Mini M4 with SME2 instructions Tested code with and without changes in pr and observed the following results (subset shown) comparison generated using compare.py located in google benchmark repo tools `./onnxruntime_mlas_benchmark --benchmark_filter="SGEMM/NORMAL*" --benchmark_repetitions=100` ``` Benchmark Time CPU Time Old Time New CPU Old CPU New -------------------------------------------------------------------------------------------------------------------------------------------------- SGEMM/NORMAL_NoTrans/M:63/N:63/K:63/real_time -0.1897 -0.1897 3270 2650 3270 2650 SGEMM/NORMAL_NoTrans/M:255/N:63/K:63/real_time -0.1468 -0.1469 8383 7152 8382 7151 SGEMM/NORMAL_NoTrans/M:1023/N:63/K:63/real_time -0.1506 -0.1506 19072 16200 19072 16200 SGEMM/NORMAL_NoTrans/M:63/N:255/K:63/real_time -0.1957 -0.1957 7742 6227 7742 6227 SGEMM/NORMAL_NoTrans/M:255/N:255/K:63/real_time -0.1032 -0.1032 14323 12845 14322 12845 SGEMM/NORMAL_TransB/M:63/N:63/K:63/real_time -0.2221 -0.2221 3356 2611 3356 2610 SGEMM/NORMAL_TransB/M:255/N:63/K:63/real_time -0.0439 -0.0438 8602 8224 8601 8224 SGEMM/NORMAL_TransB/M:1023/N:63/K:63/real_time +0.0436 +0.0436 16488 17206 16487 17206 SGEMM/NORMAL_TransB/M:63/N:255/K:63/real_time -0.2000 -0.1999 8046 6437 8046 6437 SGEMM/NORMAL_TransB/M:255/N:255/K:63/real_time -0.0979 -0.0979 14131 12747 14130 12747 SGEMM/NORMAL_TransB/M:1023/N:255/K:63/real_time -0.2836 -0.2836 62540 44802 62540 44802 SGEMM/NORMAL_TransB/M:63/N:1023/K:63/real_time -0.2183 -0.2183 15342 11993 15342 ``` Some small regressions have been seen but are difficult to explain, suspected machine variance during run could account for things like ``` SGEMM/NORMAL_TransB/M:1023/N:63/K:63/real_time +0.0436 +0.0436 16488 17206 16487 17206 ``` For example, as part of testing these results sgemm_kleidi.cpp was instrumented (after the previous benchmark results) with timer code, in MlasGemmBatch, MlasGemmPackB, and MlasGemmPackBSize. 
Which produced the following, indicating that the code performs better in this case on average than baseline which is currently in main ``` Head of main Function Count Avg (ns) Avg (pretty) ---------------------------------------------------------- MlasGemmBatch 42664 19601.015 19.601 us MlasGemmPackB 42664 373.943 373.943 ns MlasGemmPackBSize 42664 17.179 17.179 ns TLB changes Function Count Avg (ns) Avg (pretty) ---------------------------------------------------------- MlasGemmBatch 55492 16985.256 16.985 us MlasGemmPackB 55492 344.800 344.800 ns MlasGemmPackBSize 55492 16.788 16.788 ns ``` --------- Signed-off-by: Jonathan Clohessy --- .../core/mlas/lib/kleidiai/mlasi_kleidiai.h | 34 ++++++ .../core/mlas/lib/kleidiai/sgemm_kleidiai.cpp | 100 +++++++++++------- 2 files changed, 96 insertions(+), 38 deletions(-) diff --git a/onnxruntime/core/mlas/lib/kleidiai/mlasi_kleidiai.h b/onnxruntime/core/mlas/lib/kleidiai/mlasi_kleidiai.h index 5136061c4769d..2e9c4574fd057 100644 --- a/onnxruntime/core/mlas/lib/kleidiai/mlasi_kleidiai.h +++ b/onnxruntime/core/mlas/lib/kleidiai/mlasi_kleidiai.h @@ -115,3 +115,37 @@ MlasConv( MLAS_THREADPOOL* ThreadPool ); } + +/*++ + +Routine Description: + + This routine determines if a wraparound will occur when multiplying two size_t variables + Uses __builtin_mul_overflow if available on the current system and if not falls back + to a default implementation to check this wraparound. + +Arguments: + + a - Supplies the first number to be muliplied. + + b - Supplies the second number to be muliplied. + + out - pointer to a size_t which acts as the return value in success cases. + +Return Value: + + Returns false if the operation was successful + Returns true if wraparound of size_t was detected + +--*/ +inline bool mul_overflow_size_t_builtin(size_t a, size_t b, size_t* out) { +#if defined(__has_builtin) +# if __has_builtin(__builtin_mul_overflow) + return __builtin_mul_overflow(a, b, out); +# endif +#endif + // Fallback to manual check if builtin not available + if (b != 0 && a > SIZE_MAX / b) return true; + if (out) *out = a * b; + return false; +} diff --git a/onnxruntime/core/mlas/lib/kleidiai/sgemm_kleidiai.cpp b/onnxruntime/core/mlas/lib/kleidiai/sgemm_kleidiai.cpp index ea38f16205a7c..435ff1fb10017 100644 --- a/onnxruntime/core/mlas/lib/kleidiai/sgemm_kleidiai.cpp +++ b/onnxruntime/core/mlas/lib/kleidiai/sgemm_kleidiai.cpp @@ -14,6 +14,16 @@ #include "kai/ukernels/matmul/pack/kai_rhs_pack_nxk_f32p2vlx1biasf32_f32_f32_sme.h" #include "mlasi_kleidiai.h" + +// Thread-local reusable buffers to reduce allocation overhead across tiles. +struct KaiTlsBuffers { + std::vector output_tile; + std::vector bias_zero; + std::vector rhs_packed; + std::vector lhs_packed; +}; +static thread_local KaiTlsBuffers g_kai_tls; + size_t MLASCALL ArmKleidiAI::MlasGemmPackBSize( @@ -51,7 +61,6 @@ Return Value: // Compute the number of bytes required to hold the packed buffer. // size_t bytes = 0; - if (TransA == CblasNoTrans) { switch (TransB) { case CblasNoTrans: @@ -125,15 +134,15 @@ Return Value: const size_t sr = UseSME2 ? kai_get_sr_matmul_clamp_f32_f32p2vlx1_f32p2vlx1biasf32_sme2_mopa() : kai_get_sr_matmul_clamp_f32_f32p2vlx1_f32p2vlx1b_2vlx2vl_sme_mopa(); - // pass zeroed bias values - const std::vector bias(N); + // Ensure size and zero the used span. 
+ g_kai_tls.bias_zero.resize(N, 0.0f); switch (TransB) { case CblasNoTrans: - kai_run_rhs_pack_kxn_f32p2vlx1biasf32_f32_f32_sme(1, N, K, nr, kr, sr, ldb * sizeof(float), B, bias.data(), nullptr, PackedB, 0, nullptr); + kai_run_rhs_pack_kxn_f32p2vlx1biasf32_f32_f32_sme(1, N, K, nr, kr, sr, ldb * sizeof(float), B, g_kai_tls.bias_zero.data(), nullptr, PackedB, 0, nullptr); break; case CblasTrans: - kai_run_rhs_pack_nxk_f32p2vlx1biasf32_f32_f32_sme(1, N, K, nr, kr, sr, ldb * sizeof(float), B, bias.data(), nullptr, PackedB, 0, nullptr); + kai_run_rhs_pack_nxk_f32p2vlx1biasf32_f32_f32_sme(1, N, K, nr, kr, sr, ldb * sizeof(float), B, g_kai_tls.bias_zero.data(), nullptr, PackedB, 0, nullptr); break; default: return false; @@ -225,22 +234,29 @@ Return Value: size_t n_step = UseSME2 ? kai_get_n_step_matmul_clamp_f32_f32p2vlx1_f32p2vlx1biasf32_sme2_mopa() : kai_get_n_step_matmul_clamp_f32_f32p2vlx1_f32p2vlx1b_2vlx2vl_sme_mopa(); - if (M < m_step && N < n_step && !Data->BIsPacked) { + if ((M < m_step || N < n_step) && !Data->BIsPacked) { // Fallback to MLAS return false; } - std::vector KaiPackedData; - KaiPackedData.resize(BatchSize); - size_t LhsPackedStride = 0; std::byte* LhsPackedData = nullptr; LhsPackedStride = kai_get_lhs_packed_size_lhs_pack_f32p2vlx1_f32_sme(M, K, mr, kr, sr); - auto LhsPacked = std::make_unique(LhsPackedStride * BatchSize); - LhsPackedData = LhsPacked.get(); - std::unique_ptr RhsPacked{nullptr}; + size_t lhs_resize = 0; + if(mul_overflow_size_t_builtin(LhsPackedStride, BatchSize, &lhs_resize)) + { + // size_t wraparound detected for LhsPackedStride, fallback to MLAS + return false; + } + + g_kai_tls.lhs_packed.resize(lhs_resize); + LhsPackedData = g_kai_tls.lhs_packed.data(); + + // RHS packed buffer: use TLS reusable vector to minimize allocations + size_t RhsPackedStride = 0; + std::byte* RhsPackedData = nullptr; // It is assumed all B batches require packing or not if (Data[0].BIsPacked) { @@ -248,36 +264,31 @@ Return Value: MlasTrySimpleParallel(ThreadPool, BatchSize, [&](ptrdiff_t batch_idx) { std::byte* LhsPackedPtr = &(LhsPackedData[LhsPackedStride * batch_idx]); kai_run_lhs_pack_f32p2vlx1_f32_sme(M, K, mr, kr, sr, 0, Data[batch_idx].A, Data[batch_idx].lda * sizeof(float), LhsPackedPtr); - KaiPackedData[batch_idx].A = reinterpret_cast(LhsPackedPtr); - KaiPackedData[batch_idx].B = Data[batch_idx].B; }); } else { // Multithread pack lhs and rhs - size_t RhsPackedStride = 0; - std::byte* RhsPackedData = nullptr; - RhsPackedStride = ArmKleidiAI::MlasGemmPackBSize(TransA, TransB, N, K); - RhsPacked = std::make_unique(RhsPackedStride * BatchSize); - RhsPackedData = RhsPacked.get(); + size_t rhs_resize = 0; + if (mul_overflow_size_t_builtin(RhsPackedStride, BatchSize, &rhs_resize)) + { + // size_t wraparound detected for RhsPackedStride, fallback to MLAS + return false; + } + + g_kai_tls.rhs_packed.resize(rhs_resize); + RhsPackedData = g_kai_tls.rhs_packed.data(); MlasTrySimpleParallel(ThreadPool, BatchSize * 2, [&](ptrdiff_t batch_idx) { - // lhs odd, rhs even if (batch_idx & 0x1) { batch_idx >>= 1; - std::byte* LhsPackedPtr = &(LhsPackedData[LhsPackedStride * batch_idx]); - kai_run_lhs_pack_f32p2vlx1_f32_sme(M, K, mr, kr, sr, 0, Data[batch_idx].A, Data[batch_idx].lda * sizeof(float), LhsPackedPtr); - - KaiPackedData[batch_idx].A = reinterpret_cast(LhsPackedPtr); } else { batch_idx >>= 1; - std::byte* RhsPackedPtr = &(RhsPackedData[RhsPackedStride * batch_idx]); - - ArmKleidiAI::MlasGemmPackB(TransA, TransB, N, K, reinterpret_cast(Data[batch_idx].B), Data[batch_idx].ldb, 
RhsPackedPtr); - - KaiPackedData[batch_idx].B = reinterpret_cast(RhsPackedPtr); + ArmKleidiAI::MlasGemmPackB(TransA, TransB, N, K, + reinterpret_cast(Data[batch_idx].B), + Data[batch_idx].ldb, RhsPackedPtr); } }); } @@ -303,6 +314,14 @@ Return Value: dim[1] = MlasDivRoundup(M, m_step); dim[2] = MlasDivRoundup(N, n_step); + // Pre-check maximum tile size to avoid per-iteration overflow inside the parallel loop. + // Any TileSizeM/TileSizeN used below will be <= m_step/n_step respectively. + size_t max_tile_elems = 0; + if (mul_overflow_size_t_builtin(m_step, n_step, &max_tile_elems)) { + // size_t wraparound detected for tile size, fallback to MLAS + return false; + } + MlasTrySimpleParallel(ThreadPool, static_cast(dim[0] * dim[1] * dim[2]), [=](ptrdiff_t tid) { // compute B,M,N index from iteration index ptrdiff_t BIdx = tid / (dim[1] * dim[2]); @@ -314,18 +333,18 @@ Return Value: UseSME2 ? kai_get_rhs_packed_offset_matmul_clamp_f32_f32p2vlx1_f32p2vlx1biasf32_sme2_mopa(NIdx * n_step, K) : kai_get_rhs_packed_offset_matmul_clamp_f32_f32p2vlx1_f32p2vlx1b_2vlx2vl_sme_mopa(NIdx * n_step, K); - auto BTile = reinterpret_cast( - reinterpret_cast(KaiPackedData[BIdx].B) + rhs_packed_offset - ); + const std::byte* B_base = Data[0].BIsPacked + ? reinterpret_cast(Data[BIdx].B) + : (RhsPackedData + RhsPackedStride * BIdx); + auto BTile = reinterpret_cast(B_base + rhs_packed_offset); // Get lhs tile, A const size_t lhs_packed_offset = UseSME2 ? kai_get_lhs_packed_offset_matmul_clamp_f32_f32p2vlx1_f32p2vlx1biasf32_sme2_mopa(MIdx * m_step, K) : kai_get_lhs_packed_offset_matmul_clamp_f32_f32p2vlx1_f32p2vlx1b_2vlx2vl_sme_mopa(MIdx * m_step, K); - auto ATile = reinterpret_cast( - reinterpret_cast(KaiPackedData[BIdx].A) + lhs_packed_offset - ); + const std::byte* A_base = LhsPackedData + LhsPackedStride * BIdx; + auto ATile = reinterpret_cast(A_base + lhs_packed_offset); auto TileSizeM = (MIdx + 1) * m_step > M ? (M - MIdx * m_step) : m_step; auto TileSizeN = (NIdx + 1) * n_step > N ? (N - NIdx * n_step) : n_step; @@ -336,9 +355,14 @@ Return Value: MIdx * m_step * Data[BIdx].ldc * sizeof(float) + NIdx * n_step * sizeof(float) ); - // Allocate temporary buffer for raw A*B result - std::vector OutputTile(TileSizeM * TileSizeN, 0.0f); - float* temp_tile = OutputTile.data(); + // Allocate temporary buffer for raw A*B result (TLS reusable buffer) + size_t tile_elems = TileSizeM * TileSizeN; + + // resize the tile to the required size + g_kai_tls.output_tile.resize(tile_elems); + + float* temp_tile = g_kai_tls.output_tile.data(); + std::fill_n(temp_tile, tile_elems, 0.0f); if (UseSME2) { kai_run_matmul_clamp_f32_f32p2vlx1_f32p2vlx1biasf32_sme2_mopa( From 311b4a647690ea4ee53dc6cb99cc9b9b1d1c6218 Mon Sep 17 00:00:00 2001 From: Tianlei Wu Date: Mon, 13 Oct 2025 14:16:31 -0700 Subject: [PATCH 09/19] [CUDA] Upgrade build pipelines to use CUDA 12.8 + cuDNN 9.8 (#26267) This upgrades CUDA 12.2 + cuDNN 9.5 to CUDA 12.8 + cuDNN 9.8 in CI pipelines, so that we can build 120-real to support Blackwell GPU. To speed up build, we also disable relocatable-device-code. MSVC is updated to latest for some windows build pipelines. #### Known issues Some onnx models (yolo v3, yolo v4, mobilenet v1) failed to run due to cudnn frontend failed to find engine plan. We will try upgrade cudnn frontend later. Related failed tests are disabled for now. 
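For anyone who wants to reproduce the known issue locally, a minimal sketch is below. It assumes an onnxruntime-gpu build from this branch plus a local copy of one of the affected models; the `yolov3.onnx` path and the zero-filled dummy feeds are placeholders for illustration, not anything added by this PR. On affected CUDA 12.8 + cuDNN 9.8 builds the failure surfaces at `run()`, while other builds should pass.

```python
# Hedged repro sketch; the model path is a placeholder and the dummy inputs
# only exercise session creation and a single run() call.
import numpy as np
import onnxruntime as ort

sess = ort.InferenceSession(
    "yolov3.onnx",  # placeholder path to one of the affected models
    providers=["CUDAExecutionProvider", "CPUExecutionProvider"],
)

# Build zero-filled feeds, substituting 1 for any symbolic/dynamic dimensions.
feeds = {}
for inp in sess.get_inputs():
    shape = [d if isinstance(d, int) else 1 for d in inp.shape]
    dtype = np.float32 if "float" in inp.type else np.int64
    feeds[inp.name] = np.zeros(shape, dtype=dtype)

# On affected builds this run() call is where the cuDNN frontend
# engine-plan failure shows up.
outputs = sess.run(None, feeds)
print([o.shape for o in outputs])
```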
--------- Co-authored-by: Changming Sun --- .github/workflows/publish-csharp-apidocs.yml | 2 +- .github/workflows/windows_cuda.yml | 26 +++++++++---------- .github/workflows/windows_openvino.yml | 2 +- .github/workflows/windows_qnn_x64.yml | 2 +- .github/workflows/windows_tensorrt.yml | 26 +++++++++---------- .../windows_x64_debug_build_x64_debug.yml | 2 +- .../windows_x64_release_build_x64_release.yml | 2 +- ...build_x64_release_ep_generic_interface.yml | 2 +- ..._x64_release_vitisai_build_x64_release.yml | 2 +- .../workflows/windows_x64_release_xnnpack.yml | 2 +- .github/workflows/windows_x86.yml | 2 +- cmake/onnxruntime_providers_cuda.cmake | 4 +-- .../gather_block_quantized_op_test.cc | 2 +- onnxruntime/test/providers/cpu/model_tests.cc | 9 ++++++- .../test/providers/cpu/tensor/cast_op_test.cc | 2 +- .../build-perf-test-binaries-pipeline.yml | 2 +- .../c-api-noopenmp-packaging-pipelines.yml | 6 ++--- .../c-api-noopenmp-test-pipelines.yml | 12 ++++----- .../cuda-packaging-pipeline.yml | 10 +++---- .../custom-nuget-packaging-pipeline.yml | 18 +++---------- .../azure-pipelines/jar_package_testing.yml | 4 +-- ...-gpu-tensorrt-cuda-minimal-ci-pipeline.yml | 12 ++++----- .../nuget/templates/test_linux.yml | 6 ++--- .../azure-pipelines/post-merge-jobs.yml | 6 ++--- .../py-cuda-package-test-pipeline.yml | 4 +-- .../py-cuda-packaging-pipeline.yml | 2 +- .../jobs/py-linux-cuda-package-test-job.yml | 14 +++++----- .../stages/nodejs-linux-packaging-stage.yml | 6 ++--- .../nuget-linux-cuda-packaging-stage.yml | 12 ++++----- .../stages/py-gpu-packaging-stage.yml | 6 ++--- .../stages/py-linux-gpu-stage.yml | 6 ++--- .../stages/py-win-gpu-stage.yml | 10 +++---- .../jobs/download_win_gpu_library.yml | 8 +++--- .../templates/jobs/set-winenv.yml | 2 +- .../py-packaging-linux-test-cuda.yml | 6 ++--- .../azure-pipelines/templates/win-ci.yml | 4 +-- .../win-gpu-doc-gen-ci-pipeline.yml | 4 +-- ...-gpu-tensorrt-cuda-minimal-ci-pipeline.yml | 6 ++--- .../linux/docker/Dockerfile.manylinux2_28_cpu | 2 +- .../docker/Dockerfile.manylinux2_28_rocm | 2 +- .../docker/Dockerfile.manylinux2_28_webgpu | 2 +- .../docker/Dockerfile.package_ubuntu_2004_gpu | 2 +- .../inference/aarch64/default/cpu/Dockerfile | 2 +- .../inference/aarch64/python/cpu/Dockerfile | 2 +- .../inference/x86_64/default/cpu/Dockerfile | 2 +- .../x86_64/default/cuda12/Dockerfile | 2 +- .../inference/x86_64/python/cpu/Dockerfile | 2 +- .../x86_64/python/openvino/Dockerfile | 2 +- .../github/windows/setup_env_cuda.bat | 8 +++--- .../ci_build/github/windows/setup_env_gpu.bat | 8 +++--- .../ci_build/github/windows/setup_env_trt.bat | 6 ++--- 51 files changed, 145 insertions(+), 150 deletions(-) diff --git a/.github/workflows/publish-csharp-apidocs.yml b/.github/workflows/publish-csharp-apidocs.yml index 42d1bdc295785..683c5594e82f2 100644 --- a/.github/workflows/publish-csharp-apidocs.yml +++ b/.github/workflows/publish-csharp-apidocs.yml @@ -20,7 +20,7 @@ permissions: jobs: build: - runs-on: ["self-hosted", "1ES.Pool=onnxruntime-github-vs2022-mms"] + runs-on: ["self-hosted", "1ES.Pool=onnxruntime-github-vs2022-latest"] env: DOCFXVERSION: 2.62.2 steps: diff --git a/.github/workflows/windows_cuda.yml b/.github/workflows/windows_cuda.yml index 437fc0e2c6334..3d24d4b6b75b6 100644 --- a/.github/workflows/windows_cuda.yml +++ b/.github/workflows/windows_cuda.yml @@ -19,7 +19,7 @@ concurrency: jobs: build: name: Windows GPU CUDA CI Pipeline - runs-on: ["self-hosted", "1ES.Pool=onnxruntime-github-vs2022-mms"] + runs-on: ["self-hosted", 
"1ES.Pool=onnxruntime-github-vs2022-latest"] steps: - uses: actions/checkout@v5 with: @@ -41,10 +41,10 @@ jobs: working-directory: ${{ github.workspace }} shell: cmd - - name: Download CUDA SDK v12.2 + - name: Download CUDA SDK v12.8 working-directory: ${{ runner.temp }} run: | - azcopy.exe cp --recursive "https://lotusscus.blob.core.windows.net/models/cuda_sdk/v12.2" . + azcopy.exe cp --recursive "https://lotusscus.blob.core.windows.net/models/cuda_sdk/v12.8" . dir shell: pwsh @@ -52,9 +52,9 @@ jobs: shell: powershell run: | Write-Host "Adding CUDA to PATH" - Write-Host "CUDA Path: $env:RUNNER_TEMP\v12.2\bin" - Add-Content -Path $env:GITHUB_PATH -Value "$env:RUNNER_TEMP\v12.2\bin" - Add-Content -Path $env:GITHUB_PATH -Value "$env:RUNNER_TEMP\v12.2\extras\CUPTI\lib64" + Write-Host "CUDA Path: $env:RUNNER_TEMP\v12.8\bin" + Add-Content -Path $env:GITHUB_PATH -Value "$env:RUNNER_TEMP\v12.8\bin" + Add-Content -Path $env:GITHUB_PATH -Value "$env:RUNNER_TEMP\v12.8\extras\CUPTI\lib64" - uses: actions/setup-node@v5 with: @@ -111,7 +111,7 @@ jobs: exit $lastExitCode } # Execute the build process - python.exe ${{ github.workspace }}\tools\ci_build\build.py --update --build --config RelWithDebInfo --build_dir build --skip_submodule_sync --build_csharp --parallel --use_binskim_compliant_compile_flags --cmake_generator "Visual Studio 17 2022" --build_shared_lib --build_wheel --build_java --use_cuda --cuda_home="$env:RUNNER_TEMP\v12.2" --enable_cuda_profiling --use_vcpkg --use_vcpkg_ms_internal_asset_cache --enable_transformers_tool_test --cmake_extra_defines CMAKE_CUDA_ARCHITECTURES=86 --cmake_extra_defines onnxruntime_ENABLE_CUDA_EP_INTERNAL_TESTS=ON + python.exe ${{ github.workspace }}\tools\ci_build\build.py --update --build --config RelWithDebInfo --build_dir build --skip_submodule_sync --build_csharp --parallel --use_binskim_compliant_compile_flags --cmake_generator "Visual Studio 17 2022" --build_shared_lib --build_wheel --build_java --use_cuda --cuda_home="$env:RUNNER_TEMP\v12.8" --enable_cuda_profiling --use_vcpkg --use_vcpkg_ms_internal_asset_cache --enable_transformers_tool_test --cmake_extra_defines CMAKE_CUDA_ARCHITECTURES=86 --cmake_extra_defines onnxruntime_ENABLE_CUDA_EP_INTERNAL_TESTS=ON if ($lastExitCode -ne 0) { exit $lastExitCode } @@ -188,10 +188,10 @@ jobs: working-directory: ${{ github.workspace }} shell: cmd - - name: Download CUDA SDK v12.2 + - name: Download CUDA SDK v12.8 working-directory: ${{ runner.temp }} run: | - azcopy.exe cp --recursive "https://lotusscus.blob.core.windows.net/models/cuda_sdk/v12.2" . + azcopy.exe cp --recursive "https://lotusscus.blob.core.windows.net/models/cuda_sdk/v12.8" . 
dir shell: pwsh @@ -199,9 +199,9 @@ jobs: shell: powershell run: | Write-Host "Adding CUDA to PATH" - Write-Host "CUDA Path: $env:RUNNER_TEMP\v12.2\bin" - Add-Content -Path $env:GITHUB_PATH -Value "$env:RUNNER_TEMP\v12.2\bin" - Add-Content -Path $env:GITHUB_PATH -Value "$env:RUNNER_TEMP\v12.2\extras\CUPTI\lib64" + Write-Host "CUDA Path: $env:RUNNER_TEMP\v12.8\bin" + Add-Content -Path $env:GITHUB_PATH -Value "$env:RUNNER_TEMP\v12.8\bin" + Add-Content -Path $env:GITHUB_PATH -Value "$env:RUNNER_TEMP\v12.8\extras\CUPTI\lib64" - name: Set OnnxRuntimeBuildDirectory shell: pwsh @@ -227,7 +227,7 @@ jobs: exit $lastExitCode } - python.exe ${{ github.workspace }}\tools\ci_build\build.py --test --config RelWithDebInfo --build_dir build --skip_submodule_sync --build_csharp --parallel --use_binskim_compliant_compile_flags --cmake_generator "Visual Studio 17 2022" --build_shared_lib --build_wheel --build_java --use_cuda --cuda_home="$env:RUNNER_TEMP\v12.2" --enable_cuda_profiling --use_vcpkg --use_vcpkg_ms_internal_asset_cache --enable_transformers_tool_test --cmake_extra_defines CMAKE_CUDA_ARCHITECTURES=86 --cmake_extra_defines onnxruntime_ENABLE_CUDA_EP_INTERNAL_TESTS=ON + python.exe ${{ github.workspace }}\tools\ci_build\build.py --test --config RelWithDebInfo --build_dir build --skip_submodule_sync --build_csharp --parallel --use_binskim_compliant_compile_flags --cmake_generator "Visual Studio 17 2022" --build_shared_lib --build_wheel --build_java --use_cuda --cuda_home="$env:RUNNER_TEMP\v12.8" --enable_cuda_profiling --use_vcpkg --use_vcpkg_ms_internal_asset_cache --enable_transformers_tool_test --cmake_extra_defines CMAKE_CUDA_ARCHITECTURES=86 --cmake_extra_defines onnxruntime_ENABLE_CUDA_EP_INTERNAL_TESTS=ON if ($lastExitCode -ne 0) { exit $lastExitCode } diff --git a/.github/workflows/windows_openvino.yml b/.github/workflows/windows_openvino.yml index 395ccfbe70244..b608c0879aa45 100644 --- a/.github/workflows/windows_openvino.yml +++ b/.github/workflows/windows_openvino.yml @@ -18,7 +18,7 @@ concurrency: jobs: BUILD_OPENVINO_EP: name: Windows OpenVINO CI Pipeline - runs-on: ["self-hosted", "1ES.Pool=onnxruntime-github-vs2022-mms"] + runs-on: ["self-hosted", "1ES.Pool=onnxruntime-github-vs2022-latest"] timeout-minutes: 240 env: AZCOPY_AUTO_LOGIN_TYPE: MSI diff --git a/.github/workflows/windows_qnn_x64.yml b/.github/workflows/windows_qnn_x64.yml index 9788792b94fa8..1906fcb18c841 100644 --- a/.github/workflows/windows_qnn_x64.yml +++ b/.github/workflows/windows_qnn_x64.yml @@ -18,7 +18,7 @@ concurrency: jobs: build_test_qnn_ep: name: Windows x64 QNN CI Pipeline (${{ matrix.QnnLibKind }}) - runs-on: ["self-hosted", "1ES.Pool=onnxruntime-github-vs2022-mms"] + runs-on: ["self-hosted", "1ES.Pool=onnxruntime-github-vs2022-latest"] timeout-minutes: 120 strategy: matrix: diff --git a/.github/workflows/windows_tensorrt.yml b/.github/workflows/windows_tensorrt.yml index 5f3dcb9607a47..2a1fe97d9b7b7 100644 --- a/.github/workflows/windows_tensorrt.yml +++ b/.github/workflows/windows_tensorrt.yml @@ -19,7 +19,7 @@ concurrency: jobs: build: name: Windows GPU TensorRT CI Pipeline - runs-on: ["self-hosted", "1ES.Pool=onnxruntime-github-vs2022-mms"] + runs-on: ["self-hosted", "1ES.Pool=onnxruntime-github-vs2022-latest"] steps: - uses: actions/checkout@v5 with: @@ -41,10 +41,10 @@ jobs: working-directory: ${{ github.workspace }} shell: cmd - - name: Download CUDA SDK v12.2 + - name: Download CUDA SDK v12.8 working-directory: ${{ runner.temp }} run: | - azcopy.exe cp --recursive 
"https://lotusscus.blob.core.windows.net/models/cuda_sdk/v12.2" . + azcopy.exe cp --recursive "https://lotusscus.blob.core.windows.net/models/cuda_sdk/v12.8" . dir shell: pwsh @@ -56,9 +56,9 @@ jobs: shell: powershell run: | Write-Host "Adding CUDA to PATH" - Write-Host "CUDA Path: $env:RUNNER_TEMP\v12.2\bin" - Add-Content -Path $env:GITHUB_PATH -Value "$env:RUNNER_TEMP\v12.2\bin" - Add-Content -Path $env:GITHUB_PATH -Value "$env:RUNNER_TEMP\v12.2\extras\CUPTI\lib64" + Write-Host "CUDA Path: $env:RUNNER_TEMP\v12.8\bin" + Add-Content -Path $env:GITHUB_PATH -Value "$env:RUNNER_TEMP\v12.8\bin" + Add-Content -Path $env:GITHUB_PATH -Value "$env:RUNNER_TEMP\v12.8\extras\CUPTI\lib64" Add-Content -Path $env:GITHUB_PATH -Value "$env:RUNNER_TEMP\TensorRT-10.9.0.34.Windows10.x86_64.cuda-12.8\lib" - uses: actions/setup-node@v5 @@ -116,7 +116,7 @@ jobs: exit $lastExitCode } # Execute the build process - python ${{ github.workspace }}\tools\ci_build\build.py --config RelWithDebInfo --parallel --use_binskim_compliant_compile_flags --build_dir build --skip_submodule_sync --build_shared_lib --build --update --cmake_generator "Visual Studio 17 2022" --build_wheel --enable_onnx_tests --use_tensorrt --tensorrt_home="${{ runner.temp }}\TensorRT-10.9.0.34.Windows10.x86_64.cuda-12.8" --cuda_home="${{ runner.temp }}\v12.2" --use_vcpkg --use_vcpkg_ms_internal_asset_cache --cmake_extra_defines CMAKE_CUDA_ARCHITECTURES=86 + python ${{ github.workspace }}\tools\ci_build\build.py --config RelWithDebInfo --parallel --use_binskim_compliant_compile_flags --build_dir build --skip_submodule_sync --build_shared_lib --build --update --cmake_generator "Visual Studio 17 2022" --build_wheel --enable_onnx_tests --use_tensorrt --tensorrt_home="${{ runner.temp }}\TensorRT-10.9.0.34.Windows10.x86_64.cuda-12.8" --cuda_home="${{ runner.temp }}\v12.8" --use_vcpkg --use_vcpkg_ms_internal_asset_cache --cmake_extra_defines CMAKE_CUDA_ARCHITECTURES=86 if ($lastExitCode -ne 0) { exit $lastExitCode } @@ -193,10 +193,10 @@ jobs: working-directory: ${{ github.workspace }} shell: cmd - - name: Download CUDA SDK v12.2 + - name: Download CUDA SDK v12.8 working-directory: ${{ runner.temp }} run: | - azcopy.exe cp --recursive "https://lotusscus.blob.core.windows.net/models/cuda_sdk/v12.2" . + azcopy.exe cp --recursive "https://lotusscus.blob.core.windows.net/models/cuda_sdk/v12.8" . 
dir shell: pwsh @@ -208,9 +208,9 @@ jobs: shell: powershell run: | Write-Host "Adding CUDA to PATH" - Write-Host "CUDA Path: $env:RUNNER_TEMP\v12.2\bin" - Add-Content -Path $env:GITHUB_PATH -Value "$env:RUNNER_TEMP\v12.2\bin" - Add-Content -Path $env:GITHUB_PATH -Value "$env:RUNNER_TEMP\v12.2\extras\CUPTI\lib64" + Write-Host "CUDA Path: $env:RUNNER_TEMP\v12.8\bin" + Add-Content -Path $env:GITHUB_PATH -Value "$env:RUNNER_TEMP\v12.8\bin" + Add-Content -Path $env:GITHUB_PATH -Value "$env:RUNNER_TEMP\v12.8\extras\CUPTI\lib64" Add-Content -Path $env:GITHUB_PATH -Value "$env:RUNNER_TEMP\TensorRT-10.9.0.34.Windows10.x86_64.cuda-12.8\lib" - name: Set OnnxRuntimeBuildDirectory @@ -237,7 +237,7 @@ jobs: exit $lastExitCode } - python ${{ github.workspace }}\tools\ci_build\build.py --config RelWithDebInfo --parallel --use_binskim_compliant_compile_flags --build_dir build --skip_submodule_sync --build_shared_lib --test --cmake_generator "Visual Studio 17 2022" --build_wheel --enable_onnx_tests --use_tensorrt --tensorrt_home="${{ runner.temp }}\TensorRT-10.9.0.34.Windows10.x86_64.cuda-12.8" --cuda_home="${{ runner.temp }}\v12.2" --use_vcpkg --use_vcpkg_ms_internal_asset_cache --cmake_extra_defines CMAKE_CUDA_ARCHITECTURES=86 + python ${{ github.workspace }}\tools\ci_build\build.py --config RelWithDebInfo --parallel --use_binskim_compliant_compile_flags --build_dir build --skip_submodule_sync --build_shared_lib --test --cmake_generator "Visual Studio 17 2022" --build_wheel --enable_onnx_tests --use_tensorrt --tensorrt_home="${{ runner.temp }}\TensorRT-10.9.0.34.Windows10.x86_64.cuda-12.8" --cuda_home="${{ runner.temp }}\v12.8" --use_vcpkg --use_vcpkg_ms_internal_asset_cache --cmake_extra_defines CMAKE_CUDA_ARCHITECTURES=86 if ($lastExitCode -ne 0) { exit $lastExitCode } diff --git a/.github/workflows/windows_x64_debug_build_x64_debug.yml b/.github/workflows/windows_x64_debug_build_x64_debug.yml index 6165375e7a54a..6a1b43e54ed89 100644 --- a/.github/workflows/windows_x64_debug_build_x64_debug.yml +++ b/.github/workflows/windows_x64_debug_build_x64_debug.yml @@ -13,7 +13,7 @@ concurrency: jobs: build_x64_debug: - runs-on: ["self-hosted", "1ES.Pool=onnxruntime-github-vs2022-mms"] + runs-on: ["self-hosted", "1ES.Pool=onnxruntime-github-vs2022-latest"] timeout-minutes: 300 steps: diff --git a/.github/workflows/windows_x64_release_build_x64_release.yml b/.github/workflows/windows_x64_release_build_x64_release.yml index f9d7b0d9e9e04..0bcd282e8dc50 100644 --- a/.github/workflows/windows_x64_release_build_x64_release.yml +++ b/.github/workflows/windows_x64_release_build_x64_release.yml @@ -13,7 +13,7 @@ concurrency: jobs: build_x64_release: - runs-on: ["self-hosted", "1ES.Pool=onnxruntime-github-vs2022-mms"] + runs-on: ["self-hosted", "1ES.Pool=onnxruntime-github-vs2022-latest"] timeout-minutes: 300 steps: diff --git a/.github/workflows/windows_x64_release_ep_generic_interface_build_x64_release_ep_generic_interface.yml b/.github/workflows/windows_x64_release_ep_generic_interface_build_x64_release_ep_generic_interface.yml index 54c13e1e04b0a..3934047266f59 100644 --- a/.github/workflows/windows_x64_release_ep_generic_interface_build_x64_release_ep_generic_interface.yml +++ b/.github/workflows/windows_x64_release_ep_generic_interface_build_x64_release_ep_generic_interface.yml @@ -13,7 +13,7 @@ concurrency: jobs: build_x64_release_ep_generic_interface: - runs-on: ["self-hosted", "1ES.Pool=onnxruntime-github-vs2022-mms"] + runs-on: ["self-hosted", "1ES.Pool=onnxruntime-github-vs2022-latest"] timeout-minutes: 300 
steps: diff --git a/.github/workflows/windows_x64_release_vitisai_build_x64_release.yml b/.github/workflows/windows_x64_release_vitisai_build_x64_release.yml index 06230962b39be..1c38d8e58970c 100644 --- a/.github/workflows/windows_x64_release_vitisai_build_x64_release.yml +++ b/.github/workflows/windows_x64_release_vitisai_build_x64_release.yml @@ -13,7 +13,7 @@ concurrency: jobs: build_x64_release_vitisai: - runs-on: ["self-hosted", "1ES.Pool=onnxruntime-github-vs2022-mms"] + runs-on: ["self-hosted", "1ES.Pool=onnxruntime-github-vs2022-latest"] timeout-minutes: 300 steps: diff --git a/.github/workflows/windows_x64_release_xnnpack.yml b/.github/workflows/windows_x64_release_xnnpack.yml index 21033ef4cbe3c..6eb9f00d3997d 100644 --- a/.github/workflows/windows_x64_release_xnnpack.yml +++ b/.github/workflows/windows_x64_release_xnnpack.yml @@ -13,7 +13,7 @@ concurrency: jobs: build_x64_release_xnnpack: - runs-on: ["self-hosted", "1ES.Pool=onnxruntime-github-vs2022-mms"] + runs-on: ["self-hosted", "1ES.Pool=onnxruntime-github-vs2022-latest"] timeout-minutes: 300 steps: diff --git a/.github/workflows/windows_x86.yml b/.github/workflows/windows_x86.yml index fa1e9362e2f34..597c1c7f4b6cf 100644 --- a/.github/workflows/windows_x86.yml +++ b/.github/workflows/windows_x86.yml @@ -13,7 +13,7 @@ concurrency: jobs: build_x86_release: - runs-on: ["self-hosted", "1ES.Pool=onnxruntime-github-vs2022-mms"] + runs-on: ["self-hosted", "1ES.Pool=onnxruntime-github-vs2022-latest"] timeout-minutes: 300 steps: diff --git a/cmake/onnxruntime_providers_cuda.cmake b/cmake/onnxruntime_providers_cuda.cmake index 68a3e9014b7b0..1d31eb1fbd207 100644 --- a/cmake/onnxruntime_providers_cuda.cmake +++ b/cmake/onnxruntime_providers_cuda.cmake @@ -182,8 +182,8 @@ # Since CUDA 12.8, compiling diagnostics become stricter if (CMAKE_CUDA_COMPILER_VERSION VERSION_GREATER_EQUAL 12.8) - target_compile_options(${target} PRIVATE "$<$:--relocatable-device-code=true>") - set_target_properties(${target} PROPERTIES CUDA_SEPARABLE_COMPILATION ON) + target_compile_options(${target} PRIVATE "$<$:--static-global-template-stub=false>") + if (MSVC) target_compile_options(${target} PRIVATE "$<$:SHELL:-Xcompiler /wd4505>") endif() diff --git a/onnxruntime/test/contrib_ops/gather_block_quantized_op_test.cc b/onnxruntime/test/contrib_ops/gather_block_quantized_op_test.cc index 574ec49da67ea..3bf37ea193245 100644 --- a/onnxruntime/test/contrib_ops/gather_block_quantized_op_test.cc +++ b/onnxruntime/test/contrib_ops/gather_block_quantized_op_test.cc @@ -82,7 +82,7 @@ void CheckDataAndShape(const std::vector& data, const std::vector& s ORT_ENFORCE(static_cast(data.size()) == total_elements, "Data size does not match the shape", "Data size: ", data.size(), ", Expected size: ", total_elements, - ", Shape: ", VectorToString(shape), " Name:", name, " Type:", typeid(T).name()); + ", Shape: ", VectorToString(shape), " Name:", name); } // Combinations: types, gather_axis, quantize_axis, block_size, indices, scale shape vs data shape diff --git a/onnxruntime/test/providers/cpu/model_tests.cc b/onnxruntime/test/providers/cpu/model_tests.cc index cf49601e6c671..ca1a3104e0bed 100644 --- a/onnxruntime/test/providers/cpu/model_tests.cc +++ b/onnxruntime/test/providers/cpu/model_tests.cc @@ -678,7 +678,14 @@ ::std::vector<::std::basic_string> GetParameterStrings() { ORT_TSTR("fp16_coreml_FNS-Candy"), ORT_TSTR("fp16_test_tiny_yolov2"), ORT_TSTR("fp16_test_shufflenet"), - ORT_TSTR("keras2coreml_SimpleRNN_ImageNet")}; + ORT_TSTR("keras2coreml_SimpleRNN_ImageNet"), + 
// models from model zoo. #26274: cuDNN frontend no valid engine + ORT_TSTR("YOLOv3"), + ORT_TSTR("YOLOv3-12"), + ORT_TSTR("YOLOv4"), + ORT_TSTR("SSD-MobilenetV1"), + ORT_TSTR("SSD-MobilenetV1-12")}; + // For ROCm EP, also disable the following tests due to flakiness, // mainly with precision issue and random memory access fault. static const ORTCHAR_T* rocm_disabled_tests[] = {ORT_TSTR("bvlc_alexnet"), diff --git a/onnxruntime/test/providers/cpu/tensor/cast_op_test.cc b/onnxruntime/test/providers/cpu/tensor/cast_op_test.cc index 8f4c4ff0896ba..289e94397fb39 100644 --- a/onnxruntime/test/providers/cpu/tensor/cast_op_test.cc +++ b/onnxruntime/test/providers/cpu/tensor/cast_op_test.cc @@ -1477,7 +1477,7 @@ template void CastOpTestFloatFloat4(std::vector shape, std::vector float_data, bool is_fp4_input = false) { - size_t num_pairs = float_data.size() / 2; + int num_pairs = static_cast(float_data.size()) / 2; int num_fp4_elements = static_cast((float_data.size() + 1) / 2); bool is_odd_count = (float_data.size() % 2 != 0); diff --git a/tools/ci_build/github/azure-pipelines/build-perf-test-binaries-pipeline.yml b/tools/ci_build/github/azure-pipelines/build-perf-test-binaries-pipeline.yml index 53b62762319ba..e54216fe4ef4e 100644 --- a/tools/ci_build/github/azure-pipelines/build-perf-test-binaries-pipeline.yml +++ b/tools/ci_build/github/azure-pipelines/build-perf-test-binaries-pipeline.yml @@ -31,5 +31,5 @@ stages: machine_pool: 'onnxruntime-Ubuntu2404-AMD-CPU' extra_build_arg: '' cmake_build_type: Release - cuda_version: 12.2 + cuda_version: 12.8 docker_base_image: onnxruntimebuildcache.azurecr.io/internal/azureml/onnxruntime/build/cuda12_x64_almalinux8_gcc12:20250714.2 \ No newline at end of file diff --git a/tools/ci_build/github/azure-pipelines/c-api-noopenmp-packaging-pipelines.yml b/tools/ci_build/github/azure-pipelines/c-api-noopenmp-packaging-pipelines.yml index 91736752e22d4..086d65c93062b 100644 --- a/tools/ci_build/github/azure-pipelines/c-api-noopenmp-packaging-pipelines.yml +++ b/tools/ci_build/github/azure-pipelines/c-api-noopenmp-packaging-pipelines.yml @@ -73,12 +73,12 @@ variables: - name: ReleaseVersionSuffix value: '' - name: win_trt_version - value: 12.2 + value: 12.8 - name: win_trt_home value: $(Agent.TempDirectory)\${{ variables.win_trt_folder_cuda12 }} - name: win_cuda_home - value: $(Agent.TempDirectory)\v12.2 + value: $(Agent.TempDirectory)\v12.8 extends: # The pipeline extends the 1ES PT which will inject different SDL and compliance tasks. # For non-production pipelines, use "Unofficial" as defined below. 
@@ -142,7 +142,7 @@ extends: - template: stages/nuget-combine-cuda-stage.yml parameters: - CudaVersion: 12.2 + CudaVersion: 12.8 RunOnnxRuntimeTests: ${{ parameters.RunOnnxRuntimeTests }} UseIncreasedTimeoutForTests: ${{ parameters.UseIncreasedTimeoutForTests }} win_trt_home: ${{ variables.win_trt_home }} diff --git a/tools/ci_build/github/azure-pipelines/c-api-noopenmp-test-pipelines.yml b/tools/ci_build/github/azure-pipelines/c-api-noopenmp-test-pipelines.yml index 46363c07b3e3e..7e107c33ed8c0 100644 --- a/tools/ci_build/github/azure-pipelines/c-api-noopenmp-test-pipelines.yml +++ b/tools/ci_build/github/azure-pipelines/c-api-noopenmp-test-pipelines.yml @@ -127,7 +127,7 @@ stages: NugetPackageName: 'Microsoft.ML.OnnxRuntime.Gpu' ArtifactSuffix: 'GPU' StageSuffix: 'GPU' - CudaVersion: 12.2 + CudaVersion: 12.8 - template: nuget/templates/test_win.yml parameters: @@ -136,7 +136,7 @@ stages: ArtifactSuffix: 'GPU' StageSuffix: 'GPU' MoreSuffix: '_Windows' - CudaVersion: 12.2 + CudaVersion: 12.8 - template: nuget/templates/test_linux.yml parameters: @@ -144,7 +144,7 @@ stages: ArtifactSuffix: 'GPU' StageSuffix: 'GPU' NugetPackageName: 'Microsoft.ML.OnnxRuntime.Gpu' - CudaVersion: 12.2 + CudaVersion: 12.8 - template: nuget/templates/test_linux.yml parameters: @@ -153,7 +153,7 @@ stages: StageSuffix: 'GPU' MoreSuffix: '_Linux' NugetPackageName: 'Microsoft.ML.OnnxRuntime.Gpu.Linux' - CudaVersion: 12.2 + CudaVersion: 12.8 @@ -202,7 +202,7 @@ stages: - template: templates/jobs/download_win_gpu_library.yml parameters: - CudaVersion: 12.2 + CudaVersion: 12.8 DownloadCUDA: true DownloadTRT: true @@ -257,7 +257,7 @@ stages: - template: templates/jobs/download_win_gpu_library.yml parameters: - CudaVersion: 12.2 + CudaVersion: 12.8 DownloadCUDA: true DownloadTRT: true diff --git a/tools/ci_build/github/azure-pipelines/cuda-packaging-pipeline.yml b/tools/ci_build/github/azure-pipelines/cuda-packaging-pipeline.yml index 5535d7b4f264d..d7fc0efbf45ea 100644 --- a/tools/ci_build/github/azure-pipelines/cuda-packaging-pipeline.yml +++ b/tools/ci_build/github/azure-pipelines/cuda-packaging-pipeline.yml @@ -48,9 +48,9 @@ parameters: - name: CudaVersion displayName: CUDA version type: string - default: '12.2' + default: '12.8' values: - - 12.2 + - 12.8 variables: - template: templates/common-variables.yml @@ -59,13 +59,13 @@ variables: - name: win_trt_home ${{ if eq(parameters.CudaVersion, '11.8') }}: value: $(Agent.TempDirectory)\${{ variables.win_trt_folder_cuda11 }} - ${{ if eq(parameters.CudaVersion, '12.2') }}: + ${{ if eq(parameters.CudaVersion, '12.8') }}: value: $(Agent.TempDirectory)\${{ variables.win_trt_folder_cuda12 }} - name: win_cuda_home ${{ if eq(parameters.CudaVersion, '11.8') }}: value: $(Agent.TempDirectory)\v11.8 - ${{ if eq(parameters.CudaVersion, '12.2') }}: - value: $(Agent.TempDirectory)\v12.2 + ${{ if eq(parameters.CudaVersion, '12.8') }}: + value: $(Agent.TempDirectory)\v12.8 resources: repositories: diff --git a/tools/ci_build/github/azure-pipelines/custom-nuget-packaging-pipeline.yml b/tools/ci_build/github/azure-pipelines/custom-nuget-packaging-pipeline.yml index 1ad6f411d9848..5ce6ec278b1e7 100644 --- a/tools/ci_build/github/azure-pipelines/custom-nuget-packaging-pipeline.yml +++ b/tools/ci_build/github/azure-pipelines/custom-nuget-packaging-pipeline.yml @@ -1,7 +1,7 @@ parameters: - name: CudaVersion type: string - default: '12.2' + default: '12.8' - name: QnnSdk displayName: QNN SDK Version @@ -40,8 +40,8 @@ variables: - name: win_cuda_home ${{ if eq(parameters.CudaVersion, '11.8') 
}}: value: $(Agent.TempDirectory)\v11.8 - ${{ if eq(parameters.CudaVersion, '12.2') }}: - value: $(Agent.TempDirectory)\v12.2 + ${{ if eq(parameters.CudaVersion, '12.8') }}: + value: $(Agent.TempDirectory)\v12.8 resources: repositories: @@ -178,9 +178,6 @@ extends: inputs: targetType: 'inline' script: | - mkdir -p $(Build.BinariesDirectory)/osx-x64 - Move-Item -Path $(Build.BinariesDirectory)/osx/onnxruntime-osx-x86_64* -Destination $(Build.BinariesDirectory)/osx-x64 - mkdir -p $(Build.BinariesDirectory)/osx-arm64 Move-Item -Path $(Build.BinariesDirectory)/osx/onnxruntime-osx-arm64* -Destination $(Build.BinariesDirectory)/osx-arm64 @@ -200,12 +197,6 @@ extends: foreach ($dir in $dirs) { Write-Host "Directory: $($dir.FullName)" } - $osx_x64_archive = Get-ChildItem -Path $(Build.BinariesDirectory)/osx-x64 -Filter onnxruntime-osx-x86_64* - if ($osx_x64_archive.Count -eq 0) { - Write-Host "No osx-x64 archive found." - } else { - Write-Host "osx-x64 archive found: $($osx_x64_archive[0].FullName)" - } $osx_arm64_archive = Get-ChildItem -Path $(Build.BinariesDirectory)/osx-arm64 -Filter onnxruntime-osx-arm64* if ($osx_arm64_archive.Count -eq 0) { Write-Host "No osx-arm64 archive found." @@ -233,13 +224,10 @@ extends: script: | Expand-Archive -Path $(Build.BinariesDirectory)/win-x64/onnxruntime-win-x64-cuda*.zip -DestinationPath $(Build.BinariesDirectory)/win-x64 Expand-Archive -Path $(Build.BinariesDirectory)/win-arm64/onnxruntime-win-arm64x-qnn*.zip -DestinationPath $(Build.BinariesDirectory)/win-arm64 - $osx_x64_archive = (Get-ChildItem -Path $(Build.BinariesDirectory)/osx-x64 -Filter onnxruntime-osx-x86_64*)[0].FullName $osx_arm64_archive = (Get-ChildItem -Path $(Build.BinariesDirectory)/osx-arm64 -Filter onnxruntime-osx-arm64*)[0].FullName - tar -xzf $osx_x64_archive -C $(Build.BinariesDirectory)/osx-x64 2>$null tar -xzf $osx_arm64_archive -C $(Build.BinariesDirectory)/osx-arm64 2>$null $win_x64 = (Get-ChildItem -Path $(Build.BinariesDirectory)/win-x64 -Filter onnxruntime-win-x64-cuda*)[0].FullName $win_arm64 = (Get-ChildItem -Path $(Build.BinariesDirectory)/win-arm64 -Filter onnxruntime-win-arm64x-qnn*)[0].FullName - $osx_x64 = (Get-ChildItem -Path $(Build.BinariesDirectory)/osx-x64 -Filter onnxruntime-osx-x86_64*)[0].FullName $osx_arm64 = (Get-ChildItem -Path $(Build.BinariesDirectory)/osx-arm64 -Filter onnxruntime-osx-arm64*)[0].FullName Write-Host "##vso[task.setvariable variable=win_x64;]$win_x64" Write-Host "##vso[task.setvariable variable=win_arm64;]$win_arm64" diff --git a/tools/ci_build/github/azure-pipelines/jar_package_testing.yml b/tools/ci_build/github/azure-pipelines/jar_package_testing.yml index d387c07d6dc6e..463c02203e21a 100644 --- a/tools/ci_build/github/azure-pipelines/jar_package_testing.yml +++ b/tools/ci_build/github/azure-pipelines/jar_package_testing.yml @@ -40,7 +40,7 @@ stages: - template: templates/jobs/download_win_gpu_library.yml parameters: - CudaVersion: 12.2 + CudaVersion: 12.8 DownloadCUDA: true DownloadTRT: true @@ -105,7 +105,7 @@ stages: - name: runCodesignValidationInjection value: false - name: docker_base_image - value: onnxruntimebuildcache.azurecr.io/internal/azureml/onnxruntime/build/cuda12_x64_almalinux8_gcc12:20250724.1 + value: onnxruntimebuildcache.azurecr.io/internal/azureml/onnxruntime/build/cuda12_x64_almalinux8_gcc12:20251008.2 timeoutInMinutes: 60 steps: - checkout: self diff --git a/tools/ci_build/github/azure-pipelines/linux-gpu-tensorrt-cuda-minimal-ci-pipeline.yml 
b/tools/ci_build/github/azure-pipelines/linux-gpu-tensorrt-cuda-minimal-ci-pipeline.yml index 0410001d77d13..5e6671e3797ce 100644 --- a/tools/ci_build/github/azure-pipelines/linux-gpu-tensorrt-cuda-minimal-ci-pipeline.yml +++ b/tools/ci_build/github/azure-pipelines/linux-gpu-tensorrt-cuda-minimal-ci-pipeline.yml @@ -31,21 +31,21 @@ parameters: - name: CudaVersion displayName: CUDA version type: string - default: '12.2' + default: '12.8' values: - - 12.2 + - 12.8 variables: - template: templates/common-variables.yml - name: docker_base_image ${{ if eq(parameters.CudaVersion, '11.8') }}: - value: onnxruntimebuildcache.azurecr.io/internal/azureml/onnxruntime/build/cuda11_x64_almalinux8_gcc11:20250724.1 - ${{ if eq(parameters.CudaVersion, '12.2') }}: - value: onnxruntimebuildcache.azurecr.io/internal/azureml/onnxruntime/build/cuda12_x64_almalinux8_gcc12:20250724.1 + value: onnxruntimebuildcache.azurecr.io/internal/azureml/onnxruntime/build/cuda11_x64_almalinux8_gcc11:20251008.2 + ${{ if eq(parameters.CudaVersion, '12.8') }}: + value: onnxruntimebuildcache.azurecr.io/internal/azureml/onnxruntime/build/cuda12_x64_almalinux8_gcc12:20251008.2 - name: linux_trt_version ${{ if eq(parameters.CudaVersion, '11.8') }}: value: ${{ variables.linux_trt_version_cuda11 }} - ${{ if eq(parameters.CudaVersion, '12.2') }}: + ${{ if eq(parameters.CudaVersion, '12.8') }}: value: ${{ variables.linux_trt_version_cuda12 }} jobs: diff --git a/tools/ci_build/github/azure-pipelines/nuget/templates/test_linux.yml b/tools/ci_build/github/azure-pipelines/nuget/templates/test_linux.yml index 89ce3f3c86727..b60ef7576184e 100644 --- a/tools/ci_build/github/azure-pipelines/nuget/templates/test_linux.yml +++ b/tools/ci_build/github/azure-pipelines/nuget/templates/test_linux.yml @@ -58,9 +58,9 @@ stages: parameters: Dockerfile: tools/ci_build/github/linux/docker/Dockerfile.package_ubuntu_2004_gpu Context: tools/ci_build/github/linux/docker/ - ${{ if eq(parameters.CudaVersion, '12.2') }}: + ${{ if eq(parameters.CudaVersion, '12.8') }}: DockerBuildArgs: " - --build-arg BASEIMAGE=nvidia/cuda:12.2.2-devel-ubuntu20.04 + --build-arg BASEIMAGE=nvidia/cuda:12.8.1-cudnn-devel-ubuntu20.04 --build-arg TRT_VERSION=${{ replace(variables.linux_trt_version_cuda12, '-1.', '-1+') }} --build-arg BUILD_UID=$( id -u ) " @@ -107,4 +107,4 @@ stages: DisableContribOps: $(DisableContribOps) DisableMlOps: $(DisableMlOps) IsReleaseBuild: $(IsReleaseBuild) - PACKAGENAME: ${{ parameters.NugetPackageName }} \ No newline at end of file + PACKAGENAME: ${{ parameters.NugetPackageName }} diff --git a/tools/ci_build/github/azure-pipelines/post-merge-jobs.yml b/tools/ci_build/github/azure-pipelines/post-merge-jobs.yml index deb8b84bf19b8..fdfafd4d9a179 100644 --- a/tools/ci_build/github/azure-pipelines/post-merge-jobs.yml +++ b/tools/ci_build/github/azure-pipelines/post-merge-jobs.yml @@ -2,16 +2,16 @@ parameters: - name: CudaVersion displayName: CUDA version type: string - default: '12.2' + default: '12.8' values: - - 12.2 + - 12.8 variables: - template: templates/common-variables.yml - name: win_trt_folder ${{ if eq(parameters.CudaVersion, '11.8') }}: value: ${{ variables.win_trt_folder_cuda11 }} - ${{ if eq(parameters.CudaVersion, '12.2') }}: + ${{ if eq(parameters.CudaVersion, '12.8') }}: value: ${{ variables.win_trt_folder_cuda12 }} stages: diff --git a/tools/ci_build/github/azure-pipelines/py-cuda-package-test-pipeline.yml b/tools/ci_build/github/azure-pipelines/py-cuda-package-test-pipeline.yml index c2c89686a077e..02b6a6df76611 100644 --- 
a/tools/ci_build/github/azure-pipelines/py-cuda-package-test-pipeline.yml +++ b/tools/ci_build/github/azure-pipelines/py-cuda-package-test-pipeline.yml @@ -18,8 +18,8 @@ stages: machine_pool: 'Onnxruntime-Linux-GPU' python_wheel_suffix: '_gpu' timeout: 480 - docker_base_image: onnxruntimebuildcache.azurecr.io/internal/azureml/onnxruntime/build/cuda12_x64_almalinux8_gcc12:20250724.1 - cuda_version: '12.2' + docker_base_image: onnxruntimebuildcache.azurecr.io/internal/azureml/onnxruntime/build/cuda12_x64_almalinux8_gcc12:20251008.2 + cuda_version: '12.8' - stage: Republish_Wheels dependsOn: diff --git a/tools/ci_build/github/azure-pipelines/py-cuda-packaging-pipeline.yml b/tools/ci_build/github/azure-pipelines/py-cuda-packaging-pipeline.yml index 4c536bad45368..290af4a3e4449 100644 --- a/tools/ci_build/github/azure-pipelines/py-cuda-packaging-pipeline.yml +++ b/tools/ci_build/github/azure-pipelines/py-cuda-packaging-pipeline.yml @@ -49,4 +49,4 @@ extends: - template: stages/py-gpu-packaging-stage.yml parameters: cmake_build_type: ${{ parameters.cmake_build_type }} - cuda_version: '12.2' + cuda_version: '12.8' diff --git a/tools/ci_build/github/azure-pipelines/stages/jobs/py-linux-cuda-package-test-job.yml b/tools/ci_build/github/azure-pipelines/stages/jobs/py-linux-cuda-package-test-job.yml index 858de4d173484..b53aee639372d 100644 --- a/tools/ci_build/github/azure-pipelines/stages/jobs/py-linux-cuda-package-test-job.yml +++ b/tools/ci_build/github/azure-pipelines/stages/jobs/py-linux-cuda-package-test-job.yml @@ -2,9 +2,9 @@ parameters: - name: CudaVersion displayName: 'CUDA version' type: string - default: '12.2' + default: '12.8' values: - - 12.2 + - 12.8 - name: machine_pool type: string @@ -44,13 +44,13 @@ jobs: - template: ../../templates/common-variables.yml - name: docker_base_image ${{ if eq(parameters.CudaVersion, '11.8') }}: - value: onnxruntimebuildcache.azurecr.io/internal/azureml/onnxruntime/build/cuda11_x64_almalinux8_gcc11:20250724.1 - ${{ if eq(parameters.CudaVersion, '12.2') }}: - value: onnxruntimebuildcache.azurecr.io/internal/azureml/onnxruntime/build/cuda12_x64_almalinux8_gcc12:20250724.1 + value: onnxruntimebuildcache.azurecr.io/internal/azureml/onnxruntime/build/cuda11_x64_almalinux8_gcc11:20251008.2 + ${{ if eq(parameters.CudaVersion, '12.8') }}: + value: onnxruntimebuildcache.azurecr.io/internal/azureml/onnxruntime/build/cuda12_x64_almalinux8_gcc12:20251008.2 - name: linux_trt_version ${{ if eq(parameters.CudaVersion, '11.8') }}: value: ${{ variables.linux_trt_version_cuda11 }} - ${{ if eq(parameters.CudaVersion, '12.2') }}: + ${{ if eq(parameters.CudaVersion, '12.8') }}: value: ${{ variables.linux_trt_version_cuda12 }} pool: ${{ parameters.machine_pool }} steps: @@ -105,4 +105,4 @@ jobs: inputs: targetType: filePath filePath: tools/ci_build/github/linux/run_python_dockertest.sh - arguments: -d GPU -c ${{parameters.cmake_build_type}} -i onnxruntimecuda${{ replace(parameters.CudaVersion, '.', '') }}xtrt86buildx86_64 -u 12.2 + arguments: -d GPU -c ${{parameters.cmake_build_type}} -i onnxruntimecuda${{ replace(parameters.CudaVersion, '.', '') }}xtrt86buildx86_64 -u 12.8 diff --git a/tools/ci_build/github/azure-pipelines/stages/nodejs-linux-packaging-stage.yml b/tools/ci_build/github/azure-pipelines/stages/nodejs-linux-packaging-stage.yml index bca95a4a2fd02..8cbb81ba89c12 100644 --- a/tools/ci_build/github/azure-pipelines/stages/nodejs-linux-packaging-stage.yml +++ b/tools/ci_build/github/azure-pipelines/stages/nodejs-linux-packaging-stage.yml @@ -1,7 +1,7 @@ 
parameters: - name: CudaVersion type: string - default: '12.2' + default: '12.8' stages: - stage: Linux_Nodejs_Packaging_x64 @@ -20,14 +20,14 @@ stages: - name: CUDA_VERSION_MAJOR ${{ if eq(parameters.CudaVersion, '11.8') }}: value: '11' - ${{ if eq(parameters.CudaVersion, '12.2') }}: + ${{ if eq(parameters.CudaVersion, '12.8') }}: value: '12' - name: CUDA_VERSION value: ${{ parameters.CudaVersion }} - name: linux_trt_version ${{ if eq(parameters.CudaVersion, '11.8') }}: value: ${{ variables.linux_trt_version_cuda11 }} - ${{ if eq(parameters.CudaVersion, '12.2') }}: + ${{ if eq(parameters.CudaVersion, '12.8') }}: value: ${{ variables.linux_trt_version_cuda12 }} steps: - checkout: self diff --git a/tools/ci_build/github/azure-pipelines/stages/nuget-linux-cuda-packaging-stage.yml b/tools/ci_build/github/azure-pipelines/stages/nuget-linux-cuda-packaging-stage.yml index 121e80fca1021..b1e5f541b90e0 100644 --- a/tools/ci_build/github/azure-pipelines/stages/nuget-linux-cuda-packaging-stage.yml +++ b/tools/ci_build/github/azure-pipelines/stages/nuget-linux-cuda-packaging-stage.yml @@ -1,7 +1,7 @@ parameters: - name: CudaVersion type: string - default: '12.2' + default: '12.8' - name: buildJava type: boolean - name: buildNodejs @@ -22,7 +22,7 @@ stages: - name: CUDA_VERSION_MAJOR ${{ if eq(parameters.CudaVersion, '11.8') }}: value: '11' - ${{ if eq(parameters.CudaVersion, '12.2') }}: + ${{ if eq(parameters.CudaVersion, '12.8') }}: value: '12' - name: CUDA_VERSION value: ${{ parameters.CudaVersion }} @@ -74,14 +74,14 @@ stages: - name: CUDA_VERSION_MAJOR ${{ if eq(parameters.CudaVersion, '11.8') }}: value: '11' - ${{ if eq(parameters.CudaVersion, '12.2') }}: + ${{ if eq(parameters.CudaVersion, '12.8') }}: value: '12' - name: CUDA_VERSION value: ${{ parameters.CudaVersion }} - name: linux_trt_version ${{ if eq(parameters.CudaVersion, '11.8') }}: value: ${{ variables.linux_trt_version_cuda11 }} - ${{ if eq(parameters.CudaVersion, '12.2') }}: + ${{ if eq(parameters.CudaVersion, '12.8') }}: value: ${{ variables.linux_trt_version_cuda12 }} steps: - checkout: self @@ -140,12 +140,12 @@ stages: - name: CUDA_VERSION_MAJOR ${{ if eq(parameters.CudaVersion, '11.8') }}: value: '11' - ${{ if eq(parameters.CudaVersion, '12.2') }}: + ${{ if eq(parameters.CudaVersion, '12.8') }}: value: '12' - name: linux_trt_version ${{ if eq(parameters.CudaVersion, '11.8') }}: value: ${{ variables.linux_trt_version_cuda11 }} - ${{ if eq(parameters.CudaVersion, '12.2') }}: + ${{ if eq(parameters.CudaVersion, '12.8') }}: value: ${{ variables.linux_trt_version_cuda12 }} steps: - checkout: self # due to checkout multiple repos, the root directory is $(Build.SourcesDirectory)/onnxruntime diff --git a/tools/ci_build/github/azure-pipelines/stages/py-gpu-packaging-stage.yml b/tools/ci_build/github/azure-pipelines/stages/py-gpu-packaging-stage.yml index d3d4b8f5b64d5..3c5cf591039e0 100644 --- a/tools/ci_build/github/azure-pipelines/stages/py-gpu-packaging-stage.yml +++ b/tools/ci_build/github/azure-pipelines/stages/py-gpu-packaging-stage.yml @@ -19,9 +19,9 @@ parameters: - name: cuda_version type: string displayName: 'CUDA version. Windows Only.' 
- default: '12.2' + default: '12.8' values: - - 12.2 + - 12.8 - name: PythonVersions type: object @@ -48,4 +48,4 @@ stages: extra_build_arg: ${{ parameters.build_py_parameters }} cmake_build_type: ${{ parameters.cmake_build_type }} cuda_version: ${{ parameters.cuda_version }} - docker_base_image: onnxruntimebuildcache.azurecr.io/internal/azureml/onnxruntime/build/cuda12_x64_almalinux8_gcc12:20250724.1 + docker_base_image: onnxruntimebuildcache.azurecr.io/internal/azureml/onnxruntime/build/cuda12_x64_almalinux8_gcc12:20251008.2 diff --git a/tools/ci_build/github/azure-pipelines/stages/py-linux-gpu-stage.yml b/tools/ci_build/github/azure-pipelines/stages/py-linux-gpu-stage.yml index 715470eb9f012..ab1fb919af413 100644 --- a/tools/ci_build/github/azure-pipelines/stages/py-linux-gpu-stage.yml +++ b/tools/ci_build/github/azure-pipelines/stages/py-linux-gpu-stage.yml @@ -22,9 +22,9 @@ parameters: - name: cuda_version type: string - default: '12.2' + default: '12.8' values: - - 12.2 + - 12.8 stages: - stage: Linux_py_GPU_Wheels_${{ parameters.arch }} @@ -55,7 +55,7 @@ stages: - name: trt_version ${{ if eq(parameters.cuda_version, '11.8') }}: value: ${{ variables.linux_trt_version_cuda11 }} - ${{ if eq(parameters.cuda_version, '12.2') }}: + ${{ if eq(parameters.cuda_version, '12.8') }}: value: ${{ variables.linux_trt_version_cuda12 }} steps: - checkout: self diff --git a/tools/ci_build/github/azure-pipelines/stages/py-win-gpu-stage.yml b/tools/ci_build/github/azure-pipelines/stages/py-win-gpu-stage.yml index e2683c04f21f2..c3957fc8341de 100644 --- a/tools/ci_build/github/azure-pipelines/stages/py-win-gpu-stage.yml +++ b/tools/ci_build/github/azure-pipelines/stages/py-win-gpu-stage.yml @@ -20,9 +20,9 @@ parameters: default: '' - name: CudaVersion type: string - default: '12.2' + default: '12.8' values: - - 12.2 + - 12.8 - name: cmake_build_type type: string @@ -47,7 +47,7 @@ stages: workspace: clean: all pool: - name: onnxruntime-Win-CPU-2022 + name: onnxruntime-Win-CPU-VS2022-Latest os: windows templateContext: sdl: @@ -76,7 +76,7 @@ stages: - name: win_trt_folder ${{ if eq(parameters.CudaVersion, '11.8') }}: value: ${{ variables.win_trt_folder_cuda11 }} - ${{ if eq(parameters.CudaVersion, '12.2') }}: + ${{ if eq(parameters.CudaVersion, '12.8') }}: value: ${{ variables.win_trt_folder_cuda12 }} - name: trt_build_flag ${{ if eq(parameters.use_tensorrt, true) }}: @@ -119,7 +119,7 @@ stages: --cmake_generator "$(VSGenerator)" --enable_pybind --enable_onnx_tests - --parallel 8 --use_vcpkg --use_vcpkg_ms_internal_asset_cache --use_binskim_compliant_compile_flags --update --build --msvc_toolset 14.40 + --parallel 8 --use_vcpkg --use_vcpkg_ms_internal_asset_cache --use_binskim_compliant_compile_flags --update --build $(TelemetryOption) ${{ parameters.BUILD_PY_PARAMETERS }} ${{ parameters.EP_BUILD_FLAGS }} ${{ variables.trt_build_flag }} workingDirectory: '$(Build.BinariesDirectory)' diff --git a/tools/ci_build/github/azure-pipelines/templates/jobs/download_win_gpu_library.yml b/tools/ci_build/github/azure-pipelines/templates/jobs/download_win_gpu_library.yml index 681138a5ab3d1..be213337091e8 100644 --- a/tools/ci_build/github/azure-pipelines/templates/jobs/download_win_gpu_library.yml +++ b/tools/ci_build/github/azure-pipelines/templates/jobs/download_win_gpu_library.yml @@ -7,10 +7,10 @@ parameters: default: false - name: CudaVersion type: string - default: '12.2' + default: '12.8' values: - 11.8 - - 12.2 + - 12.8 - name: TrtVersion type: string default: '10.9.0.34' @@ -46,11 +46,11 @@ steps: - 
powershell: | Write-Host "##vso[task.setvariable variable=trtCudaVersion;]11.8" displayName: Set trtCudaVersion - - ${{ if and(eq(parameters.CudaVersion, '12.2'), eq(parameters.TrtVersion, '8.6.1.6')) }}: + - ${{ if and(eq(parameters.CudaVersion, '12.8'), eq(parameters.TrtVersion, '8.6.1.6')) }}: - powershell: | Write-Host "##vso[task.setvariable variable=trtCudaVersion;]12.0" displayName: Set trtCudaVersion - - ${{ if and(eq(parameters.CudaVersion, '12.2'), eq(parameters.TrtVersion, '10.9.0.34')) }}: + - ${{ if and(eq(parameters.CudaVersion, '12.8'), eq(parameters.TrtVersion, '10.9.0.34')) }}: - powershell: | Write-Host "##vso[task.setvariable variable=trtCudaVersion;]12.8" displayName: Set trtCudaVersion diff --git a/tools/ci_build/github/azure-pipelines/templates/jobs/set-winenv.yml b/tools/ci_build/github/azure-pipelines/templates/jobs/set-winenv.yml index 96436883fb8b8..d7c940cda30f4 100644 --- a/tools/ci_build/github/azure-pipelines/templates/jobs/set-winenv.yml +++ b/tools/ci_build/github/azure-pipelines/templates/jobs/set-winenv.yml @@ -9,7 +9,7 @@ parameters: default: false - name: PrimaryCUDAVersion type: string - default: '12.2' + default: '12.8' # - name: SecondaryCUDAVersion # type: string # default: '11.8' diff --git a/tools/ci_build/github/azure-pipelines/templates/py-packaging-linux-test-cuda.yml b/tools/ci_build/github/azure-pipelines/templates/py-packaging-linux-test-cuda.yml index 1415586521f30..263f73a9e29b0 100644 --- a/tools/ci_build/github/azure-pipelines/templates/py-packaging-linux-test-cuda.yml +++ b/tools/ci_build/github/azure-pipelines/templates/py-packaging-linux-test-cuda.yml @@ -18,9 +18,9 @@ parameters: - name: cuda_version type: string - default: '12.2' + default: '12.8' values: - - 12.2 + - 12.8 # TODO: Ideally it should fetch information from the build that triggers it - name: cmake_build_type @@ -46,7 +46,7 @@ jobs: - name: trt_version ${{ if eq(parameters.cuda_version, '11.8') }}: value: ${{ variables.linux_trt_version_cuda11 }} - ${{ if eq(parameters.cuda_version, '12.2') }}: + ${{ if eq(parameters.cuda_version, '12.8') }}: value: ${{ variables.linux_trt_version_cuda12 }} workspace: clean: all diff --git a/tools/ci_build/github/azure-pipelines/templates/win-ci.yml b/tools/ci_build/github/azure-pipelines/templates/win-ci.yml index 0310735d94b2e..ca698123a04e7 100644 --- a/tools/ci_build/github/azure-pipelines/templates/win-ci.yml +++ b/tools/ci_build/github/azure-pipelines/templates/win-ci.yml @@ -78,7 +78,7 @@ parameters: default: '11.8' values: - 11.8 - - 12.2 + - 12.8 - name: SpecificArtifact displayName: Use Specific Artifact @@ -136,7 +136,7 @@ stages: ${{ if contains(parameters.ort_build_pool_name, 'GPU') }}: pool: - name: onnxruntime-Win-CPU-2022 + name: onnxruntime-Win-CPU-VS2022-Latest os: windows ${{ else }}: pool: diff --git a/tools/ci_build/github/azure-pipelines/win-gpu-doc-gen-ci-pipeline.yml b/tools/ci_build/github/azure-pipelines/win-gpu-doc-gen-ci-pipeline.yml index c20f4a2c1bd19..8b320b0ceb4ac 100644 --- a/tools/ci_build/github/azure-pipelines/win-gpu-doc-gen-ci-pipeline.yml +++ b/tools/ci_build/github/azure-pipelines/win-gpu-doc-gen-ci-pipeline.yml @@ -32,10 +32,10 @@ parameters: - name: CudaVersion displayName: CUDA version type: string - default: '12.2' + default: '12.8' values: - 11.8 - - 12.2 + - 12.8 stages: - stage: kernelDocumentation diff --git a/tools/ci_build/github/azure-pipelines/win-gpu-tensorrt-cuda-minimal-ci-pipeline.yml b/tools/ci_build/github/azure-pipelines/win-gpu-tensorrt-cuda-minimal-ci-pipeline.yml index 
c12bb3552920c..08953749f6527 100644 --- a/tools/ci_build/github/azure-pipelines/win-gpu-tensorrt-cuda-minimal-ci-pipeline.yml +++ b/tools/ci_build/github/azure-pipelines/win-gpu-tensorrt-cuda-minimal-ci-pipeline.yml @@ -31,16 +31,16 @@ parameters: - name: CudaVersion displayName: CUDA version type: string - default: '12.2' + default: '12.8' values: - - 12.2 + - 12.8 variables: - template: templates/common-variables.yml - name: win_trt_folder ${{ if eq(parameters.CudaVersion, '11.8') }}: value: ${{ variables.win_trt_folder_cuda11 }} - ${{ if eq(parameters.CudaVersion, '12.2') }}: + ${{ if eq(parameters.CudaVersion, '12.8') }}: value: ${{ variables.win_trt_folder_cuda12 }} jobs: diff --git a/tools/ci_build/github/linux/docker/Dockerfile.manylinux2_28_cpu b/tools/ci_build/github/linux/docker/Dockerfile.manylinux2_28_cpu index 2a65e7c26b20b..a277286866e41 100644 --- a/tools/ci_build/github/linux/docker/Dockerfile.manylinux2_28_cpu +++ b/tools/ci_build/github/linux/docker/Dockerfile.manylinux2_28_cpu @@ -1,4 +1,4 @@ -ARG BASEIMAGE=onnxruntimebuildcache.azurecr.io/internal/azureml/onnxruntime/build/cpu_x64_almalinux8_gcc14:20250724.1 +ARG BASEIMAGE=onnxruntimebuildcache.azurecr.io/internal/azureml/onnxruntime/build/cpu_x64_almalinux8_gcc14:20251008.2 FROM $BASEIMAGE ENV JAVA_HOME=/usr/lib/jvm/msopenjdk-17 diff --git a/tools/ci_build/github/linux/docker/Dockerfile.manylinux2_28_rocm b/tools/ci_build/github/linux/docker/Dockerfile.manylinux2_28_rocm index 3337af3be6074..5410bd64036ce 100644 --- a/tools/ci_build/github/linux/docker/Dockerfile.manylinux2_28_rocm +++ b/tools/ci_build/github/linux/docker/Dockerfile.manylinux2_28_rocm @@ -1,4 +1,4 @@ -ARG BASEIMAGE=onnxruntimebuildcache.azurecr.io/internal/azureml/onnxruntime/build/cpu_x64_almalinux8_gcc14:20250724.1 +ARG BASEIMAGE=onnxruntimebuildcache.azurecr.io/internal/azureml/onnxruntime/build/cpu_x64_almalinux8_gcc14:20251008.2 FROM $BASEIMAGE ARG ROCM_VERSION=6.2.3 diff --git a/tools/ci_build/github/linux/docker/Dockerfile.manylinux2_28_webgpu b/tools/ci_build/github/linux/docker/Dockerfile.manylinux2_28_webgpu index 0007a4e06f7c0..07ad8e933baf0 100644 --- a/tools/ci_build/github/linux/docker/Dockerfile.manylinux2_28_webgpu +++ b/tools/ci_build/github/linux/docker/Dockerfile.manylinux2_28_webgpu @@ -1,4 +1,4 @@ -ARG BASEIMAGE=onnxruntimebuildcache.azurecr.io/internal/azureml/onnxruntime/build/cpu_x64_almalinux8_gcc14:20250724.1 +ARG BASEIMAGE=onnxruntimebuildcache.azurecr.io/internal/azureml/onnxruntime/build/cpu_x64_almalinux8_gcc14:20251008.2 FROM $BASEIMAGE ENV JAVA_HOME=/usr/lib/jvm/msopenjdk-17 diff --git a/tools/ci_build/github/linux/docker/Dockerfile.package_ubuntu_2004_gpu b/tools/ci_build/github/linux/docker/Dockerfile.package_ubuntu_2004_gpu index 8a84b9b940306..5d98c25b535af 100644 --- a/tools/ci_build/github/linux/docker/Dockerfile.package_ubuntu_2004_gpu +++ b/tools/ci_build/github/linux/docker/Dockerfile.package_ubuntu_2004_gpu @@ -5,7 +5,7 @@ # Dockerfile to run ONNXRuntime with TensorRT integration # Build base image with required system packages -ARG BASEIMAGE=nvidia/cuda:12.2.2-cudnn8-devel-ubuntu20.04 +ARG BASEIMAGE=nvidia/cuda:12.8.1-cudnn-devel-ubuntu20.04 ARG TRT_VERSION=10.9.0.34-1+cuda12.8 ARG LD_LIBRARY_PATH_ARG=/usr/local/lib64:/usr/local/cuda/lib64 FROM $BASEIMAGE AS base diff --git a/tools/ci_build/github/linux/docker/inference/aarch64/default/cpu/Dockerfile b/tools/ci_build/github/linux/docker/inference/aarch64/default/cpu/Dockerfile index 8b2083c2ccfc1..cef2d11780969 100644 --- 
a/tools/ci_build/github/linux/docker/inference/aarch64/default/cpu/Dockerfile +++ b/tools/ci_build/github/linux/docker/inference/aarch64/default/cpu/Dockerfile @@ -2,7 +2,7 @@ # Licensed under the MIT License. # This file is used by Zip-Nuget Packaging NoContribOps Pipeline,Zip-Nuget-Java Packaging Pipeline -ARG BASEIMAGE=onnxruntimebuildcache.azurecr.io/internal/azureml/onnxruntime/build/cpu_aarch64_almalinux8_gcc14_dotnet:20250724.1 +ARG BASEIMAGE=onnxruntimebuildcache.azurecr.io/internal/azureml/onnxruntime/build/cpu_aarch64_almalinux8_gcc14_dotnet:20251008.2 FROM $BASEIMAGE ENV LANG=en_US.UTF-8 diff --git a/tools/ci_build/github/linux/docker/inference/aarch64/python/cpu/Dockerfile b/tools/ci_build/github/linux/docker/inference/aarch64/python/cpu/Dockerfile index f5143d5ac9ab9..79d99d08dcc4e 100644 --- a/tools/ci_build/github/linux/docker/inference/aarch64/python/cpu/Dockerfile +++ b/tools/ci_build/github/linux/docker/inference/aarch64/python/cpu/Dockerfile @@ -1,4 +1,4 @@ -ARG BASEIMAGE=onnxruntimebuildcache.azurecr.io/internal/azureml/onnxruntime/build/cpu_aarch64_almalinux8_gcc14:20250724.1 +ARG BASEIMAGE=onnxruntimebuildcache.azurecr.io/internal/azureml/onnxruntime/build/cpu_aarch64_almalinux8_gcc14:20251008.2 FROM $BASEIMAGE ADD scripts /tmp/scripts diff --git a/tools/ci_build/github/linux/docker/inference/x86_64/default/cpu/Dockerfile b/tools/ci_build/github/linux/docker/inference/x86_64/default/cpu/Dockerfile index cfc2ce7079148..72d98206f9205 100644 --- a/tools/ci_build/github/linux/docker/inference/x86_64/default/cpu/Dockerfile +++ b/tools/ci_build/github/linux/docker/inference/x86_64/default/cpu/Dockerfile @@ -2,7 +2,7 @@ # Licensed under the MIT License. # This file is used by Zip-Nuget Packaging NoContribOps Pipeline,Zip-Nuget-Java Packaging Pipeline -ARG BASEIMAGE=onnxruntimebuildcache.azurecr.io/internal/azureml/onnxruntime/build/cpu_x64_almalinux8_gcc14_dotnet:20250724.1 +ARG BASEIMAGE=onnxruntimebuildcache.azurecr.io/internal/azureml/onnxruntime/build/cpu_x64_almalinux8_gcc14_dotnet:20251008.2 FROM $BASEIMAGE ENV LANG=en_US.UTF-8 diff --git a/tools/ci_build/github/linux/docker/inference/x86_64/default/cuda12/Dockerfile b/tools/ci_build/github/linux/docker/inference/x86_64/default/cuda12/Dockerfile index 8401393a661b1..85f4a074e30bf 100644 --- a/tools/ci_build/github/linux/docker/inference/x86_64/default/cuda12/Dockerfile +++ b/tools/ci_build/github/linux/docker/inference/x86_64/default/cuda12/Dockerfile @@ -2,7 +2,7 @@ # Licensed under the MIT License. 
# This file is used by Zip-Nuget Packaging NoContribOps Pipeline,Zip-Nuget-Java Packaging Pipeline -ARG BASEIMAGE=onnxruntimebuildcache.azurecr.io/internal/azureml/onnxruntime/build/cuda12_x64_almalinux8_gcc12_dotnet:20250724.1 +ARG BASEIMAGE=onnxruntimebuildcache.azurecr.io/internal/azureml/onnxruntime/build/cuda12_x64_almalinux8_gcc12_dotnet:20251008.2 FROM $BASEIMAGE ARG TRT_VERSION diff --git a/tools/ci_build/github/linux/docker/inference/x86_64/python/cpu/Dockerfile b/tools/ci_build/github/linux/docker/inference/x86_64/python/cpu/Dockerfile index b923febc1227f..81ba47f397f91 100644 --- a/tools/ci_build/github/linux/docker/inference/x86_64/python/cpu/Dockerfile +++ b/tools/ci_build/github/linux/docker/inference/x86_64/python/cpu/Dockerfile @@ -1,4 +1,4 @@ -ARG BASEIMAGE=onnxruntimebuildcache.azurecr.io/internal/azureml/onnxruntime/build/cpu_x64_almalinux8_gcc14:20250724.1 +ARG BASEIMAGE=onnxruntimebuildcache.azurecr.io/internal/azureml/onnxruntime/build/cpu_x64_almalinux8_gcc14:20251008.2 FROM $BASEIMAGE ADD scripts /tmp/scripts diff --git a/tools/ci_build/github/linux/docker/inference/x86_64/python/openvino/Dockerfile b/tools/ci_build/github/linux/docker/inference/x86_64/python/openvino/Dockerfile index f3341f32a768d..5ad1023bfb5b2 100644 --- a/tools/ci_build/github/linux/docker/inference/x86_64/python/openvino/Dockerfile +++ b/tools/ci_build/github/linux/docker/inference/x86_64/python/openvino/Dockerfile @@ -1,5 +1,5 @@ # Use the specified UBI8 base image with GCC 14 -ARG BASEIMAGE="onnxruntimebuildcache.azurecr.io/internal/azureml/onnxruntime/build/cpu_x64_almalinux8_gcc14:20250724.1" +ARG BASEIMAGE="onnxruntimebuildcache.azurecr.io/internal/azureml/onnxruntime/build/cpu_x64_almalinux8_gcc14:20251008.2" FROM ${BASEIMAGE} ARG BUILD_UID=1000 diff --git a/tools/ci_build/github/windows/setup_env_cuda.bat b/tools/ci_build/github/windows/setup_env_cuda.bat index f93938e2a9009..f095f58f9920e 100644 --- a/tools/ci_build/github/windows/setup_env_cuda.bat +++ b/tools/ci_build/github/windows/setup_env_cuda.bat @@ -1,13 +1,13 @@ REM Copyright (c) Microsoft Corporation. All rights reserved. REM Licensed under the MIT License. -if exist PATH=%AGENT_TEMPDIRECTORY%\v12.2\ ( -set PATH=%AGENT_TEMPDIRECTORY%\v12.2\bin;%AGENT_TEMPDIRECTORY%\v12.2\extras\CUPTI\lib64;%PATH% +if exist PATH=%AGENT_TEMPDIRECTORY%\v12.8\ ( + set PATH=%AGENT_TEMPDIRECTORY%\v12.8\bin;%AGENT_TEMPDIRECTORY%\v12.8\extras\CUPTI\lib64;%PATH% ) else ( - set PATH=C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v12.2\bin;C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v12.2\extras\CUPTI\lib64;%PATH% + set PATH=C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v12.8\bin;C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v12.8\extras\CUPTI\lib64;%PATH% ) -@REM The default version is still cuda v12.2, because set cuda v11.8 after it +@REM The default version is still cuda v12.8, because set cuda v11.8 after it if exist PATH=%AGENT_TEMPDIRECTORY%\v11.8\ ( set PATH=%PATH%;%AGENT_TEMPDIRECTORY%\v11.8\bin;%AGENT_TEMPDIRECTORY%\v11.8\extras\CUPTI\lib64 ) else ( diff --git a/tools/ci_build/github/windows/setup_env_gpu.bat b/tools/ci_build/github/windows/setup_env_gpu.bat index ecadab5d3f8a3..115a19b6f3a01 100644 --- a/tools/ci_build/github/windows/setup_env_gpu.bat +++ b/tools/ci_build/github/windows/setup_env_gpu.bat @@ -1,14 +1,14 @@ REM Copyright (c) Microsoft Corporation. All rights reserved. REM Licensed under the MIT License. 
-if exist PATH=%AGENT_TEMPDIRECTORY%\v12.2\ (
-  set PATH=%AGENT_TEMPDIRECTORY%\v12.2\bin;%AGENT_TEMPDIRECTORY%\v12.2\extras\CUPTI\lib64;%PATH%
+if exist PATH=%AGENT_TEMPDIRECTORY%\v12.8\ (
+  set PATH=%AGENT_TEMPDIRECTORY%\v12.8\bin;%AGENT_TEMPDIRECTORY%\v12.8\extras\CUPTI\lib64;%PATH%
 ) else (
-  set PATH=C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v12.2\bin;C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v12.2\extras\CUPTI\lib64;%PATH%
+  set PATH=C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v12.8\bin;C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v12.8\extras\CUPTI\lib64;%PATH%
 )
 set PATH=%AGENT_TEMPDIRECTORY%\TensorRT-10.9.0.34.Windows10.x86_64.cuda-12.8\lib;%PATH%

-@REM The default version is still cuda v12.2, because set cuda v11.8 after it
+@REM The default version is still cuda v12.8, because set cuda v11.8 after it
 set PATH=%PATH%;%AGENT_TEMPDIRECTORY%\TensorRT-10.9.0.34.Windows10.x86_64.cuda-11.8\lib
 if exist PATH=%AGENT_TEMPDIRECTORY%\v11.8\ (
   set PATH=%PATH%;%AGENT_TEMPDIRECTORY%\v11.8\bin;%AGENT_TEMPDIRECTORY%\v11.8\extras\CUPTI\lib64
diff --git a/tools/ci_build/github/windows/setup_env_trt.bat b/tools/ci_build/github/windows/setup_env_trt.bat
index 45e0d970fb541..6110249a9cde6 100644
--- a/tools/ci_build/github/windows/setup_env_trt.bat
+++ b/tools/ci_build/github/windows/setup_env_trt.bat
@@ -1,10 +1,10 @@
 REM Copyright (c) Microsoft Corporation. All rights reserved.
 REM Licensed under the MIT License.

-if exist PATH=%AGENT_TEMPDIRECTORY%\v12.2\ (
-  set PATH=%PATH%;%AGENT_TEMPDIRECTORY%\v12.2\bin;%AGENT_TEMPDIRECTORY%\v12.2\extras\CUPTI\lib64
+if exist PATH=%AGENT_TEMPDIRECTORY%\v12.8\ (
+  set PATH=%PATH%;%AGENT_TEMPDIRECTORY%\v12.8\bin;%AGENT_TEMPDIRECTORY%\v12.8\extras\CUPTI\lib64
 ) else (
-  set PATH=%PATH%;C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v12.2\bin;C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v12.2\extras\CUPTI\lib64
+  set PATH=%PATH%;C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v12.8\bin;C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v12.8\extras\CUPTI\lib64
 )
 set PATH=%AGENT_TEMPDIRECTORY%\TensorRT-10.9.0.34.Windows10.x86_64.cuda-12.8\lib;%PATH%
 set GRADLE_OPTS=-Dorg.gradle.daemon=false

From 24d7aee69dda788f0e7d78aaee8b19e579a83d20 Mon Sep 17 00:00:00 2001
From: Ted Themistokleous <107195283+TedThemistokleous@users.noreply.github.com>
Date: Mon, 13 Oct 2025 19:50:22 -0400
Subject: [PATCH 10/19] [MIGraphX EP] Link FP4 types between OnnxRT and MIGraphX APIs (#26231)

Do this so that MIGraphX can take in fp4 types from input/output tensors
and then use that to perform an inference via the MIGraphX API.

### Description
Mirrored changes going into the ROCm 7.1 build. Cherry-picked mainline
OnnxRT changes to get fp4 tensor support before adding this on top.
Moving this to mainline OnnxRt to enable the MIGraphX EP to allow for fp4 input/output tensors https://github.com/ROCm/onnxruntime/pull/176 ### Motivation and Context Add fp4 support to MIGraphX EP --- .../core/providers/migraphx/migraphx_execution_provider.cc | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/onnxruntime/core/providers/migraphx/migraphx_execution_provider.cc b/onnxruntime/core/providers/migraphx/migraphx_execution_provider.cc index a59347841be95..239a5054801bc 100644 --- a/onnxruntime/core/providers/migraphx/migraphx_execution_provider.cc +++ b/onnxruntime/core/providers/migraphx/migraphx_execution_provider.cc @@ -268,6 +268,7 @@ static bool IsTypeSupported(const NodeArg* node_arg) { case ONNX_NAMESPACE::TensorProto_DataType::TensorProto_DataType_FLOAT16: case ONNX_NAMESPACE::TensorProto_DataType::TensorProto_DataType_BFLOAT16: case ONNX_NAMESPACE::TensorProto_DataType::TensorProto_DataType_FLOAT: + case ONNX_NAMESPACE::TensorProto_DataType::TensorProto_DataType_FLOAT4E2M1: case ONNX_NAMESPACE::TensorProto_DataType::TensorProto_DataType_FLOAT8E4M3FN: case ONNX_NAMESPACE::TensorProto_DataType::TensorProto_DataType_FLOAT8E4M3FNUZ: case ONNX_NAMESPACE::TensorProto_DataType::TensorProto_DataType_FLOAT8E5M2: @@ -318,6 +319,9 @@ static bool getMIGraphXType(ONNXTensorElementDataType type, case ONNX_TENSOR_ELEMENT_DATA_TYPE_FLOAT8E5M2FNUZ: mgx_type = migraphx_shape_fp8e5m2fnuz_type; break; + case ONNX_TENSOR_ELEMENT_DATA_TYPE_FLOAT4E2M1: + mgx_type = migraphx_shape_fp4x2_type; + break; case ONNX_TENSOR_ELEMENT_DATA_TYPE_INT4: mgx_type = migraphx_shape_int8_type; break; From 4c7fc43941ab181e4a06deb7a99d0bf483d66318 Mon Sep 17 00:00:00 2001 From: XXXXRT666 <157766680+XXXXRT666@users.noreply.github.com> Date: Tue, 14 Oct 2025 00:50:38 +0100 Subject: [PATCH 11/19] Fix build.sh --parallel 1 incorrectly triggering parallel build (#26264) ### Description This PR fixes an issue where running ```bash bash build.sh ...... --parallel 1 ...... ``` still triggers a parallel build. The previous logic only added -j when num_parallel_jobs != 1, which caused Ninja/Make/Xcode to use all CPU cores by default. ### Motivation and Context When building ONNX Runtime, using parallel 4 caused an out-of-memory (OOM) error in my computer. However, changing it to parallel 1 still triggered parallel compilation and caused OOM again. 
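The corrected check is easier to see in isolation. The snippet below is a minimal, hypothetical sketch of the intended flag handling (`job_args` is illustrative only; the real build.py forwards these arguments to the native build tool and has additional Windows/wasm handling): an explicit `--parallel` value, including 1, must always be forwarded, and only 0 ("auto") leaves the decision to the build tool.

```python
# Hypothetical helper, not the actual build.py code: maps --parallel to the
# arguments handed to the native build tool.
def job_args(num_parallel_jobs: int) -> list[str]:
    if num_parallel_jobs == 0:
        # 0 means "auto": let Ninja/Make/Xcode fall back to their own default,
        # which is typically all CPU cores.
        return []
    # Any explicit value, including 1, is forwarded so the tool does not
    # silently fan out across every core.
    return ["-j", str(num_parallel_jobs)]


assert job_args(1) == ["-j", "1"]  # truly serial build; avoids the OOM described above
assert job_args(0) == []           # tool default: full parallelism
assert job_args(4) == ["-j", "4"]
```

With the old `num_parallel_jobs != 1` condition, `--parallel 1` fell into the "no flag" branch, which is exactly the behavior this patch removes.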
--- tools/ci_build/build.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tools/ci_build/build.py b/tools/ci_build/build.py index 54dd23b07a363..8a72ab70cc67d 100644 --- a/tools/ci_build/build.py +++ b/tools/ci_build/build.py @@ -1333,7 +1333,7 @@ def build_targets(args, cmake_path, build_dir, configs, num_parallel_jobs, targe cmd_args.extend(["--target", *targets]) build_tool_args = [] - if num_parallel_jobs != 1: + if num_parallel_jobs != 0: if is_windows() and args.cmake_generator != "Ninja" and not args.build_wasm: # https://github.com/Microsoft/checkedc-clang/wiki/Parallel-builds-of-clang-on-Windows suggests # not maxing out CL_MPCount From 94de31fa124c29e79c9583e3273a1a60a7e8f7b0 Mon Sep 17 00:00:00 2001 From: Ti-Tai Wang Date: Mon, 13 Oct 2025 16:52:41 -0700 Subject: [PATCH 12/19] Bump onnx to 1.19.1 (#26202) ~~Test rel-1.19.1~~ Bump to ONNX==1.19.1 --- cmake/deps.txt | 2 +- .../external/onnxruntime_external_deps.cmake | 8 +--- .../onnx/avoid_regenerating_proto_files.patch | 46 ------------------- cmake/patches/onnx/onnx.patch | 12 ++--- .../onnx/avoid_regenerating_proto_files.patch | 46 ------------------- cmake/vcpkg-ports/onnx/binskim.patch | 12 ++--- cmake/vcpkg-ports/onnx/portfile.cmake | 5 +- cmake/vcpkg-ports/onnx/vcpkg.json | 2 +- docs/How_To_Update_ONNX_Dev_Notes.md | 2 +- onnxruntime/test/onnx/TestCase.cc | 17 ++++++- onnxruntime/test/onnx/main.cc | 18 -------- .../onnx_backend_test_series_filters.jsonc | 29 +++--------- .../python/cpu/scripts/requirements.txt | 2 +- .../docker/scripts/lort/requirements.txt | 2 +- .../docker/scripts/manylinux/requirements.txt | 2 +- .../linux/docker/scripts/requirements.txt | 2 +- .../github/linux/python/requirements.txt | 2 +- .../github/windows/python/requirements.txt | 2 +- 18 files changed, 45 insertions(+), 166 deletions(-) delete mode 100644 cmake/patches/onnx/avoid_regenerating_proto_files.patch delete mode 100644 cmake/vcpkg-ports/onnx/avoid_regenerating_proto_files.patch diff --git a/cmake/deps.txt b/cmake/deps.txt index 7b243ff15cd80..bf76753c1b3c0 100644 --- a/cmake/deps.txt +++ b/cmake/deps.txt @@ -34,7 +34,7 @@ microsoft_gsl;https://github.com/microsoft/GSL/archive/refs/tags/v4.0.0.zip;cf36 microsoft_wil;https://github.com/microsoft/wil/archive/refs/tags/v1.0.230629.1.zip;e4a542a323c070376f7c2d1973d0f7ddbc1d2fa5 mimalloc;https://github.com/microsoft/mimalloc/archive/refs/tags/v2.1.1.zip;d5ee7d34223d0567892db5179849939c8769dc41 mp11;https://github.com/boostorg/mp11/archive/refs/tags/boost-1.82.0.zip;9bc9e01dffb64d9e0773b2e44d2f22c51aace063 -onnx;https://github.com/onnx/onnx/archive/refs/tags/v1.19.0.zip;4c798b73e131438c196e6dcb9f3393968a8936f1 +onnx;https://github.com/onnx/onnx/archive/refs/tags/v1.19.1.zip;c5215b5697dcdfd71799f001b8c4054a6bba6b09 # Use the latest commit of 10.9-GA onnx_tensorrt;https://github.com/onnx/onnx-tensorrt/archive/d5dce67db7c2e64b07e055571f5ec06f7f254de2.zip;01114d3b67650857281fa50faa2e412130a63b69 protobuf;https://github.com/protocolbuffers/protobuf/archive/refs/tags/v21.12.zip;7cf2733949036c7d52fda017badcab093fe73bfa diff --git a/cmake/external/onnxruntime_external_deps.cmake b/cmake/external/onnxruntime_external_deps.cmake index f32350ca755ea..b6a741d8b0fe7 100644 --- a/cmake/external/onnxruntime_external_deps.cmake +++ b/cmake/external/onnxruntime_external_deps.cmake @@ -498,13 +498,7 @@ else() endif() if(Patch_FOUND) - set(ONNXRUNTIME_ONNX_PATCH_COMMAND - ${Patch_EXECUTABLE} --binary --ignore-whitespace -p1 < ${PROJECT_SOURCE_DIR}/patches/onnx/onnx.patch && - # Patch changes 
from https://github.com/onnx/onnx/pull/7253 to avoid unnecessary rebuilding. - # This change should be included in ONNX 1.19.1. - ${Patch_EXECUTABLE} --binary --ignore-whitespace -p1 < - ${PROJECT_SOURCE_DIR}/patches/onnx/avoid_regenerating_proto_files.patch - ) + set(ONNXRUNTIME_ONNX_PATCH_COMMAND ${Patch_EXECUTABLE} --binary --ignore-whitespace -p1 < ${PROJECT_SOURCE_DIR}/patches/onnx/onnx.patch) else() set(ONNXRUNTIME_ONNX_PATCH_COMMAND "") endif() diff --git a/cmake/patches/onnx/avoid_regenerating_proto_files.patch b/cmake/patches/onnx/avoid_regenerating_proto_files.patch deleted file mode 100644 index 804dfeb8f59c2..0000000000000 --- a/cmake/patches/onnx/avoid_regenerating_proto_files.patch +++ /dev/null @@ -1,46 +0,0 @@ -diff --git a/CMakeLists.txt b/CMakeLists.txt -index 479955793..cc3ef1400 100644 ---- a/CMakeLists.txt -+++ b/CMakeLists.txt -@@ -321,7 +321,7 @@ function(RELATIVE_PROTOBUF_GENERATE_CPP SRCS) - set(${SRCS}) - - set(GEN_PROTO_PY "${ONNX_ROOT}/onnx/gen_proto.py") -- set(GENERATED_FILE_TARGETS) -+ set(GENERATED_FILES) - foreach(INFILE ${ARGN}) - set(ABS_FILE "${ONNX_ROOT}/${INFILE}") - get_filename_component(FILE_DIR ${ABS_FILE} DIRECTORY) -@@ -371,12 +371,11 @@ function(RELATIVE_PROTOBUF_GENERATE_CPP SRCS) - list(APPEND GEN_PROTO_ARGS "${ONNX_PROTOC_EXECUTABLE}") - endif() - -- add_custom_target("${GENERATED_FILE_WE}_proto_file" -- COMMAND ${ONNX_PYTHON_INTERPRETER} "${GEN_PROTO_PY}" ${GEN_PROTO_ARGS} -- BYPRODUCTS "${GENERATED_PROTO}" -- DEPENDS ${INFILE} -- COMMENT "Running gen_proto.py on ${INFILE}" -- ) -+ # Use add_custom_command to avoid re-generate of PROTO files -+ add_custom_command(OUTPUT "${GENERATED_PROTO}" -+ COMMAND ${ONNX_PYTHON_INTERPRETER} "${GEN_PROTO_PY}" ${GEN_PROTO_ARGS} -+ DEPENDS ${INFILE} -+ COMMENT "Running gen_proto.py on ${INFILE}") - message("Generated: ${GENERATED_PROTO}") - - set(PROTOC_ARGS -@@ -393,11 +392,10 @@ function(RELATIVE_PROTOBUF_GENERATE_CPP SRCS) - list(APPEND PROTOC_ARGS ${CMAKE_CURRENT_BINARY_DIR}) - endif() - endif() -- list(APPEND GENERATED_FILE_TARGETS ${GENERATED_FILE_WE}_proto_file) -- add_custom_target(${GENERATED_FILE_WE}_src -+ list(APPEND GENERATED_FILES "${GENERATED_PROTO}") -+ add_custom_command(OUTPUT "${OUTPUT_PB_SRC}" - COMMAND "${ONNX_PROTOC_EXECUTABLE}" ${PROTOC_ARGS} -- BYPRODUCTS "${OUTPUT_PB_SRC}" -- DEPENDS ${GENERATED_FILE_TARGETS} -+ DEPENDS ${GENERATED_FILES} - COMMENT "Running C++ protocol buffer compiler on ${GENERATED_PROTO}") - endforeach() - diff --git a/cmake/patches/onnx/onnx.patch b/cmake/patches/onnx/onnx.patch index e8ae766062d08..047cb527bb4da 100644 --- a/cmake/patches/onnx/onnx.patch +++ b/cmake/patches/onnx/onnx.patch @@ -1,5 +1,5 @@ diff --git a/CMakeLists.txt b/CMakeLists.txt -index 47995579..6cc439f6 100644 +index cc3ef140..f70312ba 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -57,6 +57,7 @@ option(ONNX_USE_LITE_PROTO "Use lite protobuf instead of full." 
OFF) @@ -10,7 +10,7 @@ index 47995579..6cc439f6 100644 if(WIN32) option(ONNX_USE_MSVC_STATIC_RUNTIME "Build with MSVC static runtime" OFF) endif() -@@ -411,14 +412,28 @@ relative_protobuf_generate_cpp(ONNX_PROTO_SRCS +@@ -409,14 +410,28 @@ relative_protobuf_generate_cpp(ONNX_PROTO_SRCS add_library(onnx_proto ${ONNX_PROTO_SRCS}) @@ -47,7 +47,7 @@ index 47995579..6cc439f6 100644 # Hide all symbols we don't need set_target_properties(onnx_proto PROPERTIES CXX_VISIBILITY_PRESET hidden) -@@ -440,19 +455,6 @@ add_onnx_global_defines(onnx_proto) +@@ -438,19 +453,6 @@ add_onnx_global_defines(onnx_proto) target_include_directories(onnx_proto PUBLIC $ $) @@ -68,10 +68,10 @@ index 47995579..6cc439f6 100644 if(CMAKE_SYSTEM_NAME STREQUAL "AIX") # whole-archive linker option not available on AIX. diff --git a/onnx/defs/nn/old.cc b/onnx/defs/nn/old.cc -index 40635f97..44770774 100644 +index ad6dd0c1..50259f32 100644 --- a/onnx/defs/nn/old.cc +++ b/onnx/defs/nn/old.cc -@@ -4090,7 +4090,6 @@ ONNX_OPERATOR_SET_SCHEMA( +@@ -4091,7 +4091,6 @@ ONNX_OPERATOR_SET_SCHEMA( GroupNormalization, 18, OpSchema() @@ -80,7 +80,7 @@ index 40635f97..44770774 100644 .Attr("epsilon", "The epsilon value to use to avoid division by zero.", AttributeProto::FLOAT, 1e-5f) .Attr( diff --git a/onnx/defs/schema.h b/onnx/defs/schema.h -index ddd95454..34647987 100644 +index 7e9bc27f..4b87c5a5 100644 --- a/onnx/defs/schema.h +++ b/onnx/defs/schema.h @@ -999,7 +999,7 @@ class OpSchemaRegistry final : public ISchemaRegistry { diff --git a/cmake/vcpkg-ports/onnx/avoid_regenerating_proto_files.patch b/cmake/vcpkg-ports/onnx/avoid_regenerating_proto_files.patch deleted file mode 100644 index 804dfeb8f59c2..0000000000000 --- a/cmake/vcpkg-ports/onnx/avoid_regenerating_proto_files.patch +++ /dev/null @@ -1,46 +0,0 @@ -diff --git a/CMakeLists.txt b/CMakeLists.txt -index 479955793..cc3ef1400 100644 ---- a/CMakeLists.txt -+++ b/CMakeLists.txt -@@ -321,7 +321,7 @@ function(RELATIVE_PROTOBUF_GENERATE_CPP SRCS) - set(${SRCS}) - - set(GEN_PROTO_PY "${ONNX_ROOT}/onnx/gen_proto.py") -- set(GENERATED_FILE_TARGETS) -+ set(GENERATED_FILES) - foreach(INFILE ${ARGN}) - set(ABS_FILE "${ONNX_ROOT}/${INFILE}") - get_filename_component(FILE_DIR ${ABS_FILE} DIRECTORY) -@@ -371,12 +371,11 @@ function(RELATIVE_PROTOBUF_GENERATE_CPP SRCS) - list(APPEND GEN_PROTO_ARGS "${ONNX_PROTOC_EXECUTABLE}") - endif() - -- add_custom_target("${GENERATED_FILE_WE}_proto_file" -- COMMAND ${ONNX_PYTHON_INTERPRETER} "${GEN_PROTO_PY}" ${GEN_PROTO_ARGS} -- BYPRODUCTS "${GENERATED_PROTO}" -- DEPENDS ${INFILE} -- COMMENT "Running gen_proto.py on ${INFILE}" -- ) -+ # Use add_custom_command to avoid re-generate of PROTO files -+ add_custom_command(OUTPUT "${GENERATED_PROTO}" -+ COMMAND ${ONNX_PYTHON_INTERPRETER} "${GEN_PROTO_PY}" ${GEN_PROTO_ARGS} -+ DEPENDS ${INFILE} -+ COMMENT "Running gen_proto.py on ${INFILE}") - message("Generated: ${GENERATED_PROTO}") - - set(PROTOC_ARGS -@@ -393,11 +392,10 @@ function(RELATIVE_PROTOBUF_GENERATE_CPP SRCS) - list(APPEND PROTOC_ARGS ${CMAKE_CURRENT_BINARY_DIR}) - endif() - endif() -- list(APPEND GENERATED_FILE_TARGETS ${GENERATED_FILE_WE}_proto_file) -- add_custom_target(${GENERATED_FILE_WE}_src -+ list(APPEND GENERATED_FILES "${GENERATED_PROTO}") -+ add_custom_command(OUTPUT "${OUTPUT_PB_SRC}" - COMMAND "${ONNX_PROTOC_EXECUTABLE}" ${PROTOC_ARGS} -- BYPRODUCTS "${OUTPUT_PB_SRC}" -- DEPENDS ${GENERATED_FILE_TARGETS} -+ DEPENDS ${GENERATED_FILES} - COMMENT "Running C++ protocol buffer compiler on ${GENERATED_PROTO}") - endforeach() - diff --git 
a/cmake/vcpkg-ports/onnx/binskim.patch b/cmake/vcpkg-ports/onnx/binskim.patch index e8ae766062d08..047cb527bb4da 100644 --- a/cmake/vcpkg-ports/onnx/binskim.patch +++ b/cmake/vcpkg-ports/onnx/binskim.patch @@ -1,5 +1,5 @@ diff --git a/CMakeLists.txt b/CMakeLists.txt -index 47995579..6cc439f6 100644 +index cc3ef140..f70312ba 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -57,6 +57,7 @@ option(ONNX_USE_LITE_PROTO "Use lite protobuf instead of full." OFF) @@ -10,7 +10,7 @@ index 47995579..6cc439f6 100644 if(WIN32) option(ONNX_USE_MSVC_STATIC_RUNTIME "Build with MSVC static runtime" OFF) endif() -@@ -411,14 +412,28 @@ relative_protobuf_generate_cpp(ONNX_PROTO_SRCS +@@ -409,14 +410,28 @@ relative_protobuf_generate_cpp(ONNX_PROTO_SRCS add_library(onnx_proto ${ONNX_PROTO_SRCS}) @@ -47,7 +47,7 @@ index 47995579..6cc439f6 100644 # Hide all symbols we don't need set_target_properties(onnx_proto PROPERTIES CXX_VISIBILITY_PRESET hidden) -@@ -440,19 +455,6 @@ add_onnx_global_defines(onnx_proto) +@@ -438,19 +453,6 @@ add_onnx_global_defines(onnx_proto) target_include_directories(onnx_proto PUBLIC $ $) @@ -68,10 +68,10 @@ index 47995579..6cc439f6 100644 if(CMAKE_SYSTEM_NAME STREQUAL "AIX") # whole-archive linker option not available on AIX. diff --git a/onnx/defs/nn/old.cc b/onnx/defs/nn/old.cc -index 40635f97..44770774 100644 +index ad6dd0c1..50259f32 100644 --- a/onnx/defs/nn/old.cc +++ b/onnx/defs/nn/old.cc -@@ -4090,7 +4090,6 @@ ONNX_OPERATOR_SET_SCHEMA( +@@ -4091,7 +4091,6 @@ ONNX_OPERATOR_SET_SCHEMA( GroupNormalization, 18, OpSchema() @@ -80,7 +80,7 @@ index 40635f97..44770774 100644 .Attr("epsilon", "The epsilon value to use to avoid division by zero.", AttributeProto::FLOAT, 1e-5f) .Attr( diff --git a/onnx/defs/schema.h b/onnx/defs/schema.h -index ddd95454..34647987 100644 +index 7e9bc27f..4b87c5a5 100644 --- a/onnx/defs/schema.h +++ b/onnx/defs/schema.h @@ -999,7 +999,7 @@ class OpSchemaRegistry final : public ISchemaRegistry { diff --git a/cmake/vcpkg-ports/onnx/portfile.cmake b/cmake/vcpkg-ports/onnx/portfile.cmake index 27f5ea5fadd79..882850963a0c0 100644 --- a/cmake/vcpkg-ports/onnx/portfile.cmake +++ b/cmake/vcpkg-ports/onnx/portfile.cmake @@ -4,12 +4,9 @@ vcpkg_from_github( OUT_SOURCE_PATH SOURCE_PATH REPO onnx/onnx REF "v${VERSION}" - SHA512 e6f7b5782a43a91783607549e4d0f0a9cbd46dfb67a602f81aaffc7bcdd8f450fe9c225f0bc314704f2923e396f0df5b03ea91af4a7887203c0b8372bc2749d0 + SHA512 cf6ff4c0bb6cc16ce5f4d6267480d35f3c7a5fde94d10e1358928ff6e4ec6d756a7c5d34a500e60bbd8eb1912c8af21aa763719321b330f56a0eb6b9b810ef60 PATCHES fix-cmakelists.patch - # Patch changes from https://github.com/onnx/onnx/pull/7253 to avoid unnecessary rebuilding. - # This change should be included in ONNX 1.19.1. - avoid_regenerating_proto_files.patch fix-dependency-protobuf.patch binskim.patch ) diff --git a/cmake/vcpkg-ports/onnx/vcpkg.json b/cmake/vcpkg-ports/onnx/vcpkg.json index 350db2e35061a..ad0d1aaf15f51 100644 --- a/cmake/vcpkg-ports/onnx/vcpkg.json +++ b/cmake/vcpkg-ports/onnx/vcpkg.json @@ -1,6 +1,6 @@ { "name": "onnx", - "version-semver": "1.19.0", + "version-semver": "1.19.1", "port-version": 1, "description": "Open standard for machine learning interoperability", "homepage": "https://onnx.ai", diff --git a/docs/How_To_Update_ONNX_Dev_Notes.md b/docs/How_To_Update_ONNX_Dev_Notes.md index 8da19ddc51cb7..8c1280431c384 100644 --- a/docs/How_To_Update_ONNX_Dev_Notes.md +++ b/docs/How_To_Update_ONNX_Dev_Notes.md @@ -35,7 +35,7 @@ git add onnx 1. 
Modify [cmake/vcpkg-ports/onnx/binskim.patch](/cmake/vcpkg-ports/onnx/binskim.patch) to be the same as [cmake/patches/onnx/onnx.patch](/cmake/patches/onnx/onnx.patch). 2. The other patches are required/created by vcpkg repository to build ONNX. We just need to re-run diff to makes sure the patches can be applied in the updated ONNX version. 3. Update [cmake/vcpkg-ports/onnx/portfile.cmake](/cmake/vcpkg-ports/onnx/portfile.cmake) with the correct commit id and SHA512. (alternatively, build it with the wrong SHA and ORT should tell you the expected one.) -4. Upload your package: [Follow the instructions](https://microsoft.sharepoint.com/teams/ONNX2/_layouts/15/Doc.aspx?sourcedoc={170774be-e1c6-4f8b-a3ae-984f211fe410}&action=edit&wd=target%28Development.)one%7C63d3ab47-51d1-4a62-9965-66882234bd44%2FAdd%20or%20Update%20a%20C%2B%2B%20dependency%7Cb6ae6a97-94fc-4436-8fc6-08c21ae895da%2F%29&wdorigin=NavigationUrl +4. Upload your package: [Follow the instructions](https://microsoft.sharepoint.com/:o:/r/teams/ONNX2/_layouts/15/Doc.aspx?sourcedoc=%7B170774BE-E1C6-4F8B-A3AE-984F211FE410%7D&wd=target(Development.one%7C63D3AB47-51D1-4A62-9965-66882234BD44%2FUpdate%20a%20VCPKG%20package%7CB6AE6A97-94FC-4436-8FC6-08C21AE895DA%2F)&wdpartid=%7BB5CF19CC-40FE-0EC7-32B6-8119B427B32A%7D%7B1%7D&wdsectionfileid=%7B9DD25660-A195-48EA-B9E0-DF8B902AFDD7%7D&ovuser=72f988bf-86f1-41af-91ab-2d7cd011db47%2Ctitaiwang%40microsoft.com&clickparams=eyJBcHBOYW1lIjoiVGVhbXMtRGVza3RvcCIsIkFwcFZlcnNpb24iOiI0OS8yNTA5MTExNjAxNiIsIkhhc0ZlZGVyYXRlZFVzZXIiOmZhbHNlfQ%3D%3D&CID=fb9dcaa1-c0b5-1000-5597-c19e3adf468c&cidOR=SPO)one%7C63d3ab47-51d1-4a62-9965-66882234bd44%2FAdd%20or%20Update%20a%20C%2B%2B%20dependency%7Cb6ae6a97-94fc-4436-8fc6-08c21ae895da%2F%29&wdorigin=NavigationUrl Alternatively, directly run Terrapin to upload ONNX package (need SHA512): diff --git a/onnxruntime/test/onnx/TestCase.cc b/onnxruntime/test/onnx/TestCase.cc index 6df98ff505fa1..cbb25bb9b629e 100644 --- a/onnxruntime/test/onnx/TestCase.cc +++ b/onnxruntime/test/onnx/TestCase.cc @@ -1435,9 +1435,22 @@ std::unique_ptr> GetBrokenTests(const std::string& provider broken_tests->insert({"scatter_elements_with_negative_indices", "unknown version"}); // Fails since ONNX==1.19.0 broken_tests->insert({"l2normalization_axis_0", "unknown version"}); + broken_tests->insert({"attention_3d_gqa", "unknown version"}); + broken_tests->insert({"attention_3d_gqa_attn_mask", "unknown version"}); + broken_tests->insert({"attention_3d_gqa_causal", "unknown version"}); + broken_tests->insert({"attention_3d_gqa_scaled", "unknown version"}); + broken_tests->insert({"attention_3d_gqa_softcap", "unknown version"}); + broken_tests->insert({"attention_3d_gqa_with_past_and_present", "unknown version"}); + broken_tests->insert({"attention_4d_gqa", "unknown version"}); + broken_tests->insert({"attention_4d_gqa_attn_mask", "unknown version"}); + broken_tests->insert({"attention_4d_gqa_causal", "unknown version"}); + broken_tests->insert({"attention_4d_gqa_scaled", "unknown version"}); + broken_tests->insert({"attention_4d_gqa_softcap", "unknown version"}); + broken_tests->insert({"attention_4d_gqa_with_past_and_present", "unknown version"}); + broken_tests->insert({"attention_4d_gqa_with_past_and_present_fp16", "unknown version"}); + broken_tests->insert({"attention_4d_with_past_and_present_qk_matmul_bias_3d_mask_causal", "unknown version"}); + broken_tests->insert({"attention_4d_with_past_and_present_qk_matmul_bias_4d_mask_causal", "unknown version"}); 
broken_tests->insert({"attention_4d_diff_heads_mask4d_padded_kv", "need nonpad_kv_seqlen "}); - broken_tests->insert({"attention_4d_with_past_and_present_qk_matmul_bias_3d_mask_causal", "attention op implementation is wrong"}); - broken_tests->insert({"attention_4d_with_past_and_present_qk_matmul_bias_4d_mask_causal", "attention op implementation is wrong"}); } #ifdef DISABLE_CONTRIB_OPS diff --git a/onnxruntime/test/onnx/main.cc b/onnxruntime/test/onnx/main.cc index b6f2cb2683677..463634b370d4c 100644 --- a/onnxruntime/test/onnx/main.cc +++ b/onnxruntime/test/onnx/main.cc @@ -795,24 +795,6 @@ select from 'TF8', 'TF16', 'UINT8', 'FLOAT', 'ITENSOR'. \n)"); // Please make no more changes to the list static const ORTCHAR_T* immutable_broken_tests[] = { - // pending ONNX update - ORT_TSTR("attention_3d_gqa"), - ORT_TSTR("attention_3d_gqa_attn_mask"), - ORT_TSTR("attention_3d_gqa_causal"), - ORT_TSTR("attention_3d_gqa_scaled"), - ORT_TSTR("attention_3d_gqa_softcap"), - ORT_TSTR("attention_3d_gqa_with_past_and_present"), - ORT_TSTR("attention_4d_gqa"), - ORT_TSTR("attention_4d_gqa_attn_mask"), - ORT_TSTR("attention_4d_gqa_causal"), - ORT_TSTR("attention_4d_gqa_scaled"), - ORT_TSTR("attention_4d_gqa_softcap"), - ORT_TSTR("attention_4d_gqa_with_past_and_present"), - ORT_TSTR("attention_4d_diff_heads_mask4d_padded_kv"), - ORT_TSTR("attention_4d_gqa_with_past_and_present_fp16"), - ORT_TSTR("attention_4d_with_past_and_present_qk_matmul_bias_3d_mask_causal"), - ORT_TSTR("attention_4d_with_past_and_present_qk_matmul_bias_4d_mask_causal"), - // unsupported case ORT_TSTR("AvgPool1d"), ORT_TSTR("AvgPool1d_stride"), ORT_TSTR("AvgPool2d"), diff --git a/onnxruntime/test/testdata/onnx_backend_test_series_filters.jsonc b/onnxruntime/test/testdata/onnx_backend_test_series_filters.jsonc index f5f6a3ae3bc39..0558d008a2275 100644 --- a/onnxruntime/test/testdata/onnx_backend_test_series_filters.jsonc +++ b/onnxruntime/test/testdata/onnx_backend_test_series_filters.jsonc @@ -31,23 +31,12 @@ "current_failing_tests": [ "^test_adagrad", "^test_adagrad_multiple", - "^test_attention_4d_diff_heads_mask4d_padded_kv*", // pending onnx update - "^test_attention_3d_gqa*", // pending onnx update - "^test_attention_3d_gqa_causal", // pending onnx update - "^test_attention_3d_gqa_scaled", // pending onnx update - "^test_attention_3d_gqa_softcap", // pending onnx update - "^test_attention_3d_gqa_with_past_and_present", // pending onnx update - "^test_attention_4d_gqa*", // pending onnx update - "^test_attention_4d_gqa_causal", // pending onnx update - "^test_attention_4d_gqa_scaled", // pending onnx update - "^test_attention_4d_gqa_softcap", // pending onnx update - "^test_attention_4d_gqa_with_past_and_present", // pending onnx update - "^test_attention_*causal*", // pending onnx update - "^test_attention_4d_with_past_and_present_qk_matmul_bias_3d_mask_causal*", // pending onnx update - "^test_attention_4d_with_past_and_present_qk_matmul_bias_4d_mask_causal*", // pending onnx update - "^test_attention_4d_attn_mask_3d_causal_expanded*", // pending onnx update "^test_attention_4d_fp16*", // precision issue: 1 / 192 mismatched elements "^test_attention_4d_fp16_expanded*", // precision issue: 3 / 192 mismatched elements + "^test_attention_4d_gqa_with_past_and_present_fp16_expanded*", // webgpu mismatched elements 38 / 576 + "^test_attention_4d_with_past_and_present_qk_matmul_bias_3d_mask_causal_expanded*", // webgpu + "^test_attention_4d_attn_mask_3d_causal_expanded*", // webgpu + "^test_attention_4d_diff_heads_mask4d_padded_kv*", 
// Need nonpad_kv_seqlen "^test_l2normalization*", // LpNormalization(22) not implemented "^test_l1normalization*", // LpNormalization(22) not implemented "^test_lpnormalization*", // LpNormalization(22) not implemented @@ -123,13 +112,9 @@ "^test_if_opt", "^test_loop16_seq_none", "^test_identity_opt", - // rotary dim should be fixed in onnx==1.19.1 - "^test_rotary_embedding_no_position_ids_rotary_dim", - "^test_rotary_embedding_with_interleaved_rotary_dim", - "^test_rotary_embedding_with_rotary_dim", - "^test_rotary_embedding_3d_input_expanded", - "^test_rotary_embedding_interleaved_expanded", - "^test_rotary_embedding_no_position_ids_interleaved_expanded", + "^test_rotary_embedding_3d_input_expanded", // win cuda fail + "^test_rotary_embedding_interleaved_expanded", // win cuda fail + "^test_rotary_embedding_no_position_ids_interleaved_expanded", // win cuda fail "^test_rotary_embedding_expanded", //webgpu "^test_rotary_embedding_no_position_ids_expanded", //webgpu // Following tests are for opset 16 ops and are not yet implemented in ORT diff --git a/tools/ci_build/github/linux/docker/inference/aarch64/python/cpu/scripts/requirements.txt b/tools/ci_build/github/linux/docker/inference/aarch64/python/cpu/scripts/requirements.txt index bae6f4cb51816..1b1dadeaf8db2 100644 --- a/tools/ci_build/github/linux/docker/inference/aarch64/python/cpu/scripts/requirements.txt +++ b/tools/ci_build/github/linux/docker/inference/aarch64/python/cpu/scripts/requirements.txt @@ -3,7 +3,7 @@ mypy pytest setuptools>=68.2.2 wheel -onnx==1.19.0 +onnx==1.19.1 protobuf==4.25.8 sympy==1.14 flatbuffers diff --git a/tools/ci_build/github/linux/docker/scripts/lort/requirements.txt b/tools/ci_build/github/linux/docker/scripts/lort/requirements.txt index 2871f5cab2ea2..dc394ff50f4f9 100644 --- a/tools/ci_build/github/linux/docker/scripts/lort/requirements.txt +++ b/tools/ci_build/github/linux/docker/scripts/lort/requirements.txt @@ -3,7 +3,7 @@ beartype==0.15.0 flatbuffers cerberus h5py -onnx==1.19.0 +onnx==1.19.1 # Python dependencies required for pytorch development astunparse expecttest!=0.2.0 diff --git a/tools/ci_build/github/linux/docker/scripts/manylinux/requirements.txt b/tools/ci_build/github/linux/docker/scripts/manylinux/requirements.txt index 381d42831e715..2d89aece56340 100644 --- a/tools/ci_build/github/linux/docker/scripts/manylinux/requirements.txt +++ b/tools/ci_build/github/linux/docker/scripts/manylinux/requirements.txt @@ -3,7 +3,7 @@ mypy pytest setuptools>=68.2.2 wheel -onnx==1.19.0 +onnx==1.19.1 protobuf==4.25.1 sympy==1.14 flatbuffers diff --git a/tools/ci_build/github/linux/docker/scripts/requirements.txt b/tools/ci_build/github/linux/docker/scripts/requirements.txt index 4cc94f9148656..2fc034d9c5ca2 100644 --- a/tools/ci_build/github/linux/docker/scripts/requirements.txt +++ b/tools/ci_build/github/linux/docker/scripts/requirements.txt @@ -4,7 +4,7 @@ mypy pytest setuptools==78.1.1 wheel==0.45.1 -onnx==1.19.0 +onnx==1.19.1 argparse sympy==1.14 flatbuffers diff --git a/tools/ci_build/github/linux/python/requirements.txt b/tools/ci_build/github/linux/python/requirements.txt index d48fb66194f2a..293aa49823d48 100644 --- a/tools/ci_build/github/linux/python/requirements.txt +++ b/tools/ci_build/github/linux/python/requirements.txt @@ -3,7 +3,7 @@ mypy pytest setuptools>=68.2.2 wheel -onnx==1.19.0 +onnx==1.19.1 protobuf==4.25.1 sympy==1.14 flatbuffers diff --git a/tools/ci_build/github/windows/python/requirements.txt b/tools/ci_build/github/windows/python/requirements.txt index 
6ab2ab2b7b61f..b48f6c3c2784d 100644 --- a/tools/ci_build/github/windows/python/requirements.txt +++ b/tools/ci_build/github/windows/python/requirements.txt @@ -3,7 +3,7 @@ mypy pytest setuptools>=68.2.2 wheel -onnx==1.19.0 +onnx==1.19.1 protobuf==4.25.1 sympy==1.14 flatbuffers From cd4ac4943465fbbadbbc622dcfd11f453a27a2c1 Mon Sep 17 00:00:00 2001 From: Jiajia Qin Date: Tue, 14 Oct 2025 15:02:34 +0800 Subject: [PATCH 13/19] [webgpu] Enable indirect dispatch for flash attention (#26207) This pull request introduces support for indirect dispatch in the WebGPU FlashAttention implementation, enabling more dynamic and efficient kernel launches based on runtime sequence lengths. The changes add new logic and parameters to propagate sequence length information and indirect dispatch buffers through the attention pipeline, with conditional code paths to maintain compatibility with the existing direct dispatch approach. It's part of the work to enable graph capture in phi4 https://github.com/microsoft/onnxruntime/pull/25868 --- .../webgpu/bert/flash_attention.cc | 175 ++++++++++++++---- .../contrib_ops/webgpu/bert/flash_attention.h | 31 ++-- .../flash_attention_decode_qkt.wgsl.template | 14 +- ...sh_attention_decode_split_vx.wgsl.template | 18 +- ...h_attention_decode_vx_reduce.wgsl.template | 12 +- .../webgpu/bert/group_query_attention.cc | 2 +- .../core/providers/webgpu/compute_context.h | 5 +- 7 files changed, 193 insertions(+), 64 deletions(-) diff --git a/onnxruntime/contrib_ops/webgpu/bert/flash_attention.cc b/onnxruntime/contrib_ops/webgpu/bert/flash_attention.cc index b5c1f73d1678d..a9bd4afc5cd09 100644 --- a/onnxruntime/contrib_ops/webgpu/bert/flash_attention.cc +++ b/onnxruntime/contrib_ops/webgpu/bert/flash_attention.cc @@ -31,6 +31,11 @@ Status CopyKVCacheProgram::GenerateShaderCode(ShaderHelper& shader) const { const auto& present_key = shader.AddOutput("present_key", ShaderUsage::UseUniform | ShaderUsage::UseIndicesTypeAlias); const auto& present_value = shader.AddOutput("present_value", ShaderUsage::UseUniform); const auto& copy_kv_shape = shader.AddIndices("copy_kv_shape"); + // If prepare_indirect_dispatch is enabled, add seqlen_k input and indirect_buffer output + if (prepare_indirect_dispatch_) { + shader.AddInput("seqlen_k", ShaderUsage::None); + shader.AddOutput("indirect_buffer", ShaderUsage::None); + } shader.MainFunctionBody() << shader.GuardAgainstOutOfBoundsWorkgroupSizes("uniforms.copy_size") << " let output_indices = " << copy_kv_shape.OffsetToIndices("global_idx") << ";\n" @@ -38,8 +43,26 @@ Status CopyKVCacheProgram::GenerateShaderCode(ShaderHelper& shader) const { " let sequence_id = output_indices[2];\n" " let num_head_id = output_indices[1];\n" " let batch = output_indices[0];\n"; + if (prepare_indirect_dispatch_) { + shader.MainFunctionBody() << " let total_seq_length = u32(seqlen_k[0u]) + 1u;\n"; + } else { + shader.MainFunctionBody() << " let total_seq_length = uniforms.total_sequence_length;\n"; + } + + // Add indirect dispatch logic for thread 0 + if (prepare_indirect_dispatch_) { + // TODO: Add NormalizeDispatchGroupSize logic here to avoid exceeding max dispatch size. 
+ shader.MainFunctionBody() << " // Prepare indirect dispatch buffer for thread 0\n" + << " if (global_idx == 0u) {\n" + << " let num_total_seq_length_tile = (total_seq_length + uniforms.tile_size - 1u) / uniforms.tile_size;\n" + << " indirect_buffer[0] = num_total_seq_length_tile;\n" + << " indirect_buffer[1] = uniforms.num_heads;\n" + << " indirect_buffer[2] = 1u;\n" + << " }\n\n"; + } + if (has_past_) { - shader.MainFunctionBody() << "let past_sequence_length = uniforms.past_sequence_length;\n"; + shader.MainFunctionBody() << "let past_sequence_length = total_seq_length - uniforms.kv_sequence_length;\n"; if (past_present_share_buffer_) { shader.MainFunctionBody() << " let present_offset = " << present_key.IndicesToOffset("present_key_indices_t(batch, num_head_id, past_sequence_length + sequence_id, head_size_id)") << ";\n" << " let offset = " << key.IndicesToOffset(kv_BNSH_ ? "key_indices_t(batch, num_head_id, sequence_id, head_size_id)" : "key_indices_t(batch, sequence_id, num_head_id, head_size_id)") << ";\n" @@ -70,10 +93,12 @@ Status CopyKVCacheProgram::GenerateShaderCode(ShaderHelper& shader) const { Status CopyKVCache(onnxruntime::webgpu::ComputeContext& context, const WebgpuAttentionParameters& parameters, const Tensor* K, const Tensor* past_key, Tensor* present_key, - const Tensor* V, const Tensor* past_value, Tensor* present_value) { + const Tensor* V, const Tensor* past_value, Tensor* present_value, + uint32_t tile_size, const Tensor* seqlen_k, Tensor* indirect_buffer) { // CopyKVCache takes past key/value and current key/value and copies them to present key and value. // This makes it so that FlashAttention only needs to look at present key and value, and saves // number of input buffers in the shader, which we run out of (<=8) without this optimization. + // If indirect_buffer is provided, also prepare indirect dispatch buffer for flash attention. const int components = parameters.head_size_ % 4 == 0 ? 4 : (parameters.head_size_ % 2 == 0 ? 2 : 1); bool has_past = (parameters.total_sequence_length_ - parameters.kv_sequence_length_) > 0; // parameters.total_sequence_length_ is past_sequence_length + kv_sequence_length. @@ -83,7 +108,12 @@ Status CopyKVCache(onnxruntime::webgpu::ComputeContext& context, const WebgpuAtt int copy_sequence_length = has_past && parameters.past_present_share_buffer_ ? 
parameters.kv_sequence_length_ : parameters.total_sequence_length_; TensorShape copy_kv_shape{parameters.batch_size_, num_heads, copy_sequence_length, parameters.head_size_ / components}; int64_t copy_size = copy_kv_shape.Size(); - CopyKVCacheProgram program{"CopyKVCache", has_past, parameters.qkv_format_ == Q_K_V_BSNH_BNSH_BNSH, parameters.past_present_share_buffer_}; + + // Determine if we need to prepare indirect dispatch + bool prepare_indirect_dispatch = (indirect_buffer != nullptr); + + CopyKVCacheProgram program{"CopyKVCache", has_past, parameters.qkv_format_ == Q_K_V_BSNH_BNSH_BNSH, parameters.past_present_share_buffer_, + prepare_indirect_dispatch}; if (parameters.qkv_format_ == Q_K_V_BSNH_BNSH_BNSH) { program.AddInputs({{K, ProgramTensorMetadataDependency::TypeAndRank, components}, {V, ProgramTensorMetadataDependency::TypeAndRank, components}}); @@ -94,20 +124,31 @@ Status CopyKVCache(onnxruntime::webgpu::ComputeContext& context, const WebgpuAtt program.AddInputs({{K, ProgramTensorMetadataDependency::TypeAndRank, reshaped_KV_shape, components}, {V, ProgramTensorMetadataDependency::TypeAndRank, reshaped_KV_shape, components}}); } + + if (prepare_indirect_dispatch) { + program.AddInput({seqlen_k, ProgramTensorMetadataDependency::None}); + } + if (has_past && !parameters.past_present_share_buffer_) { program.AddInputs({{past_key, ProgramTensorMetadataDependency::TypeAndRank, components}, {past_value, ProgramTensorMetadataDependency::TypeAndRank, components}}); } program.AddOutputs({{present_key, ProgramTensorMetadataDependency::Rank, components}, - {present_value, ProgramTensorMetadataDependency::Rank, components}}) - .AddIndices(std::move(copy_kv_shape)); + {present_value, ProgramTensorMetadataDependency::Rank, components}}); + + if (prepare_indirect_dispatch) { + program.AddOutput({indirect_buffer, ProgramTensorMetadataDependency::None}); + } + + program.AddIndices(std::move(copy_kv_shape)); program.SetDispatchGroupSize(static_cast((copy_size + 63) / 64)) .SetWorkgroupSize(64) - .CacheHint(has_past, parameters.qkv_format_, parameters.past_present_share_buffer_) + .CacheHint(has_past, parameters.qkv_format_, parameters.past_present_share_buffer_, prepare_indirect_dispatch) .AddUniformVariables({{static_cast(copy_size)}, - // Note that when parameters.past_present_share_buffer_ is true, parameters.past_sequence_length_ will become to - // max_sequence_length. To get a valid past_sequence_length, we use total_sequence_length - kv_sequence_length. 
- {static_cast(parameters.total_sequence_length_ - parameters.kv_sequence_length_)}}); + {static_cast(parameters.total_sequence_length_)}, + {static_cast(parameters.kv_sequence_length_)}, + {tile_size}, + {static_cast(parameters.num_heads_)}}); return context.RunProgram(program); } @@ -147,6 +188,9 @@ Status FlashAttentionProgram::GenerateShaderCode(ShaderHelper& shader) const { Status FlashAttentionDecodeQKTProgram::GenerateShaderCode(ShaderHelper& shader) const { shader.AddInput("q", ShaderUsage::UseUniform | ShaderUsage::UseValueTypeAlias | ShaderUsage::UseElementTypeAlias); shader.AddInput("present_key", ShaderUsage::UseUniform | ShaderUsage::UseValueTypeAlias); + if (use_indirect_dispatch_) { + shader.AddInput("seqlens_k", ShaderUsage::None); + } if (has_attention_bias_) { shader.AddInput("attention_bias", ShaderUsage::UseUniform); } @@ -159,23 +203,25 @@ Status FlashAttentionDecodeQKTProgram::GenerateShaderCode(ShaderHelper& shader) WGSL_TEMPLATE_PARAMETER(has_attention_bias, has_attention_bias_), WGSL_TEMPLATE_PARAMETER(sub_tile_count, sub_tile_count), WGSL_TEMPLATE_PARAMETER(tile_size, tile_size_), - WGSL_TEMPLATE_PARAMETER(tile_size_k_vec, tile_size_k_vec)); + WGSL_TEMPLATE_PARAMETER(tile_size_k_vec, tile_size_k_vec), + WGSL_TEMPLATE_PARAMETER(use_indirect_dispatch, use_indirect_dispatch_)); } Status ComputeFlashAttentionDecodeQKT(onnxruntime::webgpu::ComputeContext& context, const Tensor* Q, - const Tensor* attention_bias, Tensor* output, Tensor* present_key, Tensor* metadata, - const WebgpuAttentionParameters& parameters, uint32_t num_total_seq_length_tile, - uint32_t num_present_sequence_length_tile, uint32_t tile_size, - uint32_t present_sequence_length) { + const Tensor* attention_bias, Tensor* output, Tensor* present_key, Tensor* metadata, const Tensor* seqlen_k, + const WebgpuAttentionParameters& parameters, const Tensor* indirect_buffer, uint32_t num_total_seq_length_tile, uint32_t num_present_sequence_length_tile, uint32_t tile_size, bool use_indirect_dispatch, uint32_t present_sequence_length) { const float alpha = parameters.scale_ == 0.0f ? 
1.f / sqrt(static_cast(parameters.head_size_)) : parameters.scale_; const bool has_attention_bias = attention_bias != nullptr; const int components = 4; - FlashAttentionDecodeQKTProgram program{"FlashAttentionDecodeQKT", has_attention_bias, tile_size}; + FlashAttentionDecodeQKTProgram program{"FlashAttentionDecodeQKT", has_attention_bias, tile_size, use_indirect_dispatch}; program.AddInputs({{Q, ProgramTensorMetadataDependency::TypeAndRank, components}, {present_key, ProgramTensorMetadataDependency::TypeAndRank, components}}); + if (use_indirect_dispatch) { + program.AddInput({seqlen_k, ProgramTensorMetadataDependency::None}); + } if (has_attention_bias) { program.AddInput({attention_bias, ProgramTensorMetadataDependency::TypeAndRank}); } @@ -183,15 +229,18 @@ Status ComputeFlashAttentionDecodeQKT(onnxruntime::webgpu::ComputeContext& conte {metadata, ProgramTensorMetadataDependency::Rank, 2}}); const uint32_t vectorized_head_size = parameters.head_size_ / components; - program.SetDispatchGroupSize(parameters.num_heads_ * num_total_seq_length_tile) - .SetWorkgroupSize(64) - .CacheHint(tile_size, has_attention_bias) + if (use_indirect_dispatch) { + program.SetIndirectDispatchTensor(indirect_buffer); + } else { + program.SetDispatchGroupSize(parameters.num_heads_ * num_total_seq_length_tile); + } + program.SetWorkgroupSize(64) + .CacheHint(tile_size, has_attention_bias, use_indirect_dispatch) .AddUniformVariables({{static_cast(vectorized_head_size)}, {static_cast(parameters.total_sequence_length_)}, {static_cast(alpha)}, present_sequence_length, {static_cast(parameters.n_reps)}, - {num_total_seq_length_tile}, {num_present_sequence_length_tile}, {static_cast(parameters.num_heads_)}}); @@ -202,6 +251,9 @@ Status FlashAttentionDecodeSplitVxProgram::GenerateShaderCode(ShaderHelper& shad shader.AddInput("metadata", ShaderUsage::UseUniform); shader.AddInput("qk", ShaderUsage::UseUniform | ShaderUsage::UseValueTypeAlias); shader.AddInput("present_value", ShaderUsage::UseUniform | ShaderUsage::UseValueTypeAlias | ShaderUsage::UseElementTypeAlias); + if (use_indirect_dispatch_) { + shader.AddInput("seqlens_k", ShaderUsage::None); + } shader.AddOutput("out_split_vx", ShaderUsage::UseUniform); const uint32_t tile_size_k_vec = 8u; @@ -210,7 +262,8 @@ Status FlashAttentionDecodeSplitVxProgram::GenerateShaderCode(ShaderHelper& shad WGSL_TEMPLATE_PARAMETER(head_size_vec, head_size_vec_), WGSL_TEMPLATE_PARAMETER(sub_tile_count, WorkgroupSizeX() / tile_size_k_vec), WGSL_TEMPLATE_PARAMETER(tile_size, tile_size_), - WGSL_TEMPLATE_PARAMETER(tile_size_k_vec, tile_size_k_vec)); + WGSL_TEMPLATE_PARAMETER(tile_size_k_vec, tile_size_k_vec), + WGSL_TEMPLATE_PARAMETER(use_indirect_dispatch, use_indirect_dispatch_)); } Status ComputeFlashAttentionDecodeSplitVxScore(onnxruntime::webgpu::ComputeContext& context, @@ -218,26 +271,33 @@ Status ComputeFlashAttentionDecodeSplitVxScore(onnxruntime::webgpu::ComputeConte const Tensor* qk, Tensor* out_split_vx, Tensor* present_value, + const Tensor* seqlen_k, const WebgpuAttentionParameters& parameters, + const Tensor* indirect_buffer, uint32_t num_total_seq_length_tile, uint32_t num_present_sequence_length_tile, uint32_t tile_size, + bool use_indirect_dispatch, uint32_t present_sequence_length) { const int components = 4; int head_size_vec = parameters.v_head_size_ / components; - FlashAttentionDecodeSplitVxProgram program{"FlashAttentionDecodeSplitVx", tile_size, head_size_vec}; + FlashAttentionDecodeSplitVxProgram program{"FlashAttentionDecodeSplitVx", tile_size, head_size_vec, 
use_indirect_dispatch}; program.AddInputs({{metadata, ProgramTensorMetadataDependency::TypeAndRank, 2}, {qk, ProgramTensorMetadataDependency::TypeAndRank}, {present_value, ProgramTensorMetadataDependency::TypeAndRank, components}}); program.AddOutputs({{out_split_vx, ProgramTensorMetadataDependency::TypeAndRank, components}}); // [B, N, split_k, head_size] - program.SetDispatchGroupSize(parameters.num_heads_ * num_total_seq_length_tile) - .CacheHint(tile_size, head_size_vec) + if (use_indirect_dispatch) { + program.AddInput({seqlen_k, ProgramTensorMetadataDependency::None}) + .SetIndirectDispatchTensor(indirect_buffer); + } else { + program.SetDispatchGroupSize(parameters.num_heads_ * num_total_seq_length_tile); + } + program.CacheHint(tile_size, head_size_vec, use_indirect_dispatch) .SetWorkgroupSize(64) .AddUniformVariables({{static_cast(parameters.total_sequence_length_)}, {static_cast(head_size_vec)}, present_sequence_length, {static_cast(parameters.n_reps)}, - num_total_seq_length_tile, num_present_sequence_length_tile, {static_cast(parameters.num_heads_)}}); @@ -246,27 +306,38 @@ Status ComputeFlashAttentionDecodeSplitVxScore(onnxruntime::webgpu::ComputeConte Status FlashAttentionDecodeVxReduceProgram::GenerateShaderCode(ShaderHelper& shader) const { shader.AddInput("input", ShaderUsage::UseUniform); + if (use_indirect_dispatch_) { + shader.AddInput("seqlens_k", ShaderUsage::None); + } shader.AddOutput("output", ShaderUsage::UseUniform | ShaderUsage::UseValueTypeAlias); return WGSL_TEMPLATE_APPLY(shader, "bert/flash_attention_decode_vx_reduce.wgsl.template", - WGSL_TEMPLATE_PARAMETER(tile_size, tile_size_)); + WGSL_TEMPLATE_PARAMETER(seq_tile_size, seq_tile_size_), + WGSL_TEMPLATE_PARAMETER(tile_size, tile_size_), + WGSL_TEMPLATE_PARAMETER(use_indirect_dispatch, use_indirect_dispatch_)); } Status ComputeFlashAttentionDecodeVxReduce(onnxruntime::webgpu::ComputeContext& context, const Tensor* out_split_vx, Tensor* output, + const Tensor* seqlen_k, const WebgpuAttentionParameters& parameters, uint32_t num_total_seq_length_tile, - uint32_t num_present_sequence_length_tile) { + uint32_t num_present_sequence_length_tile, + uint32_t seq_tile_size, + bool use_indirect_dispatch) { const int components = 4; constexpr int tile_size = 8; int tile_head_size = tile_size * components; - FlashAttentionDecodeVxReduceProgram program{"FlashAttentionDecodeVxReduce", tile_size}; + FlashAttentionDecodeVxReduceProgram program{"FlashAttentionDecodeVxReduce", tile_size, seq_tile_size, use_indirect_dispatch}; program.AddInputs({{out_split_vx, ProgramTensorMetadataDependency::TypeAndRank, components}}); + if (use_indirect_dispatch) { + program.AddInput({seqlen_k, ProgramTensorMetadataDependency::None}); + } program.AddOutputs({{output, ProgramTensorMetadataDependency::TypeAndRank, components}}); const uint32_t num_head_size_tile = static_cast((parameters.v_head_size_ + tile_head_size - 1) / tile_head_size); program.SetDispatchGroupSize(parameters.num_heads_ * num_head_size_tile) - .CacheHint(tile_size) + .CacheHint(tile_size, seq_tile_size, use_indirect_dispatch) .SetWorkgroupSize(tile_size * tile_size) .AddUniformVariables({{static_cast(parameters.v_head_size_ / components)}, num_total_seq_length_tile, @@ -279,14 +350,15 @@ Status ComputeFlashAttentionDecodeVxReduce(onnxruntime::webgpu::ComputeContext& Status ApplyFlashAttention(const Tensor* Q, const Tensor* K, const Tensor* V, const Tensor* attention_bias, Tensor* output, const Tensor* past_key, Tensor* present_key, const Tensor* past_value, Tensor* 
present_value, - const WebgpuAttentionParameters& parameters, onnxruntime::webgpu::ComputeContext& context) { - ORT_RETURN_IF_ERROR(CopyKVCache(context, parameters, K, past_key, present_key, V, past_value, present_value)); - + const WebgpuAttentionParameters& parameters, onnxruntime::webgpu::ComputeContext& context, const Tensor* seqlen_k) { // Extract present_sequence_length directly from present_key tensor shape: // (batch_size, num_heads, total_sequence_length/max_sequence_length, head_size) const uint32_t present_sequence_length = static_cast(present_key->Shape()[2]); + if (parameters.sequence_length_ > 1) { const uint32_t tile_size = 64; + // For encode path, use the original CopyKVCache without indirect dispatch preparation + ORT_RETURN_IF_ERROR(CopyKVCache(context, parameters, K, past_key, present_key, V, past_value, present_value, tile_size, seqlen_k, nullptr)); bool has_attention_bias = attention_bias != nullptr; bool is_qualcomm = context.AdapterInfo().vendor == std::string_view{"qualcomm"}; bool is_nvidia = context.AdapterInfo().vendor == std::string_view{"nvidia"}; @@ -323,7 +395,7 @@ Status ApplyFlashAttention(const Tensor* Q, const Tensor* K, const Tensor* V, co return context.RunProgram(program); } - // Use present_sequence_length instead of total_sequence_length to make sure the |qk| buffer is static when static qv cache is enabled. + // For decode path (sequence_length == 1) const TensorShapeVector qk_dims({parameters.batch_size_, parameters.num_heads_, parameters.sequence_length_, present_sequence_length}); const TensorShape qk_shape(qk_dims); @@ -331,21 +403,48 @@ Status ApplyFlashAttention(const Tensor* Q, const Tensor* K, const Tensor* V, co constexpr uint32_t tile_size = 64; const uint32_t num_total_seq_length_tile = (parameters.total_sequence_length_ + tile_size - 1) / tile_size; const uint32_t num_present_sequence_length_tile = (present_sequence_length + tile_size - 1) / tile_size; + + // Determine if we should use indirect dispatch + const bool use_indirect_dispatch = parameters.past_present_share_buffer_ && + seqlen_k != nullptr && + context.IsGraphCaptureEnabled(); + + // Create indirect dispatch buffer if using indirect dispatch + Tensor* indirect_buffer_ptr = nullptr; + Tensor indirect_buffer; + if (use_indirect_dispatch) { + const TensorShape indirect_buffer_shape{3}; // 3 uint32 values for dispatch dimensions + indirect_buffer = context.CreateGPUTensor(DataTypeImpl::GetType(), indirect_buffer_shape); + indirect_buffer_ptr = &indirect_buffer; + // Use the fused CopyKVCache that also prepares the indirect dispatch buffer + ORT_RETURN_IF_ERROR(CopyKVCache(context, parameters, K, past_key, present_key, V, past_value, present_value, tile_size, seqlen_k, indirect_buffer_ptr)); + } else { + // Use the original CopyKVCache without indirect dispatch preparation + ORT_RETURN_IF_ERROR(CopyKVCache(context, parameters, K, past_key, present_key, V, past_value, present_value, tile_size, seqlen_k, nullptr)); + } + // The metadata is used to store the max and sum of each tile. 
const TensorShapeVector metadata_dims({parameters.batch_size_, parameters.num_heads_, num_present_sequence_length_tile, 2}); const TensorShape metadata_shape(metadata_dims); Tensor metadata = context.CreateGPUTensor(DataTypeImpl::GetType(), metadata_shape); - ORT_RETURN_IF_ERROR(ComputeFlashAttentionDecodeQKT(context, Q, attention_bias, &qk, present_key, &metadata, - parameters, num_total_seq_length_tile, num_present_sequence_length_tile, tile_size, + ORT_RETURN_IF_ERROR(ComputeFlashAttentionDecodeQKT(context, Q, attention_bias, &qk, present_key, &metadata, seqlen_k, + parameters, indirect_buffer_ptr, num_total_seq_length_tile, + num_present_sequence_length_tile, tile_size, use_indirect_dispatch, present_sequence_length)); - const TensorShapeVector out_split_vx_dims({parameters.batch_size_, parameters.num_heads_, num_present_sequence_length_tile, parameters.head_size_}); + const TensorShapeVector out_split_vx_dims({parameters.batch_size_, parameters.num_heads_, + num_present_sequence_length_tile, parameters.head_size_}); const TensorShape out_split_vx_shape(out_split_vx_dims); Tensor out_split_vx = context.CreateGPUTensor(Q->DataType(), out_split_vx_shape); - ORT_RETURN_IF_ERROR(ComputeFlashAttentionDecodeSplitVxScore(context, &metadata, &qk, &out_split_vx, present_value, parameters, - num_total_seq_length_tile, num_present_sequence_length_tile, tile_size, present_sequence_length)); - ORT_RETURN_IF_ERROR(ComputeFlashAttentionDecodeVxReduce(context, &out_split_vx, output, parameters, num_total_seq_length_tile, num_present_sequence_length_tile)); + ORT_RETURN_IF_ERROR(ComputeFlashAttentionDecodeSplitVxScore(context, &metadata, &qk, &out_split_vx, present_value, + seqlen_k, parameters, indirect_buffer_ptr, + num_total_seq_length_tile, + num_present_sequence_length_tile, tile_size, + use_indirect_dispatch, present_sequence_length)); + ORT_RETURN_IF_ERROR(ComputeFlashAttentionDecodeVxReduce(context, &out_split_vx, output, seqlen_k, parameters, + num_total_seq_length_tile, + num_present_sequence_length_tile, tile_size, use_indirect_dispatch)); return Status::OK(); } diff --git a/onnxruntime/contrib_ops/webgpu/bert/flash_attention.h b/onnxruntime/contrib_ops/webgpu/bert/flash_attention.h index c75494df253c1..7d71dc0f4d42d 100644 --- a/onnxruntime/contrib_ops/webgpu/bert/flash_attention.h +++ b/onnxruntime/contrib_ops/webgpu/bert/flash_attention.h @@ -17,19 +17,24 @@ using namespace onnxruntime::webgpu; class CopyKVCacheProgram final : public Program { public: - CopyKVCacheProgram(const std::string& kernel_name, bool has_past, bool kv_BNSH, bool past_present_share_buffer) - : Program{kernel_name}, has_past_(has_past), kv_BNSH_(kv_BNSH), past_present_share_buffer_(past_present_share_buffer) { + CopyKVCacheProgram(const std::string& kernel_name, bool has_past, bool kv_BNSH, bool past_present_share_buffer, + bool prepare_indirect_dispatch = false) + : Program{kernel_name}, has_past_(has_past), kv_BNSH_(kv_BNSH), past_present_share_buffer_(past_present_share_buffer), prepare_indirect_dispatch_(prepare_indirect_dispatch) { } Status GenerateShaderCode(ShaderHelper& sh) const override; WEBGPU_PROGRAM_DEFINE_UNIFORM_VARIABLES({"copy_size", ProgramUniformVariableDataType::Uint32}, - {"past_sequence_length", ProgramUniformVariableDataType::Uint32}); + {"total_sequence_length", ProgramUniformVariableDataType::Uint32}, + {"kv_sequence_length", ProgramUniformVariableDataType::Uint32}, + {"tile_size", ProgramUniformVariableDataType::Uint32}, + {"num_heads", ProgramUniformVariableDataType::Uint32}); private: 
bool has_past_; bool kv_BNSH_; bool past_present_share_buffer_; + bool prepare_indirect_dispatch_; }; class FlashAttentionProgram final : public Program { @@ -75,8 +80,8 @@ class FlashAttentionProgram final : public Program { class FlashAttentionDecodeQKTProgram final : public Program { public: FlashAttentionDecodeQKTProgram(const std::string& kernel_name, - bool has_attention_bias, uint32_t tile_size) - : Program{kernel_name}, has_attention_bias_(has_attention_bias), tile_size_(tile_size) { + bool has_attention_bias, uint32_t tile_size, bool use_indirect_dispatch) + : Program{kernel_name}, has_attention_bias_(has_attention_bias), tile_size_(tile_size), use_indirect_dispatch_(use_indirect_dispatch) { } Status GenerateShaderCode(ShaderHelper& sh) const override; @@ -86,19 +91,19 @@ class FlashAttentionDecodeQKTProgram final : public Program { public: - FlashAttentionDecodeSplitVxProgram(const std::string& kernel_name, uint32_t tile_size, int head_size_vec) - : Program{kernel_name}, tile_size_(tile_size), head_size_vec_(head_size_vec) { + FlashAttentionDecodeSplitVxProgram(const std::string& kernel_name, uint32_t tile_size, int head_size_vec, bool use_indirect_dispatch) + : Program{kernel_name}, tile_size_(tile_size), head_size_vec_(head_size_vec), use_indirect_dispatch_(use_indirect_dispatch) { } Status GenerateShaderCode(ShaderHelper& sh) const override; @@ -107,19 +112,19 @@ class FlashAttentionDecodeSplitVxProgram final : public Program { public: - FlashAttentionDecodeVxReduceProgram(const std::string& kernel_name, uint32_t tile_size) - : Program{kernel_name}, tile_size_(tile_size) { + FlashAttentionDecodeVxReduceProgram(const std::string& kernel_name, uint32_t tile_size, uint32_t seq_tile_size, bool use_indirect_dispatch) + : Program{kernel_name}, tile_size_(tile_size), seq_tile_size_(seq_tile_size), use_indirect_dispatch_(use_indirect_dispatch) { } Status GenerateShaderCode(ShaderHelper& sh) const override; @@ -132,11 +137,13 @@ class FlashAttentionDecodeVxReduceProgram final : public Program tile_qk: array; $MAIN { let local_row = u32(local_idx / tile_size_k_vec); let local_col = local_idx % tile_size_k_vec; - let total_seq_offset = (workgroup_idx % uniforms.num_total_seq_length_tile) * tile_size; - let head_idx = u32(workgroup_idx / uniforms.num_total_seq_length_tile); +#if use_indirect_dispatch + let total_sequence_length = u32(seqlens_k[0]) + 1u; +#else + let total_sequence_length = uniforms.total_sequence_length; +#endif + let num_total_seq_length_tile = (total_sequence_length + tile_size - 1) / tile_size; + let total_seq_offset = (workgroup_idx % num_total_seq_length_tile) * tile_size; + let head_idx = u32(workgroup_idx / num_total_seq_length_tile); let q_offset = head_idx * uniforms.head_size_vec; - var total_sequence_length = uniforms.total_sequence_length; let present_offset = u32(head_idx / uniforms.n_reps) * uniforms.present_sequence_length * uniforms.head_size_vec; for (var k: u32 = 0u; k < uniforms.head_size_vec; k += tile_size_k_vec) { if (local_idx < tile_size_k_vec && k + local_idx < uniforms.head_size_vec) { @@ -95,7 +101,7 @@ $MAIN { for (var i = 0u; i < tile_size && (total_seq_offset + i) < total_sequence_length; i++) { l_sum += exp(f32(tile_qk[i]) - l_max); } - let meta_offset = head_idx * uniforms.num_present_sequence_length_tile + workgroup_idx % uniforms.num_total_seq_length_tile; + let meta_offset = head_idx * uniforms.num_present_sequence_length_tile + workgroup_idx % num_total_seq_length_tile; metadata[meta_offset] = metadata_value_t(l_max, l_sum); } } diff 
--git a/onnxruntime/contrib_ops/webgpu/bert/flash_attention_decode_split_vx.wgsl.template b/onnxruntime/contrib_ops/webgpu/bert/flash_attention_decode_split_vx.wgsl.template index c7593af311ce2..37cf7e8f11b1f 100644 --- a/onnxruntime/contrib_ops/webgpu/bert/flash_attention_decode_split_vx.wgsl.template +++ b/onnxruntime/contrib_ops/webgpu/bert/flash_attention_decode_split_vx.wgsl.template @@ -5,6 +5,7 @@ #param head_size_vec #param tile_size_k_vec #param sub_tile_count +#param use_indirect_dispatch // Note that this shader adopts similar algorithm with dp4a generation shader. // @@ -40,9 +41,14 @@ var qkv_values: array, $MAIN { let local_row = u32(local_idx / tile_size_k_vec); let local_col = local_idx % tile_size_k_vec; - let total_seq_offset = (workgroup_idx % uniforms.num_total_seq_length_tile) * tile_size; - let head_idx = u32(workgroup_idx / uniforms.num_total_seq_length_tile); - var total_sequence_length = uniforms.total_sequence_length; + #if use_indirect_dispatch + let total_sequence_length = u32(seqlens_k[0]) + 1u; + #else + let total_sequence_length = uniforms.total_sequence_length; + #endif + let num_total_seq_length_tile = (total_sequence_length + tile_size - 1) / tile_size; + let total_seq_offset = (workgroup_idx % num_total_seq_length_tile) * tile_size; + let head_idx = u32(workgroup_idx / num_total_seq_length_tile); let present_offset = u32(head_idx / uniforms.n_reps) * head_size_vec * uniforms.present_sequence_length; // Calculate the global max and sum in qk. @@ -50,12 +56,12 @@ $MAIN { { var g_max = f32(-3.402823e+38f); var g_sum = f32(0); - for (var i = 0u; i < uniforms.num_total_seq_length_tile; i++) + for (var i = 0u; i < num_total_seq_length_tile; i++) { let meta_offset = head_idx * uniforms.num_present_sequence_length_tile + i; g_max = max(g_max, metadata[meta_offset].x); } - for (var i = 0u; i < uniforms.num_total_seq_length_tile; i++) + for (var i = 0u; i < num_total_seq_length_tile; i++) { let meta_offset = head_idx * uniforms.num_present_sequence_length_tile + i; let m_value = metadata[meta_offset]; @@ -95,7 +101,7 @@ $MAIN { } for (var i = local_idx; i < head_size_vec; i += workgroup_size_x) { - let out_offset = head_idx * uniforms.num_present_sequence_length_tile * head_size_vec + (workgroup_idx % uniforms.num_total_seq_length_tile) * head_size_vec + i; + let out_offset = head_idx * uniforms.num_present_sequence_length_tile * head_size_vec + (workgroup_idx % num_total_seq_length_tile) * head_size_vec + i; out_split_vx[out_offset] = tile_output[i]; } } diff --git a/onnxruntime/contrib_ops/webgpu/bert/flash_attention_decode_vx_reduce.wgsl.template b/onnxruntime/contrib_ops/webgpu/bert/flash_attention_decode_vx_reduce.wgsl.template index a4381baa638ce..22f18655307de 100644 --- a/onnxruntime/contrib_ops/webgpu/bert/flash_attention_decode_vx_reduce.wgsl.template +++ b/onnxruntime/contrib_ops/webgpu/bert/flash_attention_decode_vx_reduce.wgsl.template @@ -1,7 +1,9 @@ // Copyright (c) Microsoft Corporation. All rights reserved. // Licensed under the MIT License. +#param seq_tile_size #param tile_size +#param use_indirect_dispatch // Inputs are splits of the GQA output, split into num_total_seq_length_tiles // rows. 
This shader needs to add these splits across the row dimension to @@ -23,10 +25,16 @@ $MAIN { var value = output_value_t(0); let local_row = u32(local_idx / tile_size); let local_col = local_idx % tile_size; + #if use_indirect_dispatch + let total_sequence_length = u32(seqlens_k[0]) + 1u; + let num_total_seq_length_tile = (total_sequence_length + seq_tile_size - 1) / seq_tile_size; + #else + let num_total_seq_length_tile = uniforms.num_total_seq_length_tile; + #endif if (head_size_offset + local_col < uniforms.head_size_vec) { - for (var r = 0u; r < uniforms.num_total_seq_length_tile; r += tile_size) { - if (r + local_row < uniforms.num_total_seq_length_tile) { + for (var r = 0u; r < num_total_seq_length_tile; r += tile_size) { + if (r + local_row < num_total_seq_length_tile) { value += input[in_offset + (r + local_row) * uniforms.head_size_vec + head_size_offset + local_col]; } } diff --git a/onnxruntime/contrib_ops/webgpu/bert/group_query_attention.cc b/onnxruntime/contrib_ops/webgpu/bert/group_query_attention.cc index 8b7b257dd2852..cb845061404f3 100644 --- a/onnxruntime/contrib_ops/webgpu/bert/group_query_attention.cc +++ b/onnxruntime/contrib_ops/webgpu/bert/group_query_attention.cc @@ -206,7 +206,7 @@ Status GroupQueryAttention::ComputeInternal(onnxruntime::webgpu::ComputeContext& !use_sliding_window && CanApplyFlashAttention(attention_bias, present_key, present_value, parameters, context)) { return ApplyFlashAttention(query, key, value, attention_bias, output, past_key, present_key, past_value, - present_value, parameters, context); + present_value, parameters, context, seqlen_k); } Tensor qSplit; diff --git a/onnxruntime/core/providers/webgpu/compute_context.h b/onnxruntime/core/providers/webgpu/compute_context.h index 315115390ff23..c4a88754deffe 100644 --- a/onnxruntime/core/providers/webgpu/compute_context.h +++ b/onnxruntime/core/providers/webgpu/compute_context.h @@ -8,6 +8,7 @@ #include #include "core/framework/execution_provider.h" +#include "core/providers/webgpu/webgpu_execution_provider.h" #include "core/providers/webgpu/program.h" #include "core/providers/webgpu/webgpu_context.h" @@ -16,7 +17,6 @@ namespace onnxruntime { class Tensor; -class WebGpuExecutionProvider; namespace webgpu { @@ -42,6 +42,9 @@ class ComputeContext { inline bool HasFeature(wgpu::FeatureName feature) const { return webgpu_context_.DeviceHasFeature(feature); } + inline bool IsGraphCaptureEnabled() const { + return ep_.IsGraphCaptureEnabled(); + } #if !defined(__wasm__) inline const wgpu::AdapterPropertiesSubgroupMatrixConfigs& SubgroupMatrixConfigs() const { return webgpu_context_.SubgroupMatrixConfigs(); From 5d786dfd39c4a2b561139a1cc6509d0f5d27eeef Mon Sep 17 00:00:00 2001 From: Ted Themistokleous <107195283+TedThemistokleous@users.noreply.github.com> Date: Tue, 14 Oct 2025 13:20:00 -0400 Subject: [PATCH 14/19] [MIGraphX EP ] Add support for QLinearAveragePool and QLinearGlobalAveragePool (#26162) ### Description Add support for MIgraphX EP operators QLinearGlobalAveragePool and QLinaerAveragePool ops ### Motivation and Context Want support for these operators through MIGraphX EP and MIGraphX --- .../core/providers/migraphx/migraphx_execution_provider.cc | 2 ++ 1 file changed, 2 insertions(+) diff --git a/onnxruntime/core/providers/migraphx/migraphx_execution_provider.cc b/onnxruntime/core/providers/migraphx/migraphx_execution_provider.cc index 239a5054801bc..55f901164bdac 100644 --- a/onnxruntime/core/providers/migraphx/migraphx_execution_provider.cc +++ 
b/onnxruntime/core/providers/migraphx/migraphx_execution_provider.cc @@ -953,6 +953,8 @@ GetUnsupportedNodeIndices(const GraphViewer& graph_viewer, "QLinearAdd", "QLinearConv", "QLinearMatMul", + "QLinearAveragePool", + "QLinearGlobalAveragePool", "QuantizeLinear", "QuickGelu", "DynamicQuantizeLinear", From 04ed484f739b0807b47ad3abed67da63218ae9f0 Mon Sep 17 00:00:00 2001 From: Wanming Lin Date: Wed, 15 Oct 2025 01:21:24 +0800 Subject: [PATCH 15/19] [WebNN] Fix some issues in reduction ops (#26289) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - Allow empty axes input - When axes is empty and ‘noop_with_empty_axes’ is true, WebNN should set axes to [] - Simplify the code --- js/web/test/suite-test-list.jsonc | 110 +++++++++--------- .../core/providers/webnn/builders/helper.h | 2 +- .../builders/impl/reduction_op_builder.cc | 102 ++++++++-------- 3 files changed, 105 insertions(+), 109 deletions(-) diff --git a/js/web/test/suite-test-list.jsonc b/js/web/test/suite-test-list.jsonc index 3f1face2a043c..80991a3ebbb5f 100644 --- a/js/web/test/suite-test-list.jsonc +++ b/js/web/test/suite-test-list.jsonc @@ -2147,66 +2147,66 @@ "test_reduce_log_sum_default", "test_reduce_log_sum_desc_axes", // tests "test_reduce_log_sum_exp_*" on opset17/opset18 are excluded because they use float64. - // "opset{7,8,9}/test_reduce_log_sum_exp_default_axes_keepdims_example", - // "opset{7,8,9}/test_reduce_log_sum_exp_default_axes_keepdims_random", - // "opset{7,8,9}/test_reduce_log_sum_exp_do_not_keepdims_example", - // "opset{7,8,9}/test_reduce_log_sum_exp_do_not_keepdims_random", - // "opset{7,8,9}/test_reduce_log_sum_exp_keepdims_example", - // "opset{7,8,9}/test_reduce_log_sum_exp_keepdims_random", - // "opset11/test_reduce_log_sum_exp_negative_axes_keepdims_example", - // "opset11/test_reduce_log_sum_exp_negative_axes_keepdims_random", + "opset{7,8,9}/test_reduce_log_sum_exp_default_axes_keepdims_example", + "opset{7,8,9}/test_reduce_log_sum_exp_default_axes_keepdims_random", + "opset{7,8,9}/test_reduce_log_sum_exp_do_not_keepdims_example", + "opset{7,8,9}/test_reduce_log_sum_exp_do_not_keepdims_random", + "opset{7,8,9}/test_reduce_log_sum_exp_keepdims_example", + "opset{7,8,9}/test_reduce_log_sum_exp_keepdims_random", + "opset11/test_reduce_log_sum_exp_negative_axes_keepdims_example", + "opset11/test_reduce_log_sum_exp_negative_axes_keepdims_random", "test_reduce_log_sum_negative_axes", "test_reduce_log_sum", "test_reduce_max_default_axes_keepdim_example", - // "test_reduce_max_default_axes_keepdims_random", - // "test_reduce_max_do_not_keepdims_example", - // "test_reduce_max_do_not_keepdims_random", - // "test_reduce_max_keepdims_example", - // "test_reduce_max_keepdims_random", - // "test_reduce_max_negative_axes_keepdims_example", - // "test_reduce_max_negative_axes_keepdims_random", - // "test_reduce_mean_default_axes_keepdims_example", - // "test_reduce_mean_default_axes_keepdims_random", - // "test_reduce_mean_do_not_keepdims_example", - // "test_reduce_mean_do_not_keepdims_random", - // "test_reduce_mean_keepdims_example", - // "test_reduce_mean_keepdims_random", - // "test_reduce_mean_negative_axes_keepdims_example", - // "test_reduce_mean_negative_axes_keepdims_random", - // "test_reduce_min_default_axes_keepdims_example", - // "test_reduce_min_default_axes_keepdims_random", - // "test_reduce_min_do_not_keepdims_example", - // "test_reduce_min_do_not_keepdims_random", - // "test_reduce_min_keepdims_example", - // "test_reduce_min_keepdims_random", - // 
"test_reduce_min_negative_axes_keepdims_example", - // "test_reduce_min_negative_axes_keepdims_random", - // "test_reduce_prod_default_axes_keepdims_example", - // "test_reduce_prod_default_axes_keepdims_random", - // "test_reduce_prod_do_not_keepdims_example", - // "test_reduce_prod_do_not_keepdims_random", - // "test_reduce_prod_keepdims_example", - // "test_reduce_prod_keepdims_random", - // "test_reduce_prod_negative_axes_keepdims_example", - // "test_reduce_prod_negative_axes_keepdims_random", - // "test_reduce_sum_default_axes_keepdims_example", - // "test_reduce_sum_default_axes_keepdims_random", - // "test_reduce_sum_do_not_keepdims_example", - // "test_reduce_sum_do_not_keepdims_random", + "test_reduce_max_default_axes_keepdims_random", + "test_reduce_max_do_not_keepdims_example", + "test_reduce_max_do_not_keepdims_random", + "test_reduce_max_keepdims_example", + "test_reduce_max_keepdims_random", + "test_reduce_max_negative_axes_keepdims_example", + "test_reduce_max_negative_axes_keepdims_random", + "test_reduce_mean_default_axes_keepdims_example", + "test_reduce_mean_default_axes_keepdims_random", + "test_reduce_mean_do_not_keepdims_example", + "test_reduce_mean_do_not_keepdims_random", + "test_reduce_mean_keepdims_example", + "test_reduce_mean_keepdims_random", + "test_reduce_mean_negative_axes_keepdims_example", + "test_reduce_mean_negative_axes_keepdims_random", + "test_reduce_min_default_axes_keepdims_example", + "test_reduce_min_default_axes_keepdims_random", + "test_reduce_min_do_not_keepdims_example", + "test_reduce_min_do_not_keepdims_random", + "test_reduce_min_keepdims_example", + "test_reduce_min_keepdims_random", + "test_reduce_min_negative_axes_keepdims_example", + "test_reduce_min_negative_axes_keepdims_random", + "test_reduce_prod_default_axes_keepdims_example", + "test_reduce_prod_default_axes_keepdims_random", + "test_reduce_prod_do_not_keepdims_example", + "test_reduce_prod_do_not_keepdims_random", + "test_reduce_prod_keepdims_example", + "test_reduce_prod_keepdims_random", + "test_reduce_prod_negative_axes_keepdims_example", + "test_reduce_prod_negative_axes_keepdims_random", + "test_reduce_sum_default_axes_keepdims_example", + "test_reduce_sum_default_axes_keepdims_random", + "test_reduce_sum_do_not_keepdims_example", + "test_reduce_sum_do_not_keepdims_random", "test_reduce_sum_empty_axes_input_noop_example", "test_reduce_sum_empty_axes_input_noop_random", - // "test_reduce_sum_keepdims_example", - // "test_reduce_sum_keepdims_random", - // "test_reduce_sum_negative_axes_keepdims_example", - // "test_reduce_sum_negative_axes_keepdims_random", - // "test_reduce_sum_square_default_axes_keepdims_example", - // "test_reduce_sum_square_default_axes_keepdims_random", - // "test_reduce_sum_square_do_not_keepdims_example", - // "test_reduce_sum_square_do_not_keepdims_random", - // "test_reduce_sum_square_keepdims_example", - // "test_reduce_sum_square_keepdims_random", - // "test_reduce_sum_square_negative_axes_keepdims_example", - // "test_reduce_sum_square_negative_axes_keepdims_random", + "test_reduce_sum_keepdims_example", + "test_reduce_sum_keepdims_random", + "test_reduce_sum_negative_axes_keepdims_example", + "test_reduce_sum_negative_axes_keepdims_random", + "test_reduce_sum_square_default_axes_keepdims_example", + "test_reduce_sum_square_default_axes_keepdims_random", + "test_reduce_sum_square_do_not_keepdims_example", + "test_reduce_sum_square_do_not_keepdims_random", + "test_reduce_sum_square_keepdims_example", + "test_reduce_sum_square_keepdims_random", + 
"test_reduce_sum_square_negative_axes_keepdims_example", + "test_reduce_sum_square_negative_axes_keepdims_random", // "test_reflect_pad", "test_relu", "test_reshape_allowzero_reordered", diff --git a/onnxruntime/core/providers/webnn/builders/helper.h b/onnxruntime/core/providers/webnn/builders/helper.h index baedb98a34c28..fbabc23504636 100644 --- a/onnxruntime/core/providers/webnn/builders/helper.h +++ b/onnxruntime/core/providers/webnn/builders/helper.h @@ -38,7 +38,7 @@ WebnnDeviceType DeviceTypeFromString(const std::string_view& device_type); // Collects all the initializer tensors in the subGraph and its ancestor graphs. InitializedTensorSet CollectAllInitializedTensors(const GraphViewer& graph_viewer); -inline std::vector HandleNegativeAxes(const std::vector& axes, size_t input_size) { +inline std::vector HandleNegativeAxes(const gsl::span axes, size_t input_size) { std::vector new_axes(axes.size()); for (size_t i = 0; i < axes.size(); ++i) { new_axes[i] = HandleNegativeAxis(axes[i], input_size); diff --git a/onnxruntime/core/providers/webnn/builders/impl/reduction_op_builder.cc b/onnxruntime/core/providers/webnn/builders/impl/reduction_op_builder.cc index 6ea9b0a440d93..d07e636d578b1 100644 --- a/onnxruntime/core/providers/webnn/builders/impl/reduction_op_builder.cc +++ b/onnxruntime/core/providers/webnn/builders/impl/reduction_op_builder.cc @@ -19,6 +19,8 @@ namespace webnn { class ReductionOpBuilder : public BaseOpBuilder { // Add operator related. public: + // Allow axes potentially being empty inputs that are ignored during processing. + ReductionOpBuilder() : BaseOpBuilder(/*allow empty inputs*/ true) {} void AddInitializersToSkip(ModelBuilder& model_builder, const Node& node) const override; // Add operator related. @@ -37,6 +39,7 @@ void ReductionOpBuilder::AddInitializersToSkip(ModelBuilder& model_builder, cons const auto& input_defs = node.InputDefs(); if (input_defs.size() > 1) { model_builder.AddInitializerToSkip(input_defs[1]->Name()); // axes + model_builder.AddInputToSkip(input_defs[1]->Name()); // axes } } @@ -53,71 +56,50 @@ Status ReductionOpBuilder::AddToModelBuilderImpl(ModelBuilder& model_builder, NodeAttrHelper helper(node); const auto keep_dims = helper.Get("keepdims", 1); + emscripten::val options = emscripten::val::object(); options.set("label", node.Name()); options.set("keepDimensions", keep_dims == 1); - std::vector axes_data; - - emscripten::val output = emscripten::val::object(); + std::vector axes_data; const auto opset = node.SinceVersion(); const auto& op_type = node.OpType(); if (opset >= 18 || (op_type == "ReduceSum" && opset >= 13)) { // 'axes' is an optional input. - const auto noop_with_empty_axes = helper.Get("noop_with_empty_axes", 0); - if (!GetTensorName(input_defs, 1).empty()) { - // Optional input axes is provided, use axes initializer data. - const auto& initializers(model_builder.GetInitializerTensors()); - const auto& axes_tensor = *initializers.at(input_defs[1]->Name()); - Initializer axes_initializer(axes_tensor); - const auto axes_data_span = axes_initializer.DataAsSpan(); - std::transform( - axes_data_span.begin(), axes_data_span.end(), std::back_inserter(axes_data), - [input_rank](int64_t axis) -> int32_t { return SafeInt(HandleNegativeAxis(axis, input_rank)); }); - } else { - if (noop_with_empty_axes) { - // When axes is empty and this attribute is set to true, input tensor will not be reduced. 
- output = input; - model_builder.AddOperand(node.OutputDefs()[0]->Name(), std::move(output)); - return Status::OK(); + std::vector axes_shape; + if (TensorExists(input_defs, 1)) { + ORT_RETURN_IF_NOT(GetShape(*input_defs[1], axes_shape, logger), "Cannot get shape of input axes"); + if (axes_shape[0] != 0) { + // Optional input axes is provided and we already ensure it is an initializer. + // Use that initializer data. + const auto& initializers(model_builder.GetInitializerTensors()); + const auto& axes_tensor = *initializers.at(input_defs[1]->Name()); + Initializer axes_initializer(axes_tensor); + const auto axes_data_span = axes_initializer.DataAsSpan(); + axes_data = HandleNegativeAxes(axes_data_span, input_rank); } } } else { if (helper.HasAttr("axes")) { - auto axes = helper.Get("axes", std::vector{}); - std::transform( - axes.begin(), axes.end(), std::back_inserter(axes_data), - [input_rank](int64_t axis) -> int32_t { return SafeInt(HandleNegativeAxis(axis, input_rank)); }); + axes_data = GetResolvedAxes(helper, input_rank); } } - if (axes_data.size() > 0) { - options.set("axes", emscripten::val::array(axes_data)); - } - if (op_type == "ReduceL1") { - output = model_builder.GetBuilder().call("reduceL1", input, options); - } else if (op_type == "ReduceL2") { - output = model_builder.GetBuilder().call("reduceL2", input, options); - } else if (op_type == "ReduceLogSum") { - output = model_builder.GetBuilder().call("reduceLogSum", input, options); - } else if (op_type == "ReduceLogSumExp") { - output = model_builder.GetBuilder().call("reduceLogSumExp", input, options); - } else if (op_type == "ReduceMax") { - output = model_builder.GetBuilder().call("reduceMax", input, options); - } else if (op_type == "ReduceMean") { - output = model_builder.GetBuilder().call("reduceMean", input, options); - } else if (op_type == "ReduceMin") { - output = model_builder.GetBuilder().call("reduceMin", input, options); - } else if (op_type == "ReduceProd") { - output = model_builder.GetBuilder().call("reduceProduct", input, options); - } else if (op_type == "ReduceSum") { - output = model_builder.GetBuilder().call("reduceSum", input, options); - } else if (op_type == "ReduceSumSquare") { - output = model_builder.GetBuilder().call("reduceSumSquare", input, options); - } else { - return ORT_MAKE_STATUS(ONNXRUNTIME, INVALID_ARGUMENT, "ReductionOpBuilder, unknown op: ", op_type); + // When axes is not provided or is empty, check the 'noop_with_empty_axes' attribute: + // - If it is false, perform reduction over all dimensions. + // (In WebNN, this means the 'axes' option is not set.) + // - If it is true, no reduction is applied, but other operations are still performed. + // (In WebNN, this requires setting 'axes' to an empty array.) 
+ if (!axes_data.empty() || helper.Get("noop_with_empty_axes", 0) == 1) { + options.set("axes", emscripten::val::array(GetNarrowedIntFromInt64(axes_data))); } + const std::string_view webnn_op_type = GetWebNNOpType(op_type); + ORT_RETURN_IF(webnn_op_type.empty(), "Cannot get WebNN op type"); + + emscripten::val output = model_builder.GetBuilder().call( + std::string(webnn_op_type).c_str(), input, options); + model_builder.AddOperand(node.OutputDefs()[0]->Name(), std::move(output)); return Status::OK(); } @@ -128,11 +110,25 @@ bool ReductionOpBuilder::IsOpSupportedImpl(const GraphViewer& graph_viewer, const WebnnDeviceType /* device_type */, const logging::Logger& logger) const { const auto& input_defs = node.InputDefs(); - const std::string axes_name = GetTensorName(input_defs, 1); - // If the optional input 'axes' is provided, it must be an initializer. - if (!axes_name.empty() && !graph_viewer.GetConstantInitializer(axes_name)) { - LOGS(logger, VERBOSE) << "Input axes of " << node.OpType() << " must be a constant"; - return false; + + if (TensorExists(input_defs, 1)) { + std::vector axes_shape; + if (!GetShape(*input_defs[1], axes_shape, logger)) { + LOGS(logger, VERBOSE) << "Cannot get shape of input axes"; + return false; + } + + if (axes_shape.size() != 1) { + LOGS(logger, VERBOSE) << "Input axes of " << node.OpType() << " must be 1D"; + return false; + } + + const std::string axes_name = GetTensorName(input_defs, 1); + // If the optional input 'axes' is provided and not empty, it must be an initializer. + if (axes_shape[0] != 0 && !graph_viewer.GetConstantInitializer(axes_name)) { + LOGS(logger, VERBOSE) << "Input axes of " << node.OpType() << " must be a constant"; + return false; + } } return true; From aafdb3a517d03b270b1bd9d648615d8bbfd7d619 Mon Sep 17 00:00:00 2001 From: Changming Sun Date: Tue, 14 Oct 2025 11:10:57 -0700 Subject: [PATCH 16/19] Fix shape inference failure with in-memory external data (#26263) ## Description Fixes #26261 This PR resolves a regression introduced in v1.23.0 where models with Constant nodes containing tensors larger than 127 bytes fail to load with a shape inference error. ### Root Cause Commit 3b97d79b3c (PR #25320) introduced an optimization to convert large Constant node tensors (> 127 bytes) into OrtValues with in-memory external data references for better memory management. However, ONNX shape inference cannot distinguish between in-memory and file-based external data, and rejects any TensorProto with `data_location = EXTERNAL`. ### The Fix Modified `InferenceContextImpl::getInputData()` to: 1. Detect tensors with in-memory external data using `utils::HasExternalDataInMemory()` 2. Retrieve the corresponding OrtValue 3. Create a temporary TensorProto with embedded data (not external reference) 4. Provide this temporary proto to ONNX shape inference This allows ONNX shape inference to access the actual tensor data without rejecting it as external. ### Memory Impact This fix introduces a minor and temporary increase in memory usage during the model loading phase. - **When:** The additional memory is allocated only when the shape inference engine needs to access the data of a constant tensor that is larger than 127 bytes. This is a one-time event during the initial analysis of the model. - **What:** The fix creates a temporary in-memory copy of the tensor data. - **Duration:** This temporary copy is released as soon as shape inference is complete. The impact on the overall peak memory usage of the application is expected to be negligible. 
The memory usage during inference is not affected. While it is theoretically possible for the temporary tensor to be large if a multi-gigabyte constant tensor is used for shape inference, this is a highly unlikely scenario in practice for well-designed models. ### Testing - Tested with the problematic model from issue #26261 - All optimization levels now work correctly (DISABLE_ALL, BASIC, EXTENDED, ALL) - Unit tests to be added ### Changes - **onnxruntime/core/graph/graph.cc**: - Modified `getInputData()` method in `InferenceContextImpl` class - Added `temp_tensor_protos_` member to store temporary TensorProtos during shape inference ## TODO - [ ] Add unit tests - [ ] Run full test suite --------- Co-authored-by: Dmitri Smirnov --- onnxruntime/core/graph/graph.cc | 26 +++ onnxruntime/test/ir/graph_test.cc | 258 ++++++++++++++++++++++++++++++ 2 files changed, 284 insertions(+) diff --git a/onnxruntime/core/graph/graph.cc b/onnxruntime/core/graph/graph.cc index 3f6443aa73d4c..8b599dc86d997 100644 --- a/onnxruntime/core/graph/graph.cc +++ b/onnxruntime/core/graph/graph.cc @@ -2678,6 +2678,27 @@ class InferenceContextImpl : public ONNX_NAMESPACE::InferenceContext { // only return data if it's for a constant initializer. checks for outer scope initializers // if this is a subgraph and the name isn't found locally. const TensorProto* initializer = graph_.GetConstantInitializer(def->Name(), true); + if (initializer != nullptr) { + // Check if this is in-memory external data (data stored in OrtValue) + // ONNX shape inference cannot handle external data, so we need to materialize it + if (utils::HasExternalDataInMemory(*initializer)) { + // Try to get the OrtValue for this initializer + OrtValue ort_value; + if (graph_.GetOrtValueInitializer(def->Name(), ort_value, true)) { + // Create a temporary TensorProto with the actual data from the OrtValue + // This allows ONNX shape inference to access the data + const Tensor& tensor = ort_value.Get(); + auto temp_tensor_proto = utils::TensorToTensorProto(tensor, initializer->name(), /*use_tensor_buffer=*/false); + // Store the temporary proto so it outlives this call, maintain pointers steady + temp_tensor_protos_.push_back(std::make_unique(std::move(temp_tensor_proto))); + return temp_tensor_protos_.back().get(); + } else { + // If we can't get the OrtValue, it is a bug + ORT_THROW("Initializer ", def->Name(), + " has in-memory external data but cannot get OrtValue during shape inference"); + } + } + } return initializer; } @@ -2717,6 +2738,11 @@ class InferenceContextImpl : public ONNX_NAMESPACE::InferenceContext { std::vector> graph_inferencers_; const Graph& graph_; const Graph::ResolveOptions& options_; + // Temporary TensorProtos created for in-memory external data during shape inference + // These need to outlive the shape inference call, so we store them here + // Inference is per node and the instance of this context is on the stack, + // so this is safe. + mutable InlinedVector> temp_tensor_protos_; }; Status Graph::InferAndVerifySubgraphTypes(const Node& node, Graph& subgraph, diff --git a/onnxruntime/test/ir/graph_test.cc b/onnxruntime/test/ir/graph_test.cc index 4fd9830440846..7371ad5cf0ded 100644 --- a/onnxruntime/test/ir/graph_test.cc +++ b/onnxruntime/test/ir/graph_test.cc @@ -2,13 +2,17 @@ // Licensed under the MIT License. 
#include +#include #include "core/common/inlined_containers.h" #include "core/common/span_utils.h" #include "core/framework/tensorprotoutils.h" #include "core/graph/graph_viewer.h" #include "core/graph/model.h" #include "core/graph/op.h" +#include "core/session/inference_session.h" +#include "core/session/environment.h" #include "test/providers/provider_test_utils.h" +#include "test/test_environment.h" #include "gtest/gtest.h" #include "gmock/gmock.h" #include "onnx/defs/function.h" @@ -2573,5 +2577,259 @@ TEST_F(GraphTest, GraphConstruction_MemoryEfficientTopologicalSort_SubgraphGener #endif +// Test for shape inference with in-memory external data (issue #26261) +// This tests the fix for a regression where Constant nodes with large tensors (>127 bytes) +// stored as in-memory external data would cause shape inference to fail +TEST_F(GraphTest, ShapeInferenceWithInMemoryExternalData) { + // Create a model with a Constant node that produces a tensor larger than kSmallTensorExternalDataThreshold (127 bytes) + // This will trigger the in-memory externalization path + ModelProto model_proto; + model_proto.set_ir_version(ONNX_NAMESPACE::Version::IR_VERSION); + auto* opset = model_proto.add_opset_import(); + opset->set_version(17); + + auto* graph_proto = model_proto.mutable_graph(); + graph_proto->set_name("test_graph"); + + // Create a Constant node with a tensor of 16 INT64 values (128 bytes, just over the 127 threshold) + auto* constant_node = graph_proto->add_node(); + constant_node->set_op_type("Constant"); + constant_node->set_name("const_node"); + constant_node->add_output("const_output"); + + // Add the value attribute with a tensor + auto* attr = constant_node->add_attribute(); + attr->set_name("value"); + attr->set_type(ONNX_NAMESPACE::AttributeProto_AttributeType_TENSOR); + auto* tensor = attr->mutable_t(); + tensor->set_data_type(ONNX_NAMESPACE::TensorProto_DataType_INT64); + tensor->add_dims(16); // 16 elements * 8 bytes = 128 bytes + // Each split will be size 1, totaling 16 + for (int64_t i = 0; i < 16; ++i) { + tensor->add_int64_data(1); + } + + // Create a Split node that uses the constant as input + // Split requires constant input for the 'split' parameter, which triggers shape inference + auto* split_node = graph_proto->add_node(); + split_node->set_op_type("Split"); + split_node->set_name("split_node"); + split_node->add_input("input_data"); + split_node->add_input("const_output"); // Use constant as split sizes + for (int i = 0; i < 16; ++i) { + split_node->add_output("split_output_" + std::to_string(i)); + } + + // Add axis attribute + auto* axis_attr = split_node->add_attribute(); + axis_attr->set_name("axis"); + axis_attr->set_type(ONNX_NAMESPACE::AttributeProto_AttributeType_INT); + axis_attr->set_i(0); + + // Add graph input + auto* input = graph_proto->add_input(); + input->set_name("input_data"); + auto* input_type = input->mutable_type()->mutable_tensor_type(); + input_type->set_elem_type(ONNX_NAMESPACE::TensorProto_DataType_FLOAT); + input_type->mutable_shape()->add_dim()->set_dim_value(16); + input_type->mutable_shape()->add_dim()->set_dim_value(10); + + // Add graph outputs + for (int i = 0; i < 16; ++i) { + auto* output = graph_proto->add_output(); + output->set_name("split_output_" + std::to_string(i)); + } + + // Load the model - this should succeed with the fix + // Before the fix, this would fail with: + // "Cannot parse data from external tensors. 
Please load external data into raw data for tensor" + std::shared_ptr model; + ASSERT_STATUS_OK(Model::Load(std::move(model_proto), model, nullptr, *logger_)); + + // Verify the graph was properly constructed + Graph& graph = model->MainGraph(); + ASSERT_STATUS_OK(graph.Resolve()); + + // Verify the constant node was converted to an initializer + const ONNX_NAMESPACE::TensorProto* initializer = nullptr; + ASSERT_TRUE(graph.GetInitializedTensor("const_output", initializer)); + ASSERT_NE(initializer, nullptr); + + // Verify the Split node can access the constant data during shape inference + const Node* split_node_ptr = nullptr; + for (const auto& node : graph.Nodes()) { + if (node.Name() == "split_node") { + split_node_ptr = &node; + break; + } + } + ASSERT_NE(split_node_ptr, nullptr); + + // Verify outputs are properly shaped + ASSERT_EQ(split_node_ptr->OutputDefs().size(), 16u); +} + +// Test for shape inference with in-memory external data using InferenceSession +// This test more accurately reproduces the issue by going through the full session initialization +// which includes graph optimizations that trigger the in-memory externalization +TEST_F(GraphTest, ShapeInferenceWithInMemoryExternalDataViaSession) { + // Create the same model as above + ModelProto model_proto; + model_proto.set_ir_version(ONNX_NAMESPACE::Version::IR_VERSION); + auto* opset = model_proto.add_opset_import(); + opset->set_version(17); + + auto* graph_proto = model_proto.mutable_graph(); + graph_proto->set_name("test_graph"); + + // Create a Constant node with a tensor of 16 INT64 values (128 bytes) + auto* constant_node = graph_proto->add_node(); + constant_node->set_op_type("Constant"); + constant_node->set_name("const_node"); + constant_node->add_output("const_output"); + + auto* attr = constant_node->add_attribute(); + attr->set_name("value"); + attr->set_type(ONNX_NAMESPACE::AttributeProto_AttributeType_TENSOR); + auto* tensor = attr->mutable_t(); + tensor->set_data_type(ONNX_NAMESPACE::TensorProto_DataType_INT64); + tensor->add_dims(16); + for (int64_t i = 0; i < 16; ++i) { + tensor->add_int64_data(1); + } + + // Create a Split node + auto* split_node = graph_proto->add_node(); + split_node->set_op_type("Split"); + split_node->set_name("split_node"); + split_node->add_input("input_data"); + split_node->add_input("const_output"); + for (int i = 0; i < 16; ++i) { + split_node->add_output("split_output_" + std::to_string(i)); + } + + auto* axis_attr = split_node->add_attribute(); + axis_attr->set_name("axis"); + axis_attr->set_type(ONNX_NAMESPACE::AttributeProto_AttributeType_INT); + axis_attr->set_i(0); + + // Add graph input + auto* input = graph_proto->add_input(); + input->set_name("input_data"); + auto* input_type = input->mutable_type()->mutable_tensor_type(); + input_type->set_elem_type(ONNX_NAMESPACE::TensorProto_DataType_FLOAT); + input_type->mutable_shape()->add_dim()->set_dim_value(16); + input_type->mutable_shape()->add_dim()->set_dim_value(10); + + // Add graph outputs + for (int i = 0; i < 16; ++i) { + auto* output = graph_proto->add_output(); + output->set_name("split_output_" + std::to_string(i)); + } + + // Save to a temporary file + const std::string model_path = "test_in_memory_external_data.onnx"; + { + std::ofstream file(model_path, std::ios::binary); + ASSERT_TRUE(file.is_open()); + ASSERT_TRUE(model_proto.SerializeToOstream(&file)); + } + + // Test with ORT_DISABLE_ALL optimization which should trigger the bug without the fix + SessionOptions so; + so.graph_optimization_level = 
TransformerLevel::Default; // This triggers the issue + so.session_logid = "GraphTest.ShapeInferenceWithInMemoryExternalDataViaSession"; + + InferenceSession session_object{so, GetEnvironment()}; + + // This should succeed with the fix, fail without it + ASSERT_STATUS_OK(session_object.Load(model_path)); + ASSERT_STATUS_OK(session_object.Initialize()); + + // Clean up + std::remove(model_path.c_str()); +} + +// Test that explicitly triggers the in-memory externalization and then shape inference +// This test directly reproduces the bug scenario +TEST_F(GraphTest, ShapeInferenceAfterInitializerExternalization) { + // Create a model with a Split node that depends on a constant initializer + ModelProto model_proto; + model_proto.set_ir_version(ONNX_NAMESPACE::Version::IR_VERSION); + auto* opset = model_proto.add_opset_import(); + opset->set_version(17); + + auto* graph_proto = model_proto.mutable_graph(); + graph_proto->set_name("test_graph"); + + // Create initializer directly (not as Constant node) with 128 bytes + auto* initializer = graph_proto->add_initializer(); + initializer->set_name("split_sizes"); + initializer->set_data_type(ONNX_NAMESPACE::TensorProto_DataType_INT64); + initializer->add_dims(16); // 16 * 8 = 128 bytes + for (int64_t i = 0; i < 16; ++i) { + initializer->add_int64_data(1); + } + + // Create a Split node that uses this initializer + auto* split_node = graph_proto->add_node(); + split_node->set_op_type("Split"); + split_node->set_name("split_node"); + split_node->add_input("input_data"); + split_node->add_input("split_sizes"); // Uses the large initializer + for (int i = 0; i < 16; ++i) { + split_node->add_output("split_output_" + std::to_string(i)); + } + + auto* axis_attr = split_node->add_attribute(); + axis_attr->set_name("axis"); + axis_attr->set_type(ONNX_NAMESPACE::AttributeProto_AttributeType_INT); + axis_attr->set_i(0); + + // Add graph input + auto* input = graph_proto->add_input(); + input->set_name("input_data"); + auto* input_type = input->mutable_type()->mutable_tensor_type(); + input_type->set_elem_type(ONNX_NAMESPACE::TensorProto_DataType_FLOAT); + input_type->mutable_shape()->add_dim()->set_dim_value(16); + input_type->mutable_shape()->add_dim()->set_dim_value(10); + + // Add graph outputs + for (int i = 0; i < 16; ++i) { + auto* output = graph_proto->add_output(); + output->set_name("split_output_" + std::to_string(i)); + } + + // Load model + std::shared_ptr model; + ASSERT_STATUS_OK(Model::Load(std::move(model_proto), model, nullptr, *logger_)); + + Graph& graph = model->MainGraph(); + // First resolve should succeed + ASSERT_STATUS_OK(graph.Resolve()); + + // Now trigger the in-memory externalization + // This converts initializers > 127 bytes to OrtValues with external data references + Status convert_status = graph.ConvertInitializersIntoOrtValues(); + ASSERT_TRUE(convert_status.IsOK()) << "ConvertInitializersIntoOrtValues failed: " << convert_status.ErrorMessage(); + + // Check if the initializer was actually externalized + const ONNX_NAMESPACE::TensorProto* initializer_after = nullptr; + ASSERT_TRUE(graph.GetInitializedTensor("split_sizes", initializer_after)); + ASSERT_NE(initializer_after, nullptr); + // Debug: verify it was externalized + ASSERT_TRUE(utils::HasExternalDataInMemory(*initializer_after)) + << "Initializer was not externalized to in-memory external data"; + + // Mark the graph as needing resolve to force shape inference to run again + graph.SetGraphResolveNeeded(); + + // Resolve again - this should trigger shape inference with 
the externalized initializer + // Without the fix, this will fail with "Cannot parse data from external tensors" + // With the fix, getInputData() materializes the external data for shape inference + Status second_resolve = graph.Resolve(); + ASSERT_TRUE(second_resolve.IsOK()) << "Second resolve failed: " << second_resolve.ErrorMessage(); +} + } // namespace test } // namespace onnxruntime From 654137fab69f80265e5195deba209a635b4edb97 Mon Sep 17 00:00:00 2001 From: Yateng Hong Date: Wed, 15 Oct 2025 04:41:01 +0800 Subject: [PATCH 17/19] [TensorRT] Fix DDS output bug during engine update (#26272) ### Description Fix a bug in the TRT Execution Provider where the DDS output tensor was not bound after an engine update. ### Motivation and Context The `dds_output_allocator_map` is not cleared on engine update, so that it will mis-recognized as a known DDS and will not bind the output allocation. Script to reproduce the issue: ```:python # create an onnx model with: # inputs: data -> NonZeros(data) -> GatherND -> output # then run the model with onnxruntime def create_model(): import onnx from onnx import helper, TensorProto input = helper.make_tensor_value_info("data", TensorProto.FLOAT, ["d1", "d2"]) output = helper.make_tensor_value_info("output", TensorProto.FLOAT, ["nzr"]) nonzeros_node = helper.make_node("NonZero", ["data"], ["nonzeros"], "nonzeros_node") transpose_node = helper.make_node( "Transpose", ["nonzeros"], ["nonzeros_t"], "transpose_node" ) gathernd_node = helper.make_node( "GatherND", ["data", "nonzeros_t"], ["output"], "gathernd_node" ) value_info = [ helper.make_tensor_value_info("nonzeros", TensorProto.INT64, [2, "nzr"]), helper.make_tensor_value_info("nonzeros_t", TensorProto.INT64, ["nzr", 2]), ] graph = helper.make_graph( [nonzeros_node, transpose_node, gathernd_node], "test_graph", [input], [output], value_info=value_info, ) model = helper.make_model(graph) onnx.save(model, "model_dds.onnx") def run_model(): import onnxruntime as ort import numpy as np sess = ort.InferenceSession("model_dds.onnx", providers=["TensorrtExecutionProvider", "CUDAExecutionProvider", "CPUExecutionProvider"]) print("Running with data shape (3,4)") data = np.random.randn(3, 4).astype(np.float32) sess.run(None, {"data": data}) print("Running with data shape (5,6)") data = np.random.randn(5, 6).astype(np.float32) sess.run(None, {"data": data}) create_model() run_model() ``` Before the change: > IExecutionContext::enqueueV3: Error Code 3: API Usage Error (Parameter check failed, condition: mContext.profileObliviousBindings.at(profileObliviousIndex) || getPtrOrNull(mOutputAllocators, profileObliviousIndex). Neither address or allocator is set for output tensor scores. Call setOutputTensorAddress, setTensorAddress or setOutputAllocator before enqueue/execute.) ... Status Message: TensorRT EP execution context enqueue failed. 
--- .../tensorrt/tensorrt_execution_provider.cc | 4 ++ .../providers/tensorrt/tensorrt_basic_test.cc | 46 +++++++++++++++++++ .../test/testdata/ort_github_issue_26272.py | 26 +++++++++++ .../testdata/ort_github_issue_26272_dds.onnx | 28 +++++++++++ 4 files changed, 104 insertions(+) create mode 100644 onnxruntime/test/testdata/ort_github_issue_26272.py create mode 100644 onnxruntime/test/testdata/ort_github_issue_26272_dds.onnx diff --git a/onnxruntime/core/providers/tensorrt/tensorrt_execution_provider.cc b/onnxruntime/core/providers/tensorrt/tensorrt_execution_provider.cc index 508d932459bf9..cd0c0e4bffdb5 100644 --- a/onnxruntime/core/providers/tensorrt/tensorrt_execution_provider.cc +++ b/onnxruntime/core/providers/tensorrt/tensorrt_execution_provider.cc @@ -3976,6 +3976,10 @@ Status TensorrtExecutionProvider::CreateNodeComputeInfoFromGraph(const GraphView // Destroy the IExecutionContext objects before destroying an engine object, otherwise it will lead to undefined behavior. trt_state->context->reset(); trt_state->engine->reset(); + + // Clear dds output allocator map since the engine and context will be recreated. + dds_output_allocator_map.clear(); + auto trt_config = std::unique_ptr(trt_builder->createBuilderConfig()); if (max_workspace_size_ > 0) { trt_config->setMemoryPoolLimit(nvinfer1::MemoryPoolType::kWORKSPACE, max_workspace_size_); diff --git a/onnxruntime/test/providers/tensorrt/tensorrt_basic_test.cc b/onnxruntime/test/providers/tensorrt/tensorrt_basic_test.cc index 327dfab96c2d1..a746493d779f8 100644 --- a/onnxruntime/test/providers/tensorrt/tensorrt_basic_test.cc +++ b/onnxruntime/test/providers/tensorrt/tensorrt_basic_test.cc @@ -713,6 +713,52 @@ TEST(TensorrtExecutionProviderTest, TRTPluginsCustomOpTest) { ASSERT_TRUE(status.IsOK()); } +TEST(TensorrtExecutionProviderTest, DDSOutputTest) { + PathString model_name = ORT_TSTR("testdata/ort_github_issue_26272_dds.onnx"); + SessionOptions so; + so.session_logid = "TensorrtExecutionProviderRunWithDDSOutput"; + RunOptions run_options; + run_options.run_tag = so.session_logid; + InferenceSession session_object{so, GetEnvironment()}; + auto cuda_provider = DefaultCudaExecutionProvider(); + auto cuda_allocator = cuda_provider->CreatePreferredAllocators()[1]; + std::vector dims_op_x = {3, 4}; + std::vector values_op_x(12, 0.f); // 12=3*4 + OrtValue ml_value_x; + CreateMLValue(cuda_allocator, dims_op_x, values_op_x, &ml_value_x); + + NameMLValMap feeds; + feeds.insert(std::make_pair("data", ml_value_x)); + + // prepare outputs + std::vector output_names; + output_names.push_back("output"); + std::vector fetches; + + OrtTensorRTProviderOptionsV2 params; + std::unique_ptr execution_provider = TensorrtExecutionProviderWithOptions(¶ms); + EXPECT_TRUE(session_object.RegisterExecutionProvider(std::move(execution_provider)).IsOK()); + auto status = session_object.Load(model_name); + ASSERT_TRUE(status.IsOK()); + status = session_object.Initialize(); + ASSERT_TRUE(status.IsOK()); + + // First pass run + status = session_object.Run(run_options, feeds, output_names, &fetches); + ASSERT_TRUE(status.IsOK()); + + // Second pass run with new shape + dims_op_x = {6, 4}; + values_op_x.resize(24, 0.f); // 24=6*4 + CreateMLValue(cuda_allocator, dims_op_x, values_op_x, &ml_value_x); + feeds.clear(); + + feeds.insert(std::make_pair("data", ml_value_x)); + + status = session_object.Run(run_options, feeds, output_names, &fetches); + ASSERT_TRUE(status.IsOK()); +} + TEST_P(TensorrtExecutionProviderCacheTest, Run) { // GetParam() returns the parameter 
of following format: // ##cache type##_##input shape type## diff --git a/onnxruntime/test/testdata/ort_github_issue_26272.py b/onnxruntime/test/testdata/ort_github_issue_26272.py new file mode 100644 index 0000000000000..fa381e5df1094 --- /dev/null +++ b/onnxruntime/test/testdata/ort_github_issue_26272.py @@ -0,0 +1,26 @@ +import onnx +from onnx import TensorProto, helper + +# Create a simple ONNX model with DDS output +input = helper.make_tensor_value_info("data", TensorProto.FLOAT, ["d1", "d2"]) +output = helper.make_tensor_value_info("output", TensorProto.FLOAT, ["nzr"]) + +nonzeros_node = helper.make_node("NonZero", ["data"], ["nonzeros"], "nonzeros_node") +transpose_node = helper.make_node("Transpose", ["nonzeros"], ["nonzeros_t"], "transpose_node") +gathernd_node = helper.make_node("GatherND", ["data", "nonzeros_t"], ["output"], "gathernd_node") + +value_info = [ + helper.make_tensor_value_info("nonzeros", TensorProto.INT64, [2, "nzr"]), + helper.make_tensor_value_info("nonzeros_t", TensorProto.INT64, ["nzr", 2]), +] + +graph = helper.make_graph( + [nonzeros_node, transpose_node, gathernd_node], + "test_graph", + [input], + [output], + value_info=value_info, +) + +model = helper.make_model(graph) +onnx.save(model, "ort_github_issue_26272_dds.onnx") diff --git a/onnxruntime/test/testdata/ort_github_issue_26272_dds.onnx b/onnxruntime/test/testdata/ort_github_issue_26272_dds.onnx new file mode 100644 index 0000000000000..371f99c537898 --- /dev/null +++ b/onnxruntime/test/testdata/ort_github_issue_26272_dds.onnx @@ -0,0 +1,28 @@ + +:“ +( +datanonzeros nonzeros_node"NonZero +1 +nonzeros +nonzeros_ttranspose_node" Transpose +3 +data + +nonzeros_toutput gathernd_node"GatherND +test_graphZ +data + +d1 +d2b +output +  +nzrj +nonzeros + + +nzrj + +nonzeros_t + +nzr +B \ No newline at end of file From f0015b956723ec98efa8292a0628f99a5b6e8b3d Mon Sep 17 00:00:00 2001 From: Jiajia Qin Date: Wed, 15 Oct 2025 16:52:09 +0800 Subject: [PATCH 18/19] [webgpu] And int64 to cast (#25610) This pull request extends the WebGPU execution provider to support int64 data type casting in the `Cast` operator, with conditional support based on whether graph capture is enabled. It refactors kernel registration to allow toggling int64 support and updates the shader code and kernel logic to handle int64 tensors efficiently. 
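WGSL has no native 64-bit integer type, so the WebGPU EP stores each int64 element as a pair of 32-bit words: the low word carries the value bits and the high word is the sign extension, which is what the `Int64` case of `SetByOffsetImpl` in `shader_variable.cc` emits. A minimal sketch of that encoding, in Python for illustration only (the helper names are not part of the ORT codebase):

```python
# Illustrative only: mirrors the low-word / sign-extended-high-word layout the
# WebGPU EP uses for int64 values. Not ORT code.
def pack_int64(v: int) -> tuple[int, int]:
    bits = v & 0xFFFFFFFFFFFFFFFF           # two's-complement view of the 64-bit value
    return bits & 0xFFFFFFFF, bits >> 32    # (low word, high word)

def cast_int64_to_int32(low: int, high: int) -> int:
    # Truncating conversion back to a signed 32-bit value (standard two's-complement narrowing).
    return low - 0x100000000 if low & 0x80000000 else low

assert pack_int64(-1) == (0xFFFFFFFF, 0xFFFFFFFF)  # negative values sign-extend the high word
assert cast_int64_to_int32(*pack_int64(-1)) == -1
```
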
It's part of the work to enable graph capture in phi4 https://github.com/microsoft/onnxruntime/pull/25868 --- .../core/providers/webgpu/shader_variable.cc | 2 +- .../core/providers/webgpu/tensor/cast.cc | 154 ++++++++++-------- .../core/providers/webgpu/tensor/cast.h | 9 +- .../webgpu/webgpu_execution_provider.cc | 28 ++-- 4 files changed, 114 insertions(+), 79 deletions(-) diff --git a/onnxruntime/core/providers/webgpu/shader_variable.cc b/onnxruntime/core/providers/webgpu/shader_variable.cc index 5998c22a0d2ca..aa1f6c9a0ec0b 100644 --- a/onnxruntime/core/providers/webgpu/shader_variable.cc +++ b/onnxruntime/core/providers/webgpu/shader_variable.cc @@ -378,7 +378,7 @@ std::string ShaderVariableHelper::SetByOffsetImpl(std::string_view offset, std:: ORT_THROW("Invalid type"); break; case onnxruntime::webgpu::ProgramVariableDataType::Int64: - ss << name_ << "[" << offset << "]=vec2(u32(" << value << "), select(0u, 0xFFFFFFFFu, " << value << " < 0));"; + ss << name_ << "[" << offset << "]=vec2(u32(" << value << "), select(0u, 0xFFFFFFFFu, i32(" << value << ") < 0));"; break; case onnxruntime::webgpu::ProgramVariableDataType::Uint64: ss << name_ << "[" << offset << "]=vec2(u32(" << value << "), 0u);"; diff --git a/onnxruntime/core/providers/webgpu/tensor/cast.cc b/onnxruntime/core/providers/webgpu/tensor/cast.cc index 313a96ba25509..daf4aa323c12e 100644 --- a/onnxruntime/core/providers/webgpu/tensor/cast.cc +++ b/onnxruntime/core/providers/webgpu/tensor/cast.cc @@ -11,75 +11,29 @@ namespace onnxruntime { namespace webgpu { namespace { -const std::vector& CastOpTypeConstraints() { - // currently support boolean, integer and float types that explicitly allowed in WGSL: +const std::vector& CastOpTypeConstraints(bool enable_graph_capture) { + // Base types that are always supported - boolean, integer and float types that explicitly allowed in WGSL: // https://gpuweb.github.io/gpuweb/wgsl/#plain-types-section - // - static std::vector types{ + static std::vector base_types{ DataTypeImpl::GetTensorType(), DataTypeImpl::GetTensorType(), DataTypeImpl::GetTensorType(), DataTypeImpl::GetTensorType(), DataTypeImpl::GetTensorType()}; - return types; + + if (enable_graph_capture) { + static std::vector types_with_int64 = []() { + auto types = base_types; + types.push_back(DataTypeImpl::GetTensorType()); + return types; + }(); + return types_with_int64; + } else { + return base_types; + } } } // namespace -ONNX_OPERATOR_VERSIONED_KERNEL_EX( - Cast, - kOnnxDomain, - 6, 8, - kWebGpuExecutionProvider, - (*KernelDefBuilder::Create()) - .TypeConstraint("T1", CastOpTypeConstraints()) - .TypeConstraint("T2", CastOpTypeConstraints()), - Cast); -ONNX_OPERATOR_VERSIONED_KERNEL_EX( - Cast, - kOnnxDomain, - 9, 12, - kWebGpuExecutionProvider, - (*KernelDefBuilder::Create()) - .TypeConstraint("T1", CastOpTypeConstraints()) - .TypeConstraint("T2", CastOpTypeConstraints()), - Cast); -ONNX_OPERATOR_VERSIONED_KERNEL_EX( - Cast, - kOnnxDomain, - 13, 18, - kWebGpuExecutionProvider, - (*KernelDefBuilder::Create()) - .TypeConstraint("T1", CastOpTypeConstraints()) - .TypeConstraint("T2", CastOpTypeConstraints()), - Cast); -ONNX_OPERATOR_VERSIONED_KERNEL_EX( - Cast, - kOnnxDomain, - 19, 20, - kWebGpuExecutionProvider, - (*KernelDefBuilder::Create()) - .TypeConstraint("T1", CastOpTypeConstraints()) - .TypeConstraint("T2", CastOpTypeConstraints()), - Cast); -ONNX_OPERATOR_VERSIONED_KERNEL_EX( - Cast, - kOnnxDomain, - 21, 22, - kWebGpuExecutionProvider, - (*KernelDefBuilder::Create()) - .TypeConstraint("T1", CastOpTypeConstraints()) 
- .TypeConstraint("T2", CastOpTypeConstraints()), - Cast); -ONNX_OPERATOR_KERNEL_EX( - Cast, - kOnnxDomain, - 23, - kWebGpuExecutionProvider, - (*KernelDefBuilder::Create()) - .TypeConstraint("T1", CastOpTypeConstraints()) - .TypeConstraint("T2", CastOpTypeConstraints()), - Cast); - Status Cast::ComputeInternal(ComputeContext& context) const { const auto* input_tensor = context.Input(0); auto* output_tensor = context.Output(0, input_tensor->Shape()); @@ -87,12 +41,17 @@ Status Cast::ComputeInternal(ComputeContext& context) const { if (size == 0) { return Status::OK(); } + bool is_from_int64 = input_tensor->DataType() == DataTypeImpl::GetType(); + const int in_components = is_from_int64 ? 1 : 4; + const int out_components = to_ == ONNX_NAMESPACE::TensorProto_DataType_INT64 ? 1 : 4; uint32_t vec_size = onnxruntime::narrow((size + 3) / 4); + uint32_t in_vec_size = onnxruntime::narrow(in_components == 1 ? size : vec_size); + uint32_t out_vec_size = onnxruntime::narrow(out_components == 1 ? size : vec_size); - CastProgram program{to_}; + CastProgram program{to_, is_from_int64}; program - .AddInput({input_tensor, ProgramTensorMetadataDependency::Type, {vec_size}, 4}) - .AddOutput({output_tensor, ProgramTensorMetadataDependency::None, {vec_size}, 4}) + .AddInput({input_tensor, ProgramTensorMetadataDependency::Type, {in_vec_size}, in_components}) + .AddOutput({output_tensor, ProgramTensorMetadataDependency::None, {out_vec_size}, out_components}) .SetDispatchGroupSize((vec_size + WORKGROUP_SIZE - 1) / WORKGROUP_SIZE) .AddUniformVariables({ {static_cast(vec_size)}, @@ -121,15 +80,78 @@ Status CastProgram::GenerateShaderCode(ShaderHelper& sh) const { case ONNX_NAMESPACE::TensorProto_DataType_BOOL: expression = "vec4(a)"; break; + case ONNX_NAMESPACE::TensorProto_DataType_INT64: + expression = "int32(a)"; + break; default: ORT_NOT_IMPLEMENTED("Cast to type ", to_, " is not supported."); } - sh.MainFunctionBody() << sh.GuardAgainstOutOfBoundsWorkgroupSizes("uniforms.vec_size") - << " let a = " << input.GetByOffset("global_idx") << ";\n " - << output.SetByOffset("global_idx", expression); + + sh.MainFunctionBody() << sh.GuardAgainstOutOfBoundsWorkgroupSizes("uniforms.vec_size"); + if (is_from_int64_) { + sh.MainFunctionBody() << " let a0 = " << input.GetByOffset("global_idx * 4") << ";\n" + << " let a1 = " << input.GetByOffset("global_idx * 4 + 1") << ";\n" + << " let a2 = " << input.GetByOffset("global_idx * 4 + 2") << ";\n" + << " let a3 = " << input.GetByOffset("global_idx * 4 + 3") << ";\n" + << " let a = vec4(a0, a1, a2, a3);\n"; + } else { + sh.MainFunctionBody() << " let a = " << input.GetByOffset("global_idx") << ";\n"; + } + if (to_ == ONNX_NAMESPACE::TensorProto_DataType_INT64) { + sh.MainFunctionBody() << output.SetByOffset("global_idx * 4", "a.x") << "\n" + << output.SetByOffset("global_idx * 4 + 1", "a.y") << "\n" + << output.SetByOffset("global_idx * 4 + 2", "a.z") << "\n" + << output.SetByOffset("global_idx * 4 + 3", "a.w") << "\n"; + } else { + sh.MainFunctionBody() << output.SetByOffset("global_idx", expression); + } return Status::OK(); } +template +KernelCreateInfo CreateCastKernelInfo(bool enable_graph_capture) { + const auto& type_constraints = CastOpTypeConstraints(enable_graph_capture); + + KernelCreateFn kernel_create_fn = [](FuncManager&, const OpKernelInfo& info, std::unique_ptr& out) -> Status { + out = std::make_unique(info); + return Status::OK(); + }; + + if constexpr (StartVersion == EndVersion) { + // Non-versioned kernel + return { + KernelDefBuilder() + .SetName("Cast") 
+ .SetDomain(kOnnxDomain) + .SinceVersion(StartVersion) + .Provider(kWebGpuExecutionProvider) + .TypeConstraint("T1", type_constraints) + .TypeConstraint("T2", type_constraints) + .Build(), + kernel_create_fn}; + } else { + // Versioned kernel + return { + KernelDefBuilder() + .SetName("Cast") + .SetDomain(kOnnxDomain) + .SinceVersion(StartVersion, EndVersion) + .Provider(kWebGpuExecutionProvider) + .TypeConstraint("T1", type_constraints) + .TypeConstraint("T2", type_constraints) + .Build(), + kernel_create_fn}; + } +} + +// Explicit template instantiations +template KernelCreateInfo CreateCastKernelInfo<6, 8>(bool); +template KernelCreateInfo CreateCastKernelInfo<9, 12>(bool); +template KernelCreateInfo CreateCastKernelInfo<13, 18>(bool); +template KernelCreateInfo CreateCastKernelInfo<19, 20>(bool); +template KernelCreateInfo CreateCastKernelInfo<21, 22>(bool); +template KernelCreateInfo CreateCastKernelInfo<23>(bool); + } // namespace webgpu } // namespace onnxruntime diff --git a/onnxruntime/core/providers/webgpu/tensor/cast.h b/onnxruntime/core/providers/webgpu/tensor/cast.h index 925cd200f0aba..7dfb50e3241c8 100644 --- a/onnxruntime/core/providers/webgpu/tensor/cast.h +++ b/onnxruntime/core/providers/webgpu/tensor/cast.h @@ -3,6 +3,8 @@ #pragma once +#include "core/framework/kernel_registry.h" +#include "core/framework/op_kernel.h" #include "core/providers/webgpu/webgpu_kernel.h" namespace onnxruntime { @@ -10,7 +12,7 @@ namespace webgpu { class CastProgram final : public Program { public: - CastProgram(int32_t to) : Program{"Cast"}, to_{to} {} + CastProgram(int32_t to, bool is_from_int64) : Program{"Cast"}, to_{to}, is_from_int64_{is_from_int64} {} Status GenerateShaderCode(ShaderHelper& sh) const override; @@ -18,6 +20,7 @@ class CastProgram final : public Program { private: int32_t to_; + bool is_from_int64_; }; class Cast final : public WebGpuKernel { @@ -37,5 +40,9 @@ class Cast final : public WebGpuKernel { int32_t to_; }; +// Create Cast kernel info with appropriate type constraints based on graph capture support +template +KernelCreateInfo CreateCastKernelInfo(bool enable_graph_capture); + } // namespace webgpu } // namespace onnxruntime diff --git a/onnxruntime/core/providers/webgpu/webgpu_execution_provider.cc b/onnxruntime/core/providers/webgpu/webgpu_execution_provider.cc index bbb3fbdd221d3..0f7607ac1dbfe 100644 --- a/onnxruntime/core/providers/webgpu/webgpu_execution_provider.cc +++ b/onnxruntime/core/providers/webgpu/webgpu_execution_provider.cc @@ -28,6 +28,7 @@ #include "core/providers/webgpu/data_transfer.h" #include "core/providers/webgpu/external_data_loader.h" #include "core/providers/webgpu/webgpu_profiler.h" +#include "core/providers/webgpu/tensor/cast.h" namespace onnxruntime { @@ -417,7 +418,7 @@ class ONNX_OPERATOR_VERSIONED_KERNEL_CLASS_NAME(kWebGpuExecutionProvider, kOnnxD class ONNX_OPERATOR_VERSIONED_KERNEL_CLASS_NAME(kWebGpuExecutionProvider, kOnnxDomain, 16, 17, ScatterND); class ONNX_OPERATOR_KERNEL_CLASS_NAME(kWebGpuExecutionProvider, kOnnxDomain, 18, ScatterND); -std::unique_ptr RegisterKernels() { +std::unique_ptr RegisterKernels(bool enable_graph_capture = false) { auto kernel_registry = std::make_unique(); static const BuildKernelCreateInfoFn function_table[] = { @@ -464,13 +465,6 @@ std::unique_ptr RegisterKernels() { KERNEL_CREATE_INFO(13, Tanh), KERNEL_CREATE_INFO(1, Not), - KERNEL_CREATE_INFO_VERSIONED(6, 8, Cast), - KERNEL_CREATE_INFO_VERSIONED(9, 12, Cast), - KERNEL_CREATE_INFO_VERSIONED(13, 18, Cast), - KERNEL_CREATE_INFO_VERSIONED(19, 20, 
Cast), - KERNEL_CREATE_INFO_VERSIONED(21, 22, Cast), - KERNEL_CREATE_INFO(23, Cast), - // // activations BuildKernelCreateInfo, BuildKernelCreateInfo, @@ -771,6 +765,14 @@ std::unique_ptr RegisterKernels() { } } + // Register Cast kernels with conditional int64 support based on graph capture + ORT_THROW_IF_ERROR(kernel_registry->Register(CreateCastKernelInfo<6, 8>(enable_graph_capture))); + ORT_THROW_IF_ERROR(kernel_registry->Register(CreateCastKernelInfo<9, 12>(enable_graph_capture))); + ORT_THROW_IF_ERROR(kernel_registry->Register(CreateCastKernelInfo<13, 18>(enable_graph_capture))); + ORT_THROW_IF_ERROR(kernel_registry->Register(CreateCastKernelInfo<19, 20>(enable_graph_capture))); + ORT_THROW_IF_ERROR(kernel_registry->Register(CreateCastKernelInfo<21, 22>(enable_graph_capture))); + ORT_THROW_IF_ERROR(kernel_registry->Register(CreateCastKernelInfo<23>(enable_graph_capture))); + #ifndef DISABLE_CONTRIB_OPS Status status = ::onnxruntime::contrib::webgpu::RegisterWebGpuContribKernels(*kernel_registry); ORT_ENFORCE(status.IsOK(), "Failed to register WebGPU contrib kernels: " + status.ErrorMessage()); @@ -869,9 +871,13 @@ std::vector> WebGpuExecutionProvider::GetCapa } std::shared_ptr WebGpuExecutionProvider::GetKernelRegistry() const { - static std::shared_ptr registry = webgpu::RegisterKernels(); - - return registry; + if (enable_graph_capture_) { + static std::shared_ptr registry = webgpu::RegisterKernels(true); + return registry; + } else { + static std::shared_ptr registry = webgpu::RegisterKernels(false); + return registry; + } } std::unique_ptr WebGpuExecutionProvider::GetDataTransfer() const { From 036fde03d0fff715091a430d3fa44e4383defea3 Mon Sep 17 00:00:00 2001 From: Tianlei Wu Date: Wed, 15 Oct 2025 14:40:48 -0700 Subject: [PATCH 19/19] [Build] Lock torch, onnxscript and onnx-ir versions to latest (#26315) To fix build pipeline error `ModuleNotFoundError: No module named 'onnxscript._framework_apis.torch_2_9'` after recent torch 2.9 release. This locks torch version to 2.8, and also updates onnxscript and onnx-ir to latest versions. I locked torchvision version since it is usually installed with torch together. If torch and torchvision are not compatible, there might be errors in transformers script. --- tools/ci_build/build.py | 2 +- .../ci_build/github/linux/docker/scripts/requirements.txt | 4 ++-- tools/ci_build/github/linux/python/requirements.txt | 4 ++-- tools/ci_build/github/windows/python/requirements.txt | 4 ++-- .../requirements/transformers-test/requirements.txt | 7 ++++--- 5 files changed, 11 insertions(+), 10 deletions(-) diff --git a/tools/ci_build/build.py b/tools/ci_build/build.py index 8a72ab70cc67d..591be538ac873 100644 --- a/tools/ci_build/build.py +++ b/tools/ci_build/build.py @@ -1751,7 +1751,7 @@ def run_onnxruntime_tests(args, source_dir, ctest_path, build_dir, configs): # Install cpu only version of torch when cuda is not enabled in Linux. 
extra = [] if args.use_cuda and is_linux() else ["--index-url", "https://download.pytorch.org/whl/cpu"] run_subprocess( - [sys.executable, "-m", "pip", "install", "torch", *extra], + [sys.executable, "-m", "pip", "install", "torch==2.8.0", "torchvision==0.23.0", *extra], cwd=cwd, dll_path=dll_path, python_path=python_path, diff --git a/tools/ci_build/github/linux/docker/scripts/requirements.txt b/tools/ci_build/github/linux/docker/scripts/requirements.txt index 2fc034d9c5ca2..c19c0170291e6 100644 --- a/tools/ci_build/github/linux/docker/scripts/requirements.txt +++ b/tools/ci_build/github/linux/docker/scripts/requirements.txt @@ -10,5 +10,5 @@ sympy==1.14 flatbuffers protobuf==4.25.1 packaging -onnxscript==0.3.2 -onnx-ir +onnxscript==0.5.3 +onnx-ir==0.1.10 diff --git a/tools/ci_build/github/linux/python/requirements.txt b/tools/ci_build/github/linux/python/requirements.txt index 293aa49823d48..3ddce9cc0ec31 100644 --- a/tools/ci_build/github/linux/python/requirements.txt +++ b/tools/ci_build/github/linux/python/requirements.txt @@ -8,7 +8,7 @@ protobuf==4.25.1 sympy==1.14 flatbuffers psutil -onnxscript==0.3.2 -onnx-ir +onnxscript==0.5.3 +onnx-ir==0.1.10 jinja2 markupsafe diff --git a/tools/ci_build/github/windows/python/requirements.txt b/tools/ci_build/github/windows/python/requirements.txt index b48f6c3c2784d..bb307a20d7f18 100644 --- a/tools/ci_build/github/windows/python/requirements.txt +++ b/tools/ci_build/github/windows/python/requirements.txt @@ -8,8 +8,8 @@ protobuf==4.25.1 sympy==1.14 flatbuffers psutil -onnxscript==0.3.2 -onnx-ir +onnxscript==0.5.3 +onnx-ir==0.1.10 jinja2 markupsafe semver diff --git a/tools/ci_build/requirements/transformers-test/requirements.txt b/tools/ci_build/requirements/transformers-test/requirements.txt index bcd5a434c58e8..21894c2ba003d 100644 --- a/tools/ci_build/requirements/transformers-test/requirements.txt +++ b/tools/ci_build/requirements/transformers-test/requirements.txt @@ -3,12 +3,13 @@ packaging # protobuf and numpy is same as tools/ci_build/github/linux/docker/scripts/manylinux/requirements.txt protobuf==4.25.1 numpy==2.2.6 -torch>=2.6.0 +torch==2.8.0 +torchvision==0.23.0 coloredlogs==15.0 transformers==4.52.1 parameterized>=0.8.1 sentencepiece psutil einops -onnxscript==0.3.2 -onnx-ir +onnxscript==0.5.3 +onnx-ir==0.1.10
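
As a quick follow-up check, one could confirm that an environment actually resolved to the versions these requirements files now pin; the snippet below is only an assumed convenience for local verification, not part of the CI changes:

```python
# Hypothetical sanity check: compare installed versions against the pins added above
# (torch 2.8.0, torchvision 0.23.0, onnxscript 0.5.3, onnx-ir 0.1.10).
from importlib.metadata import PackageNotFoundError, version

PINS = {"torch": "2.8.0", "torchvision": "0.23.0", "onnxscript": "0.5.3", "onnx-ir": "0.1.10"}

for dist, expected in PINS.items():
    try:
        installed = version(dist)
    except PackageNotFoundError:
        print(f"{dist}: not installed (expected {expected})")
        continue
    status = "OK" if installed == expected else f"mismatch (expected {expected})"
    print(f"{dist}: {installed} {status}")
```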