diff --git a/.github/workflows/publish-csharp-apidocs.yml b/.github/workflows/publish-csharp-apidocs.yml index 42d1bdc295785..683c5594e82f2 100644 --- a/.github/workflows/publish-csharp-apidocs.yml +++ b/.github/workflows/publish-csharp-apidocs.yml @@ -20,7 +20,7 @@ permissions: jobs: build: - runs-on: ["self-hosted", "1ES.Pool=onnxruntime-github-vs2022-mms"] + runs-on: ["self-hosted", "1ES.Pool=onnxruntime-github-vs2022-latest"] env: DOCFXVERSION: 2.62.2 steps: diff --git a/.github/workflows/windows_cuda.yml b/.github/workflows/windows_cuda.yml index 437fc0e2c6334..3d24d4b6b75b6 100644 --- a/.github/workflows/windows_cuda.yml +++ b/.github/workflows/windows_cuda.yml @@ -19,7 +19,7 @@ concurrency: jobs: build: name: Windows GPU CUDA CI Pipeline - runs-on: ["self-hosted", "1ES.Pool=onnxruntime-github-vs2022-mms"] + runs-on: ["self-hosted", "1ES.Pool=onnxruntime-github-vs2022-latest"] steps: - uses: actions/checkout@v5 with: @@ -41,10 +41,10 @@ jobs: working-directory: ${{ github.workspace }} shell: cmd - - name: Download CUDA SDK v12.2 + - name: Download CUDA SDK v12.8 working-directory: ${{ runner.temp }} run: | - azcopy.exe cp --recursive "https://lotusscus.blob.core.windows.net/models/cuda_sdk/v12.2" . + azcopy.exe cp --recursive "https://lotusscus.blob.core.windows.net/models/cuda_sdk/v12.8" . dir shell: pwsh @@ -52,9 +52,9 @@ jobs: shell: powershell run: | Write-Host "Adding CUDA to PATH" - Write-Host "CUDA Path: $env:RUNNER_TEMP\v12.2\bin" - Add-Content -Path $env:GITHUB_PATH -Value "$env:RUNNER_TEMP\v12.2\bin" - Add-Content -Path $env:GITHUB_PATH -Value "$env:RUNNER_TEMP\v12.2\extras\CUPTI\lib64" + Write-Host "CUDA Path: $env:RUNNER_TEMP\v12.8\bin" + Add-Content -Path $env:GITHUB_PATH -Value "$env:RUNNER_TEMP\v12.8\bin" + Add-Content -Path $env:GITHUB_PATH -Value "$env:RUNNER_TEMP\v12.8\extras\CUPTI\lib64" - uses: actions/setup-node@v5 with: @@ -111,7 +111,7 @@ jobs: exit $lastExitCode } # Execute the build process - python.exe ${{ github.workspace }}\tools\ci_build\build.py --update --build --config RelWithDebInfo --build_dir build --skip_submodule_sync --build_csharp --parallel --use_binskim_compliant_compile_flags --cmake_generator "Visual Studio 17 2022" --build_shared_lib --build_wheel --build_java --use_cuda --cuda_home="$env:RUNNER_TEMP\v12.2" --enable_cuda_profiling --use_vcpkg --use_vcpkg_ms_internal_asset_cache --enable_transformers_tool_test --cmake_extra_defines CMAKE_CUDA_ARCHITECTURES=86 --cmake_extra_defines onnxruntime_ENABLE_CUDA_EP_INTERNAL_TESTS=ON + python.exe ${{ github.workspace }}\tools\ci_build\build.py --update --build --config RelWithDebInfo --build_dir build --skip_submodule_sync --build_csharp --parallel --use_binskim_compliant_compile_flags --cmake_generator "Visual Studio 17 2022" --build_shared_lib --build_wheel --build_java --use_cuda --cuda_home="$env:RUNNER_TEMP\v12.8" --enable_cuda_profiling --use_vcpkg --use_vcpkg_ms_internal_asset_cache --enable_transformers_tool_test --cmake_extra_defines CMAKE_CUDA_ARCHITECTURES=86 --cmake_extra_defines onnxruntime_ENABLE_CUDA_EP_INTERNAL_TESTS=ON if ($lastExitCode -ne 0) { exit $lastExitCode } @@ -188,10 +188,10 @@ jobs: working-directory: ${{ github.workspace }} shell: cmd - - name: Download CUDA SDK v12.2 + - name: Download CUDA SDK v12.8 working-directory: ${{ runner.temp }} run: | - azcopy.exe cp --recursive "https://lotusscus.blob.core.windows.net/models/cuda_sdk/v12.2" . + azcopy.exe cp --recursive "https://lotusscus.blob.core.windows.net/models/cuda_sdk/v12.8" . 
dir shell: pwsh @@ -199,9 +199,9 @@ jobs: shell: powershell run: | Write-Host "Adding CUDA to PATH" - Write-Host "CUDA Path: $env:RUNNER_TEMP\v12.2\bin" - Add-Content -Path $env:GITHUB_PATH -Value "$env:RUNNER_TEMP\v12.2\bin" - Add-Content -Path $env:GITHUB_PATH -Value "$env:RUNNER_TEMP\v12.2\extras\CUPTI\lib64" + Write-Host "CUDA Path: $env:RUNNER_TEMP\v12.8\bin" + Add-Content -Path $env:GITHUB_PATH -Value "$env:RUNNER_TEMP\v12.8\bin" + Add-Content -Path $env:GITHUB_PATH -Value "$env:RUNNER_TEMP\v12.8\extras\CUPTI\lib64" - name: Set OnnxRuntimeBuildDirectory shell: pwsh @@ -227,7 +227,7 @@ jobs: exit $lastExitCode } - python.exe ${{ github.workspace }}\tools\ci_build\build.py --test --config RelWithDebInfo --build_dir build --skip_submodule_sync --build_csharp --parallel --use_binskim_compliant_compile_flags --cmake_generator "Visual Studio 17 2022" --build_shared_lib --build_wheel --build_java --use_cuda --cuda_home="$env:RUNNER_TEMP\v12.2" --enable_cuda_profiling --use_vcpkg --use_vcpkg_ms_internal_asset_cache --enable_transformers_tool_test --cmake_extra_defines CMAKE_CUDA_ARCHITECTURES=86 --cmake_extra_defines onnxruntime_ENABLE_CUDA_EP_INTERNAL_TESTS=ON + python.exe ${{ github.workspace }}\tools\ci_build\build.py --test --config RelWithDebInfo --build_dir build --skip_submodule_sync --build_csharp --parallel --use_binskim_compliant_compile_flags --cmake_generator "Visual Studio 17 2022" --build_shared_lib --build_wheel --build_java --use_cuda --cuda_home="$env:RUNNER_TEMP\v12.8" --enable_cuda_profiling --use_vcpkg --use_vcpkg_ms_internal_asset_cache --enable_transformers_tool_test --cmake_extra_defines CMAKE_CUDA_ARCHITECTURES=86 --cmake_extra_defines onnxruntime_ENABLE_CUDA_EP_INTERNAL_TESTS=ON if ($lastExitCode -ne 0) { exit $lastExitCode } diff --git a/.github/workflows/windows_openvino.yml b/.github/workflows/windows_openvino.yml index 395ccfbe70244..b608c0879aa45 100644 --- a/.github/workflows/windows_openvino.yml +++ b/.github/workflows/windows_openvino.yml @@ -18,7 +18,7 @@ concurrency: jobs: BUILD_OPENVINO_EP: name: Windows OpenVINO CI Pipeline - runs-on: ["self-hosted", "1ES.Pool=onnxruntime-github-vs2022-mms"] + runs-on: ["self-hosted", "1ES.Pool=onnxruntime-github-vs2022-latest"] timeout-minutes: 240 env: AZCOPY_AUTO_LOGIN_TYPE: MSI diff --git a/.github/workflows/windows_qnn_x64.yml b/.github/workflows/windows_qnn_x64.yml index 9788792b94fa8..1906fcb18c841 100644 --- a/.github/workflows/windows_qnn_x64.yml +++ b/.github/workflows/windows_qnn_x64.yml @@ -18,7 +18,7 @@ concurrency: jobs: build_test_qnn_ep: name: Windows x64 QNN CI Pipeline (${{ matrix.QnnLibKind }}) - runs-on: ["self-hosted", "1ES.Pool=onnxruntime-github-vs2022-mms"] + runs-on: ["self-hosted", "1ES.Pool=onnxruntime-github-vs2022-latest"] timeout-minutes: 120 strategy: matrix: diff --git a/.github/workflows/windows_tensorrt.yml b/.github/workflows/windows_tensorrt.yml index 5f3dcb9607a47..2a1fe97d9b7b7 100644 --- a/.github/workflows/windows_tensorrt.yml +++ b/.github/workflows/windows_tensorrt.yml @@ -19,7 +19,7 @@ concurrency: jobs: build: name: Windows GPU TensorRT CI Pipeline - runs-on: ["self-hosted", "1ES.Pool=onnxruntime-github-vs2022-mms"] + runs-on: ["self-hosted", "1ES.Pool=onnxruntime-github-vs2022-latest"] steps: - uses: actions/checkout@v5 with: @@ -41,10 +41,10 @@ jobs: working-directory: ${{ github.workspace }} shell: cmd - - name: Download CUDA SDK v12.2 + - name: Download CUDA SDK v12.8 working-directory: ${{ runner.temp }} run: | - azcopy.exe cp --recursive 
"https://lotusscus.blob.core.windows.net/models/cuda_sdk/v12.2" . + azcopy.exe cp --recursive "https://lotusscus.blob.core.windows.net/models/cuda_sdk/v12.8" . dir shell: pwsh @@ -56,9 +56,9 @@ jobs: shell: powershell run: | Write-Host "Adding CUDA to PATH" - Write-Host "CUDA Path: $env:RUNNER_TEMP\v12.2\bin" - Add-Content -Path $env:GITHUB_PATH -Value "$env:RUNNER_TEMP\v12.2\bin" - Add-Content -Path $env:GITHUB_PATH -Value "$env:RUNNER_TEMP\v12.2\extras\CUPTI\lib64" + Write-Host "CUDA Path: $env:RUNNER_TEMP\v12.8\bin" + Add-Content -Path $env:GITHUB_PATH -Value "$env:RUNNER_TEMP\v12.8\bin" + Add-Content -Path $env:GITHUB_PATH -Value "$env:RUNNER_TEMP\v12.8\extras\CUPTI\lib64" Add-Content -Path $env:GITHUB_PATH -Value "$env:RUNNER_TEMP\TensorRT-10.9.0.34.Windows10.x86_64.cuda-12.8\lib" - uses: actions/setup-node@v5 @@ -116,7 +116,7 @@ jobs: exit $lastExitCode } # Execute the build process - python ${{ github.workspace }}\tools\ci_build\build.py --config RelWithDebInfo --parallel --use_binskim_compliant_compile_flags --build_dir build --skip_submodule_sync --build_shared_lib --build --update --cmake_generator "Visual Studio 17 2022" --build_wheel --enable_onnx_tests --use_tensorrt --tensorrt_home="${{ runner.temp }}\TensorRT-10.9.0.34.Windows10.x86_64.cuda-12.8" --cuda_home="${{ runner.temp }}\v12.2" --use_vcpkg --use_vcpkg_ms_internal_asset_cache --cmake_extra_defines CMAKE_CUDA_ARCHITECTURES=86 + python ${{ github.workspace }}\tools\ci_build\build.py --config RelWithDebInfo --parallel --use_binskim_compliant_compile_flags --build_dir build --skip_submodule_sync --build_shared_lib --build --update --cmake_generator "Visual Studio 17 2022" --build_wheel --enable_onnx_tests --use_tensorrt --tensorrt_home="${{ runner.temp }}\TensorRT-10.9.0.34.Windows10.x86_64.cuda-12.8" --cuda_home="${{ runner.temp }}\v12.8" --use_vcpkg --use_vcpkg_ms_internal_asset_cache --cmake_extra_defines CMAKE_CUDA_ARCHITECTURES=86 if ($lastExitCode -ne 0) { exit $lastExitCode } @@ -193,10 +193,10 @@ jobs: working-directory: ${{ github.workspace }} shell: cmd - - name: Download CUDA SDK v12.2 + - name: Download CUDA SDK v12.8 working-directory: ${{ runner.temp }} run: | - azcopy.exe cp --recursive "https://lotusscus.blob.core.windows.net/models/cuda_sdk/v12.2" . + azcopy.exe cp --recursive "https://lotusscus.blob.core.windows.net/models/cuda_sdk/v12.8" . 
dir shell: pwsh @@ -208,9 +208,9 @@ jobs: shell: powershell run: | Write-Host "Adding CUDA to PATH" - Write-Host "CUDA Path: $env:RUNNER_TEMP\v12.2\bin" - Add-Content -Path $env:GITHUB_PATH -Value "$env:RUNNER_TEMP\v12.2\bin" - Add-Content -Path $env:GITHUB_PATH -Value "$env:RUNNER_TEMP\v12.2\extras\CUPTI\lib64" + Write-Host "CUDA Path: $env:RUNNER_TEMP\v12.8\bin" + Add-Content -Path $env:GITHUB_PATH -Value "$env:RUNNER_TEMP\v12.8\bin" + Add-Content -Path $env:GITHUB_PATH -Value "$env:RUNNER_TEMP\v12.8\extras\CUPTI\lib64" Add-Content -Path $env:GITHUB_PATH -Value "$env:RUNNER_TEMP\TensorRT-10.9.0.34.Windows10.x86_64.cuda-12.8\lib" - name: Set OnnxRuntimeBuildDirectory @@ -237,7 +237,7 @@ jobs: exit $lastExitCode } - python ${{ github.workspace }}\tools\ci_build\build.py --config RelWithDebInfo --parallel --use_binskim_compliant_compile_flags --build_dir build --skip_submodule_sync --build_shared_lib --test --cmake_generator "Visual Studio 17 2022" --build_wheel --enable_onnx_tests --use_tensorrt --tensorrt_home="${{ runner.temp }}\TensorRT-10.9.0.34.Windows10.x86_64.cuda-12.8" --cuda_home="${{ runner.temp }}\v12.2" --use_vcpkg --use_vcpkg_ms_internal_asset_cache --cmake_extra_defines CMAKE_CUDA_ARCHITECTURES=86 + python ${{ github.workspace }}\tools\ci_build\build.py --config RelWithDebInfo --parallel --use_binskim_compliant_compile_flags --build_dir build --skip_submodule_sync --build_shared_lib --test --cmake_generator "Visual Studio 17 2022" --build_wheel --enable_onnx_tests --use_tensorrt --tensorrt_home="${{ runner.temp }}\TensorRT-10.9.0.34.Windows10.x86_64.cuda-12.8" --cuda_home="${{ runner.temp }}\v12.8" --use_vcpkg --use_vcpkg_ms_internal_asset_cache --cmake_extra_defines CMAKE_CUDA_ARCHITECTURES=86 if ($lastExitCode -ne 0) { exit $lastExitCode } diff --git a/.github/workflows/windows_x64_debug_build_x64_debug.yml b/.github/workflows/windows_x64_debug_build_x64_debug.yml index 6165375e7a54a..6a1b43e54ed89 100644 --- a/.github/workflows/windows_x64_debug_build_x64_debug.yml +++ b/.github/workflows/windows_x64_debug_build_x64_debug.yml @@ -13,7 +13,7 @@ concurrency: jobs: build_x64_debug: - runs-on: ["self-hosted", "1ES.Pool=onnxruntime-github-vs2022-mms"] + runs-on: ["self-hosted", "1ES.Pool=onnxruntime-github-vs2022-latest"] timeout-minutes: 300 steps: diff --git a/.github/workflows/windows_x64_release_build_x64_release.yml b/.github/workflows/windows_x64_release_build_x64_release.yml index f9d7b0d9e9e04..0bcd282e8dc50 100644 --- a/.github/workflows/windows_x64_release_build_x64_release.yml +++ b/.github/workflows/windows_x64_release_build_x64_release.yml @@ -13,7 +13,7 @@ concurrency: jobs: build_x64_release: - runs-on: ["self-hosted", "1ES.Pool=onnxruntime-github-vs2022-mms"] + runs-on: ["self-hosted", "1ES.Pool=onnxruntime-github-vs2022-latest"] timeout-minutes: 300 steps: diff --git a/.github/workflows/windows_x64_release_ep_generic_interface_build_x64_release_ep_generic_interface.yml b/.github/workflows/windows_x64_release_ep_generic_interface_build_x64_release_ep_generic_interface.yml index 54c13e1e04b0a..3934047266f59 100644 --- a/.github/workflows/windows_x64_release_ep_generic_interface_build_x64_release_ep_generic_interface.yml +++ b/.github/workflows/windows_x64_release_ep_generic_interface_build_x64_release_ep_generic_interface.yml @@ -13,7 +13,7 @@ concurrency: jobs: build_x64_release_ep_generic_interface: - runs-on: ["self-hosted", "1ES.Pool=onnxruntime-github-vs2022-mms"] + runs-on: ["self-hosted", "1ES.Pool=onnxruntime-github-vs2022-latest"] timeout-minutes: 300 
steps: diff --git a/.github/workflows/windows_x64_release_vitisai_build_x64_release.yml b/.github/workflows/windows_x64_release_vitisai_build_x64_release.yml index 06230962b39be..1c38d8e58970c 100644 --- a/.github/workflows/windows_x64_release_vitisai_build_x64_release.yml +++ b/.github/workflows/windows_x64_release_vitisai_build_x64_release.yml @@ -13,7 +13,7 @@ concurrency: jobs: build_x64_release_vitisai: - runs-on: ["self-hosted", "1ES.Pool=onnxruntime-github-vs2022-mms"] + runs-on: ["self-hosted", "1ES.Pool=onnxruntime-github-vs2022-latest"] timeout-minutes: 300 steps: diff --git a/.github/workflows/windows_x64_release_xnnpack.yml b/.github/workflows/windows_x64_release_xnnpack.yml index 21033ef4cbe3c..6eb9f00d3997d 100644 --- a/.github/workflows/windows_x64_release_xnnpack.yml +++ b/.github/workflows/windows_x64_release_xnnpack.yml @@ -13,7 +13,7 @@ concurrency: jobs: build_x64_release_xnnpack: - runs-on: ["self-hosted", "1ES.Pool=onnxruntime-github-vs2022-mms"] + runs-on: ["self-hosted", "1ES.Pool=onnxruntime-github-vs2022-latest"] timeout-minutes: 300 steps: diff --git a/.github/workflows/windows_x86.yml b/.github/workflows/windows_x86.yml index fa1e9362e2f34..597c1c7f4b6cf 100644 --- a/.github/workflows/windows_x86.yml +++ b/.github/workflows/windows_x86.yml @@ -13,7 +13,7 @@ concurrency: jobs: build_x86_release: - runs-on: ["self-hosted", "1ES.Pool=onnxruntime-github-vs2022-mms"] + runs-on: ["self-hosted", "1ES.Pool=onnxruntime-github-vs2022-latest"] timeout-minutes: 300 steps: diff --git a/cmake/CMakeLists.txt b/cmake/CMakeLists.txt index 116d369885a27..8186da507a442 100644 --- a/cmake/CMakeLists.txt +++ b/cmake/CMakeLists.txt @@ -287,9 +287,13 @@ if (onnxruntime_ENABLE_TRAINING_APIS) endif() -# Single output director for all binaries +# Single output directory for all binaries set(RUNTIME_OUTPUT_DIRECTORY ${CMAKE_BINARY_DIR}/bin CACHE PATH "Single output directory for all binaries.") +# Local mirror directory of cmake dependencies +set(REPO_ROOT ${PROJECT_SOURCE_DIR}/..) +set(onnxruntime_CMAKE_DEPS_MIRROR_DIR ${REPO_ROOT}/mirror CACHE PATH "Path to the local mirror of cmake dependencies") + include(FetchContent) @@ -425,7 +429,6 @@ if (onnxruntime_EXTENDED_MINIMAL_BUILD AND NOT onnxruntime_MINIMAL_BUILD) set(onnxruntime_MINIMAL_BUILD ON) endif() -set(REPO_ROOT ${PROJECT_SOURCE_DIR}/..) 
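The onnxruntime_CMAKE_DEPS_MIRROR_DIR cache variable added above makes the dependency-mirror location configurable instead of hard-coding ${REPO_ROOT}/mirror; the hunk in onnxruntime_external_deps.cmake further below consumes it to rewrite each dependency URL. A standalone C++ sketch of that lookup, for illustration only (the build itself does this in CMake via string(REGEX REPLACE ...) and an EXISTS check):

```cpp
#include <filesystem>
#include <string>

// Sketch of the mirror lookup: rewrite the leading "https://" of a dependency
// URL to the configurable mirror directory and use the local copy only if it
// actually exists; otherwise fall back to the remote URL.
std::string ResolveDependencyUrl(const std::string& url,
                                 const std::filesystem::path& mirror_dir) {
  const std::string prefix = "https://";
  if (url.compare(0, prefix.size(), prefix) == 0) {
    const std::filesystem::path local = mirror_dir / url.substr(prefix.size());
    if (std::filesystem::exists(local)) {
      return local.string();  // e.g. <mirror>/github.com/onnx/onnx/archive/refs/tags/v1.19.1.zip
    }
  }
  return url;  // fall back to fetching from the original URL
}
```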
set(ONNXRUNTIME_ROOT ${PROJECT_SOURCE_DIR}/../onnxruntime) set(ORTTRAINING_ROOT ${PROJECT_SOURCE_DIR}/../orttraining) set(ORTTRAINING_SOURCE_DIR ${ORTTRAINING_ROOT}/orttraining) diff --git a/cmake/deps.txt b/cmake/deps.txt index 7b243ff15cd80..bf76753c1b3c0 100644 --- a/cmake/deps.txt +++ b/cmake/deps.txt @@ -34,7 +34,7 @@ microsoft_gsl;https://github.com/microsoft/GSL/archive/refs/tags/v4.0.0.zip;cf36 microsoft_wil;https://github.com/microsoft/wil/archive/refs/tags/v1.0.230629.1.zip;e4a542a323c070376f7c2d1973d0f7ddbc1d2fa5 mimalloc;https://github.com/microsoft/mimalloc/archive/refs/tags/v2.1.1.zip;d5ee7d34223d0567892db5179849939c8769dc41 mp11;https://github.com/boostorg/mp11/archive/refs/tags/boost-1.82.0.zip;9bc9e01dffb64d9e0773b2e44d2f22c51aace063 -onnx;https://github.com/onnx/onnx/archive/refs/tags/v1.19.0.zip;4c798b73e131438c196e6dcb9f3393968a8936f1 +onnx;https://github.com/onnx/onnx/archive/refs/tags/v1.19.1.zip;c5215b5697dcdfd71799f001b8c4054a6bba6b09 # Use the latest commit of 10.9-GA onnx_tensorrt;https://github.com/onnx/onnx-tensorrt/archive/d5dce67db7c2e64b07e055571f5ec06f7f254de2.zip;01114d3b67650857281fa50faa2e412130a63b69 protobuf;https://github.com/protocolbuffers/protobuf/archive/refs/tags/v21.12.zip;7cf2733949036c7d52fda017badcab093fe73bfa diff --git a/cmake/external/helper_functions.cmake b/cmake/external/helper_functions.cmake index 55059b9500a8e..e8044411e4201 100644 --- a/cmake/external/helper_functions.cmake +++ b/cmake/external/helper_functions.cmake @@ -4,11 +4,11 @@ # 2. Set the cmake property COMPILE_WARNING_AS_ERROR to OFF for these external projects. function(onnxruntime_fetchcontent_declare contentName) + cmake_parse_arguments(PARSE_ARGV 1 ARG "" "URL;SOURCE_SUBDIR" "") + message(STATUS "Fetch ${contentName} from ${ARG_URL}") FetchContent_Declare(${ARGV}) string(TOLOWER ${contentName} contentNameLower) - list(FIND ARGN SOURCE_SUBDIR index_SOURCE_SUBDIR) - if(index_SOURCE_SUBDIR GREATER_EQUAL 0) - cmake_parse_arguments(PARSE_ARGV 1 ARG "" "SOURCE_SUBDIR" "") + if(NOT "${ARG_SOURCE_SUBDIR}" STREQUAL "") set(onnxruntime_${contentNameLower}_cmake_src_dir "${ARG_SOURCE_SUBDIR}" PARENT_SCOPE) endif() endfunction() diff --git a/cmake/external/onnxruntime_external_deps.cmake b/cmake/external/onnxruntime_external_deps.cmake index 8e1a880579b34..b6a741d8b0fe7 100644 --- a/cmake/external/onnxruntime_external_deps.cmake +++ b/cmake/external/onnxruntime_external_deps.cmake @@ -20,7 +20,7 @@ foreach(ONNXRUNTIME_DEP IN LISTS ONNXRUNTIME_DEPS_LIST) if(ONNXRUNTIME_DEP_URL MATCHES "^https://") # Search a local mirror folder - string(REGEX REPLACE "^https://" "${REPO_ROOT}/mirror/" LOCAL_URL "${ONNXRUNTIME_DEP_URL}") + string(REGEX REPLACE "^https://" "${onnxruntime_CMAKE_DEPS_MIRROR_DIR}/" LOCAL_URL "${ONNXRUNTIME_DEP_URL}") if(EXISTS "${LOCAL_URL}") cmake_path(ABSOLUTE_PATH LOCAL_URL) @@ -498,13 +498,7 @@ else() endif() if(Patch_FOUND) - set(ONNXRUNTIME_ONNX_PATCH_COMMAND - ${Patch_EXECUTABLE} --binary --ignore-whitespace -p1 < ${PROJECT_SOURCE_DIR}/patches/onnx/onnx.patch && - # Patch changes from https://github.com/onnx/onnx/pull/7253 to avoid unnecessary rebuilding. - # This change should be included in ONNX 1.19.1. 
- ${Patch_EXECUTABLE} --binary --ignore-whitespace -p1 < - ${PROJECT_SOURCE_DIR}/patches/onnx/avoid_regenerating_proto_files.patch - ) + set(ONNXRUNTIME_ONNX_PATCH_COMMAND ${Patch_EXECUTABLE} --binary --ignore-whitespace -p1 < ${PROJECT_SOURCE_DIR}/patches/onnx/onnx.patch) else() set(ONNXRUNTIME_ONNX_PATCH_COMMAND "") endif() diff --git a/cmake/onnxruntime_providers_cuda.cmake b/cmake/onnxruntime_providers_cuda.cmake index 68a3e9014b7b0..1d31eb1fbd207 100644 --- a/cmake/onnxruntime_providers_cuda.cmake +++ b/cmake/onnxruntime_providers_cuda.cmake @@ -182,8 +182,8 @@ # Since CUDA 12.8, compiling diagnostics become stricter if (CMAKE_CUDA_COMPILER_VERSION VERSION_GREATER_EQUAL 12.8) - target_compile_options(${target} PRIVATE "$<$:--relocatable-device-code=true>") - set_target_properties(${target} PROPERTIES CUDA_SEPARABLE_COMPILATION ON) + target_compile_options(${target} PRIVATE "$<$:--static-global-template-stub=false>") + if (MSVC) target_compile_options(${target} PRIVATE "$<$:SHELL:-Xcompiler /wd4505>") endif() diff --git a/cmake/patches/onnx/avoid_regenerating_proto_files.patch b/cmake/patches/onnx/avoid_regenerating_proto_files.patch deleted file mode 100644 index 804dfeb8f59c2..0000000000000 --- a/cmake/patches/onnx/avoid_regenerating_proto_files.patch +++ /dev/null @@ -1,46 +0,0 @@ -diff --git a/CMakeLists.txt b/CMakeLists.txt -index 479955793..cc3ef1400 100644 ---- a/CMakeLists.txt -+++ b/CMakeLists.txt -@@ -321,7 +321,7 @@ function(RELATIVE_PROTOBUF_GENERATE_CPP SRCS) - set(${SRCS}) - - set(GEN_PROTO_PY "${ONNX_ROOT}/onnx/gen_proto.py") -- set(GENERATED_FILE_TARGETS) -+ set(GENERATED_FILES) - foreach(INFILE ${ARGN}) - set(ABS_FILE "${ONNX_ROOT}/${INFILE}") - get_filename_component(FILE_DIR ${ABS_FILE} DIRECTORY) -@@ -371,12 +371,11 @@ function(RELATIVE_PROTOBUF_GENERATE_CPP SRCS) - list(APPEND GEN_PROTO_ARGS "${ONNX_PROTOC_EXECUTABLE}") - endif() - -- add_custom_target("${GENERATED_FILE_WE}_proto_file" -- COMMAND ${ONNX_PYTHON_INTERPRETER} "${GEN_PROTO_PY}" ${GEN_PROTO_ARGS} -- BYPRODUCTS "${GENERATED_PROTO}" -- DEPENDS ${INFILE} -- COMMENT "Running gen_proto.py on ${INFILE}" -- ) -+ # Use add_custom_command to avoid re-generate of PROTO files -+ add_custom_command(OUTPUT "${GENERATED_PROTO}" -+ COMMAND ${ONNX_PYTHON_INTERPRETER} "${GEN_PROTO_PY}" ${GEN_PROTO_ARGS} -+ DEPENDS ${INFILE} -+ COMMENT "Running gen_proto.py on ${INFILE}") - message("Generated: ${GENERATED_PROTO}") - - set(PROTOC_ARGS -@@ -393,11 +392,10 @@ function(RELATIVE_PROTOBUF_GENERATE_CPP SRCS) - list(APPEND PROTOC_ARGS ${CMAKE_CURRENT_BINARY_DIR}) - endif() - endif() -- list(APPEND GENERATED_FILE_TARGETS ${GENERATED_FILE_WE}_proto_file) -- add_custom_target(${GENERATED_FILE_WE}_src -+ list(APPEND GENERATED_FILES "${GENERATED_PROTO}") -+ add_custom_command(OUTPUT "${OUTPUT_PB_SRC}" - COMMAND "${ONNX_PROTOC_EXECUTABLE}" ${PROTOC_ARGS} -- BYPRODUCTS "${OUTPUT_PB_SRC}" -- DEPENDS ${GENERATED_FILE_TARGETS} -+ DEPENDS ${GENERATED_FILES} - COMMENT "Running C++ protocol buffer compiler on ${GENERATED_PROTO}") - endforeach() - diff --git a/cmake/patches/onnx/onnx.patch b/cmake/patches/onnx/onnx.patch index e8ae766062d08..047cb527bb4da 100644 --- a/cmake/patches/onnx/onnx.patch +++ b/cmake/patches/onnx/onnx.patch @@ -1,5 +1,5 @@ diff --git a/CMakeLists.txt b/CMakeLists.txt -index 47995579..6cc439f6 100644 +index cc3ef140..f70312ba 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -57,6 +57,7 @@ option(ONNX_USE_LITE_PROTO "Use lite protobuf instead of full." 
OFF) @@ -10,7 +10,7 @@ index 47995579..6cc439f6 100644 if(WIN32) option(ONNX_USE_MSVC_STATIC_RUNTIME "Build with MSVC static runtime" OFF) endif() -@@ -411,14 +412,28 @@ relative_protobuf_generate_cpp(ONNX_PROTO_SRCS +@@ -409,14 +410,28 @@ relative_protobuf_generate_cpp(ONNX_PROTO_SRCS add_library(onnx_proto ${ONNX_PROTO_SRCS}) @@ -47,7 +47,7 @@ index 47995579..6cc439f6 100644 # Hide all symbols we don't need set_target_properties(onnx_proto PROPERTIES CXX_VISIBILITY_PRESET hidden) -@@ -440,19 +455,6 @@ add_onnx_global_defines(onnx_proto) +@@ -438,19 +453,6 @@ add_onnx_global_defines(onnx_proto) target_include_directories(onnx_proto PUBLIC $ $) @@ -68,10 +68,10 @@ index 47995579..6cc439f6 100644 if(CMAKE_SYSTEM_NAME STREQUAL "AIX") # whole-archive linker option not available on AIX. diff --git a/onnx/defs/nn/old.cc b/onnx/defs/nn/old.cc -index 40635f97..44770774 100644 +index ad6dd0c1..50259f32 100644 --- a/onnx/defs/nn/old.cc +++ b/onnx/defs/nn/old.cc -@@ -4090,7 +4090,6 @@ ONNX_OPERATOR_SET_SCHEMA( +@@ -4091,7 +4091,6 @@ ONNX_OPERATOR_SET_SCHEMA( GroupNormalization, 18, OpSchema() @@ -80,7 +80,7 @@ index 40635f97..44770774 100644 .Attr("epsilon", "The epsilon value to use to avoid division by zero.", AttributeProto::FLOAT, 1e-5f) .Attr( diff --git a/onnx/defs/schema.h b/onnx/defs/schema.h -index ddd95454..34647987 100644 +index 7e9bc27f..4b87c5a5 100644 --- a/onnx/defs/schema.h +++ b/onnx/defs/schema.h @@ -999,7 +999,7 @@ class OpSchemaRegistry final : public ISchemaRegistry { diff --git a/cmake/vcpkg-ports/onnx/avoid_regenerating_proto_files.patch b/cmake/vcpkg-ports/onnx/avoid_regenerating_proto_files.patch deleted file mode 100644 index 804dfeb8f59c2..0000000000000 --- a/cmake/vcpkg-ports/onnx/avoid_regenerating_proto_files.patch +++ /dev/null @@ -1,46 +0,0 @@ -diff --git a/CMakeLists.txt b/CMakeLists.txt -index 479955793..cc3ef1400 100644 ---- a/CMakeLists.txt -+++ b/CMakeLists.txt -@@ -321,7 +321,7 @@ function(RELATIVE_PROTOBUF_GENERATE_CPP SRCS) - set(${SRCS}) - - set(GEN_PROTO_PY "${ONNX_ROOT}/onnx/gen_proto.py") -- set(GENERATED_FILE_TARGETS) -+ set(GENERATED_FILES) - foreach(INFILE ${ARGN}) - set(ABS_FILE "${ONNX_ROOT}/${INFILE}") - get_filename_component(FILE_DIR ${ABS_FILE} DIRECTORY) -@@ -371,12 +371,11 @@ function(RELATIVE_PROTOBUF_GENERATE_CPP SRCS) - list(APPEND GEN_PROTO_ARGS "${ONNX_PROTOC_EXECUTABLE}") - endif() - -- add_custom_target("${GENERATED_FILE_WE}_proto_file" -- COMMAND ${ONNX_PYTHON_INTERPRETER} "${GEN_PROTO_PY}" ${GEN_PROTO_ARGS} -- BYPRODUCTS "${GENERATED_PROTO}" -- DEPENDS ${INFILE} -- COMMENT "Running gen_proto.py on ${INFILE}" -- ) -+ # Use add_custom_command to avoid re-generate of PROTO files -+ add_custom_command(OUTPUT "${GENERATED_PROTO}" -+ COMMAND ${ONNX_PYTHON_INTERPRETER} "${GEN_PROTO_PY}" ${GEN_PROTO_ARGS} -+ DEPENDS ${INFILE} -+ COMMENT "Running gen_proto.py on ${INFILE}") - message("Generated: ${GENERATED_PROTO}") - - set(PROTOC_ARGS -@@ -393,11 +392,10 @@ function(RELATIVE_PROTOBUF_GENERATE_CPP SRCS) - list(APPEND PROTOC_ARGS ${CMAKE_CURRENT_BINARY_DIR}) - endif() - endif() -- list(APPEND GENERATED_FILE_TARGETS ${GENERATED_FILE_WE}_proto_file) -- add_custom_target(${GENERATED_FILE_WE}_src -+ list(APPEND GENERATED_FILES "${GENERATED_PROTO}") -+ add_custom_command(OUTPUT "${OUTPUT_PB_SRC}" - COMMAND "${ONNX_PROTOC_EXECUTABLE}" ${PROTOC_ARGS} -- BYPRODUCTS "${OUTPUT_PB_SRC}" -- DEPENDS ${GENERATED_FILE_TARGETS} -+ DEPENDS ${GENERATED_FILES} - COMMENT "Running C++ protocol buffer compiler on ${GENERATED_PROTO}") - endforeach() - diff --git 
a/cmake/vcpkg-ports/onnx/binskim.patch b/cmake/vcpkg-ports/onnx/binskim.patch index e8ae766062d08..047cb527bb4da 100644 --- a/cmake/vcpkg-ports/onnx/binskim.patch +++ b/cmake/vcpkg-ports/onnx/binskim.patch @@ -1,5 +1,5 @@ diff --git a/CMakeLists.txt b/CMakeLists.txt -index 47995579..6cc439f6 100644 +index cc3ef140..f70312ba 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -57,6 +57,7 @@ option(ONNX_USE_LITE_PROTO "Use lite protobuf instead of full." OFF) @@ -10,7 +10,7 @@ index 47995579..6cc439f6 100644 if(WIN32) option(ONNX_USE_MSVC_STATIC_RUNTIME "Build with MSVC static runtime" OFF) endif() -@@ -411,14 +412,28 @@ relative_protobuf_generate_cpp(ONNX_PROTO_SRCS +@@ -409,14 +410,28 @@ relative_protobuf_generate_cpp(ONNX_PROTO_SRCS add_library(onnx_proto ${ONNX_PROTO_SRCS}) @@ -47,7 +47,7 @@ index 47995579..6cc439f6 100644 # Hide all symbols we don't need set_target_properties(onnx_proto PROPERTIES CXX_VISIBILITY_PRESET hidden) -@@ -440,19 +455,6 @@ add_onnx_global_defines(onnx_proto) +@@ -438,19 +453,6 @@ add_onnx_global_defines(onnx_proto) target_include_directories(onnx_proto PUBLIC $ $) @@ -68,10 +68,10 @@ index 47995579..6cc439f6 100644 if(CMAKE_SYSTEM_NAME STREQUAL "AIX") # whole-archive linker option not available on AIX. diff --git a/onnx/defs/nn/old.cc b/onnx/defs/nn/old.cc -index 40635f97..44770774 100644 +index ad6dd0c1..50259f32 100644 --- a/onnx/defs/nn/old.cc +++ b/onnx/defs/nn/old.cc -@@ -4090,7 +4090,6 @@ ONNX_OPERATOR_SET_SCHEMA( +@@ -4091,7 +4091,6 @@ ONNX_OPERATOR_SET_SCHEMA( GroupNormalization, 18, OpSchema() @@ -80,7 +80,7 @@ index 40635f97..44770774 100644 .Attr("epsilon", "The epsilon value to use to avoid division by zero.", AttributeProto::FLOAT, 1e-5f) .Attr( diff --git a/onnx/defs/schema.h b/onnx/defs/schema.h -index ddd95454..34647987 100644 +index 7e9bc27f..4b87c5a5 100644 --- a/onnx/defs/schema.h +++ b/onnx/defs/schema.h @@ -999,7 +999,7 @@ class OpSchemaRegistry final : public ISchemaRegistry { diff --git a/cmake/vcpkg-ports/onnx/portfile.cmake b/cmake/vcpkg-ports/onnx/portfile.cmake index 27f5ea5fadd79..882850963a0c0 100644 --- a/cmake/vcpkg-ports/onnx/portfile.cmake +++ b/cmake/vcpkg-ports/onnx/portfile.cmake @@ -4,12 +4,9 @@ vcpkg_from_github( OUT_SOURCE_PATH SOURCE_PATH REPO onnx/onnx REF "v${VERSION}" - SHA512 e6f7b5782a43a91783607549e4d0f0a9cbd46dfb67a602f81aaffc7bcdd8f450fe9c225f0bc314704f2923e396f0df5b03ea91af4a7887203c0b8372bc2749d0 + SHA512 cf6ff4c0bb6cc16ce5f4d6267480d35f3c7a5fde94d10e1358928ff6e4ec6d756a7c5d34a500e60bbd8eb1912c8af21aa763719321b330f56a0eb6b9b810ef60 PATCHES fix-cmakelists.patch - # Patch changes from https://github.com/onnx/onnx/pull/7253 to avoid unnecessary rebuilding. - # This change should be included in ONNX 1.19.1. - avoid_regenerating_proto_files.patch fix-dependency-protobuf.patch binskim.patch ) diff --git a/cmake/vcpkg-ports/onnx/vcpkg.json b/cmake/vcpkg-ports/onnx/vcpkg.json index 350db2e35061a..ad0d1aaf15f51 100644 --- a/cmake/vcpkg-ports/onnx/vcpkg.json +++ b/cmake/vcpkg-ports/onnx/vcpkg.json @@ -1,6 +1,6 @@ { "name": "onnx", - "version-semver": "1.19.0", + "version-semver": "1.19.1", "port-version": 1, "description": "Open standard for machine learning interoperability", "homepage": "https://onnx.ai", diff --git a/docs/How_To_Update_ONNX_Dev_Notes.md b/docs/How_To_Update_ONNX_Dev_Notes.md index 8da19ddc51cb7..8c1280431c384 100644 --- a/docs/How_To_Update_ONNX_Dev_Notes.md +++ b/docs/How_To_Update_ONNX_Dev_Notes.md @@ -35,7 +35,7 @@ git add onnx 1. 
Modify [cmake/vcpkg-ports/onnx/binskim.patch](/cmake/vcpkg-ports/onnx/binskim.patch) to be the same as [cmake/patches/onnx/onnx.patch](/cmake/patches/onnx/onnx.patch). 2. The other patches are required/created by vcpkg repository to build ONNX. We just need to re-run diff to makes sure the patches can be applied in the updated ONNX version. 3. Update [cmake/vcpkg-ports/onnx/portfile.cmake](/cmake/vcpkg-ports/onnx/portfile.cmake) with the correct commit id and SHA512. (alternatively, build it with the wrong SHA and ORT should tell you the expected one.) -4. Upload your package: [Follow the instructions](https://microsoft.sharepoint.com/teams/ONNX2/_layouts/15/Doc.aspx?sourcedoc={170774be-e1c6-4f8b-a3ae-984f211fe410}&action=edit&wd=target%28Development.)one%7C63d3ab47-51d1-4a62-9965-66882234bd44%2FAdd%20or%20Update%20a%20C%2B%2B%20dependency%7Cb6ae6a97-94fc-4436-8fc6-08c21ae895da%2F%29&wdorigin=NavigationUrl +4. Upload your package: [Follow the instructions](https://microsoft.sharepoint.com/:o:/r/teams/ONNX2/_layouts/15/Doc.aspx?sourcedoc=%7B170774BE-E1C6-4F8B-A3AE-984F211FE410%7D&wd=target(Development.one%7C63D3AB47-51D1-4A62-9965-66882234BD44%2FUpdate%20a%20VCPKG%20package%7CB6AE6A97-94FC-4436-8FC6-08C21AE895DA%2F)&wdpartid=%7BB5CF19CC-40FE-0EC7-32B6-8119B427B32A%7D%7B1%7D&wdsectionfileid=%7B9DD25660-A195-48EA-B9E0-DF8B902AFDD7%7D&ovuser=72f988bf-86f1-41af-91ab-2d7cd011db47%2Ctitaiwang%40microsoft.com&clickparams=eyJBcHBOYW1lIjoiVGVhbXMtRGVza3RvcCIsIkFwcFZlcnNpb24iOiI0OS8yNTA5MTExNjAxNiIsIkhhc0ZlZGVyYXRlZFVzZXIiOmZhbHNlfQ%3D%3D&CID=fb9dcaa1-c0b5-1000-5597-c19e3adf468c&cidOR=SPO)one%7C63d3ab47-51d1-4a62-9965-66882234bd44%2FAdd%20or%20Update%20a%20C%2B%2B%20dependency%7Cb6ae6a97-94fc-4436-8fc6-08c21ae895da%2F%29&wdorigin=NavigationUrl Alternatively, directly run Terrapin to upload ONNX package (need SHA512): diff --git a/js/web/docs/webnn-operators.md b/js/web/docs/webnn-operators.md index 295aacc6fffa3..ea88f291e5597 100644 --- a/js/web/docs/webnn-operators.md +++ b/js/web/docs/webnn-operators.md @@ -46,7 +46,7 @@ platforms. 
Check the [WebNN status](https://webmachinelearning.github.io/webnn-s | GatherElements | ai.onnx(11-12, 13+) | gatherElements | | | GatherND | ai.onnx(11, 12, 13+) | gatherND | Only supports 'batch_dims' == 0 | | Gelu | ai.onnx(20+) | gelu | | -| Gemm | ai.onnx(7-8, 9-10, 11-12, 13+) | gemm | Only supports 1-D 'C' input | +| Gemm | ai.onnx(7-8, 9-10, 11-12, 13+) | gemm | | | GlobalAveragePool | ai.onnx(7+) | averagePool2d | Only supports 4-D input | | GlobalMaxPool | ai.onnx(7+) | maxPool2d | Only supports 4-D input | | GlobalLpPool| ai.onnx(7+) | l2Pool2d | Only supports 4-D input, 'p' value is 2 | diff --git a/js/web/test/e2e/exports/testcases/vite-default/package-lock.json b/js/web/test/e2e/exports/testcases/vite-default/package-lock.json index 48f0a8f3e9d5c..e880f6bca2ac4 100644 --- a/js/web/test/e2e/exports/testcases/vite-default/package-lock.json +++ b/js/web/test/e2e/exports/testcases/vite-default/package-lock.json @@ -12,7 +12,7 @@ }, "devDependencies": { "@vitejs/plugin-vue": "^5.2.1", - "vite": "^6.3.5" + "vite": "^6.3.6" } }, "node_modules/@babel/helper-string-parser": { @@ -1114,9 +1114,9 @@ } }, "node_modules/vite": { - "version": "6.3.5", - "resolved": "https://registry.npmjs.org/vite/-/vite-6.3.5.tgz", - "integrity": "sha512-cZn6NDFE7wdTpINgs++ZJ4N49W2vRp8LCKrn3Ob1kYNtOo21vfDoaV5GzBfLU4MovSAB8uNRm4jgzVQZ+mBzPQ==", + "version": "6.3.6", + "resolved": "https://registry.npmjs.org/vite/-/vite-6.3.6.tgz", + "integrity": "sha512-0msEVHJEScQbhkbVTb/4iHZdJ6SXp/AvxL2sjwYQFfBqleHtnCqv1J3sa9zbWz/6kW1m9Tfzn92vW+kZ1WV6QA==", "dev": true, "license": "MIT", "dependencies": { diff --git a/js/web/test/e2e/exports/testcases/vite-default/package.json b/js/web/test/e2e/exports/testcases/vite-default/package.json index f7d5751354905..84013e2aecb88 100644 --- a/js/web/test/e2e/exports/testcases/vite-default/package.json +++ b/js/web/test/e2e/exports/testcases/vite-default/package.json @@ -13,6 +13,6 @@ }, "devDependencies": { "@vitejs/plugin-vue": "^5.2.1", - "vite": "^6.3.5" + "vite": "^6.3.6" } } diff --git a/js/web/test/suite-test-list.jsonc b/js/web/test/suite-test-list.jsonc index 3f1face2a043c..80991a3ebbb5f 100644 --- a/js/web/test/suite-test-list.jsonc +++ b/js/web/test/suite-test-list.jsonc @@ -2147,66 +2147,66 @@ "test_reduce_log_sum_default", "test_reduce_log_sum_desc_axes", // tests "test_reduce_log_sum_exp_*" on opset17/opset18 are excluded because they use float64. 
- // "opset{7,8,9}/test_reduce_log_sum_exp_default_axes_keepdims_example", - // "opset{7,8,9}/test_reduce_log_sum_exp_default_axes_keepdims_random", - // "opset{7,8,9}/test_reduce_log_sum_exp_do_not_keepdims_example", - // "opset{7,8,9}/test_reduce_log_sum_exp_do_not_keepdims_random", - // "opset{7,8,9}/test_reduce_log_sum_exp_keepdims_example", - // "opset{7,8,9}/test_reduce_log_sum_exp_keepdims_random", - // "opset11/test_reduce_log_sum_exp_negative_axes_keepdims_example", - // "opset11/test_reduce_log_sum_exp_negative_axes_keepdims_random", + "opset{7,8,9}/test_reduce_log_sum_exp_default_axes_keepdims_example", + "opset{7,8,9}/test_reduce_log_sum_exp_default_axes_keepdims_random", + "opset{7,8,9}/test_reduce_log_sum_exp_do_not_keepdims_example", + "opset{7,8,9}/test_reduce_log_sum_exp_do_not_keepdims_random", + "opset{7,8,9}/test_reduce_log_sum_exp_keepdims_example", + "opset{7,8,9}/test_reduce_log_sum_exp_keepdims_random", + "opset11/test_reduce_log_sum_exp_negative_axes_keepdims_example", + "opset11/test_reduce_log_sum_exp_negative_axes_keepdims_random", "test_reduce_log_sum_negative_axes", "test_reduce_log_sum", "test_reduce_max_default_axes_keepdim_example", - // "test_reduce_max_default_axes_keepdims_random", - // "test_reduce_max_do_not_keepdims_example", - // "test_reduce_max_do_not_keepdims_random", - // "test_reduce_max_keepdims_example", - // "test_reduce_max_keepdims_random", - // "test_reduce_max_negative_axes_keepdims_example", - // "test_reduce_max_negative_axes_keepdims_random", - // "test_reduce_mean_default_axes_keepdims_example", - // "test_reduce_mean_default_axes_keepdims_random", - // "test_reduce_mean_do_not_keepdims_example", - // "test_reduce_mean_do_not_keepdims_random", - // "test_reduce_mean_keepdims_example", - // "test_reduce_mean_keepdims_random", - // "test_reduce_mean_negative_axes_keepdims_example", - // "test_reduce_mean_negative_axes_keepdims_random", - // "test_reduce_min_default_axes_keepdims_example", - // "test_reduce_min_default_axes_keepdims_random", - // "test_reduce_min_do_not_keepdims_example", - // "test_reduce_min_do_not_keepdims_random", - // "test_reduce_min_keepdims_example", - // "test_reduce_min_keepdims_random", - // "test_reduce_min_negative_axes_keepdims_example", - // "test_reduce_min_negative_axes_keepdims_random", - // "test_reduce_prod_default_axes_keepdims_example", - // "test_reduce_prod_default_axes_keepdims_random", - // "test_reduce_prod_do_not_keepdims_example", - // "test_reduce_prod_do_not_keepdims_random", - // "test_reduce_prod_keepdims_example", - // "test_reduce_prod_keepdims_random", - // "test_reduce_prod_negative_axes_keepdims_example", - // "test_reduce_prod_negative_axes_keepdims_random", - // "test_reduce_sum_default_axes_keepdims_example", - // "test_reduce_sum_default_axes_keepdims_random", - // "test_reduce_sum_do_not_keepdims_example", - // "test_reduce_sum_do_not_keepdims_random", + "test_reduce_max_default_axes_keepdims_random", + "test_reduce_max_do_not_keepdims_example", + "test_reduce_max_do_not_keepdims_random", + "test_reduce_max_keepdims_example", + "test_reduce_max_keepdims_random", + "test_reduce_max_negative_axes_keepdims_example", + "test_reduce_max_negative_axes_keepdims_random", + "test_reduce_mean_default_axes_keepdims_example", + "test_reduce_mean_default_axes_keepdims_random", + "test_reduce_mean_do_not_keepdims_example", + "test_reduce_mean_do_not_keepdims_random", + "test_reduce_mean_keepdims_example", + "test_reduce_mean_keepdims_random", + 
"test_reduce_mean_negative_axes_keepdims_example", + "test_reduce_mean_negative_axes_keepdims_random", + "test_reduce_min_default_axes_keepdims_example", + "test_reduce_min_default_axes_keepdims_random", + "test_reduce_min_do_not_keepdims_example", + "test_reduce_min_do_not_keepdims_random", + "test_reduce_min_keepdims_example", + "test_reduce_min_keepdims_random", + "test_reduce_min_negative_axes_keepdims_example", + "test_reduce_min_negative_axes_keepdims_random", + "test_reduce_prod_default_axes_keepdims_example", + "test_reduce_prod_default_axes_keepdims_random", + "test_reduce_prod_do_not_keepdims_example", + "test_reduce_prod_do_not_keepdims_random", + "test_reduce_prod_keepdims_example", + "test_reduce_prod_keepdims_random", + "test_reduce_prod_negative_axes_keepdims_example", + "test_reduce_prod_negative_axes_keepdims_random", + "test_reduce_sum_default_axes_keepdims_example", + "test_reduce_sum_default_axes_keepdims_random", + "test_reduce_sum_do_not_keepdims_example", + "test_reduce_sum_do_not_keepdims_random", "test_reduce_sum_empty_axes_input_noop_example", "test_reduce_sum_empty_axes_input_noop_random", - // "test_reduce_sum_keepdims_example", - // "test_reduce_sum_keepdims_random", - // "test_reduce_sum_negative_axes_keepdims_example", - // "test_reduce_sum_negative_axes_keepdims_random", - // "test_reduce_sum_square_default_axes_keepdims_example", - // "test_reduce_sum_square_default_axes_keepdims_random", - // "test_reduce_sum_square_do_not_keepdims_example", - // "test_reduce_sum_square_do_not_keepdims_random", - // "test_reduce_sum_square_keepdims_example", - // "test_reduce_sum_square_keepdims_random", - // "test_reduce_sum_square_negative_axes_keepdims_example", - // "test_reduce_sum_square_negative_axes_keepdims_random", + "test_reduce_sum_keepdims_example", + "test_reduce_sum_keepdims_random", + "test_reduce_sum_negative_axes_keepdims_example", + "test_reduce_sum_negative_axes_keepdims_random", + "test_reduce_sum_square_default_axes_keepdims_example", + "test_reduce_sum_square_default_axes_keepdims_random", + "test_reduce_sum_square_do_not_keepdims_example", + "test_reduce_sum_square_do_not_keepdims_random", + "test_reduce_sum_square_keepdims_example", + "test_reduce_sum_square_keepdims_random", + "test_reduce_sum_square_negative_axes_keepdims_example", + "test_reduce_sum_square_negative_axes_keepdims_random", // "test_reflect_pad", "test_relu", "test_reshape_allowzero_reordered", diff --git a/onnxruntime/contrib_ops/cpu/sparse/sparse_attention.cc b/onnxruntime/contrib_ops/cpu/sparse/sparse_attention.cc index 469084e7b4491..c51fc1cf54815 100644 --- a/onnxruntime/contrib_ops/cpu/sparse/sparse_attention.cc +++ b/onnxruntime/contrib_ops/cpu/sparse/sparse_attention.cc @@ -130,6 +130,11 @@ Status SparseAttention::Compute(OpKernelContext* context) const { allocator, batch_size, kv_num_heads_, sequence_length, head_size, value, V)); } + OrtValue RotaryQKV; + OrtValue RotaryQ; + OrtValue RotaryK; + T* q_rotary = Q.GetMutable()->MutableData(); + T* k_rotary = packed_qkv ? 
nullptr : K.GetMutable()->MutableData(); if (do_rotary_) { rotary_embedding_helper::RotaryParameters rotary_params = {}; rotary_params.batch_size = batch_size; @@ -167,30 +172,22 @@ Status SparseAttention::Compute(OpKernelContext* context) const { const T* q_input; const T* k_input; - T* q_rotary; - T* k_rotary; if (packed_qkv) { - OrtValue RotaryQKV; TensorShape qkv_shape({batch_size, num_heads_ + 2 * kv_num_heads_, sequence_length, head_size}); Tensor::InitOrtValue(element_type, qkv_shape, allocator, RotaryQKV); q_input = Q.Get().Data(); k_input = q_input + num_heads_ * sequence_length * head_size; q_rotary = RotaryQKV.GetMutable()->MutableData(); k_rotary = q_rotary + num_heads_ * sequence_length * head_size; - Q = RotaryQKV; } else { - OrtValue RotaryQ; TensorShape q_shape({batch_size, num_heads_, sequence_length, head_size}); Tensor::InitOrtValue(element_type, q_shape, allocator, RotaryQ); - OrtValue RotaryK; TensorShape k_shape({batch_size, kv_num_heads_, sequence_length, head_size}); Tensor::InitOrtValue(element_type, k_shape, allocator, RotaryK); q_input = Q.Get().Data(); k_input = K.Get().Data(); q_rotary = RotaryQ.GetMutable()->MutableData(); k_rotary = RotaryK.GetMutable()->MutableData(); - Q = RotaryQ; - K = RotaryK; } ORT_RETURN_IF_ERROR(RunRotaryEmbedding(tp, rotary_params, q_input, @@ -221,9 +218,8 @@ Status SparseAttention::Compute(OpKernelContext* context) const { ORT_RETURN_IF_ERROR(context->GetTempSpaceAllocator(&allocator)); // Compute the attention score and apply the score to V - return ApplyAttention(Q.Get().Data(), packed_qkv ? nullptr : K.Get().Data(), - packed_qkv ? nullptr : V.Get().Data(), past_key, past_value, - output, present_key, present_value, + return ApplyAttention(q_rotary, packed_qkv ? nullptr : k_rotary, packed_qkv ? 
nullptr : V.Get().Data(), + past_key, past_value, output, present_key, present_value, total_key_lengths, block_row_indices, block_col_indices, parameters, allocator, context); } } // namespace contrib diff --git a/onnxruntime/contrib_ops/webgpu/bert/flash_attention.cc b/onnxruntime/contrib_ops/webgpu/bert/flash_attention.cc index b5c1f73d1678d..a9bd4afc5cd09 100644 --- a/onnxruntime/contrib_ops/webgpu/bert/flash_attention.cc +++ b/onnxruntime/contrib_ops/webgpu/bert/flash_attention.cc @@ -31,6 +31,11 @@ Status CopyKVCacheProgram::GenerateShaderCode(ShaderHelper& shader) const { const auto& present_key = shader.AddOutput("present_key", ShaderUsage::UseUniform | ShaderUsage::UseIndicesTypeAlias); const auto& present_value = shader.AddOutput("present_value", ShaderUsage::UseUniform); const auto& copy_kv_shape = shader.AddIndices("copy_kv_shape"); + // If prepare_indirect_dispatch is enabled, add seqlen_k input and indirect_buffer output + if (prepare_indirect_dispatch_) { + shader.AddInput("seqlen_k", ShaderUsage::None); + shader.AddOutput("indirect_buffer", ShaderUsage::None); + } shader.MainFunctionBody() << shader.GuardAgainstOutOfBoundsWorkgroupSizes("uniforms.copy_size") << " let output_indices = " << copy_kv_shape.OffsetToIndices("global_idx") << ";\n" @@ -38,8 +43,26 @@ Status CopyKVCacheProgram::GenerateShaderCode(ShaderHelper& shader) const { " let sequence_id = output_indices[2];\n" " let num_head_id = output_indices[1];\n" " let batch = output_indices[0];\n"; + if (prepare_indirect_dispatch_) { + shader.MainFunctionBody() << " let total_seq_length = u32(seqlen_k[0u]) + 1u;\n"; + } else { + shader.MainFunctionBody() << " let total_seq_length = uniforms.total_sequence_length;\n"; + } + + // Add indirect dispatch logic for thread 0 + if (prepare_indirect_dispatch_) { + // TODO: Add NormalizeDispatchGroupSize logic here to avoid exceeding max dispatch size. + shader.MainFunctionBody() << " // Prepare indirect dispatch buffer for thread 0\n" + << " if (global_idx == 0u) {\n" + << " let num_total_seq_length_tile = (total_seq_length + uniforms.tile_size - 1u) / uniforms.tile_size;\n" + << " indirect_buffer[0] = num_total_seq_length_tile;\n" + << " indirect_buffer[1] = uniforms.num_heads;\n" + << " indirect_buffer[2] = 1u;\n" + << " }\n\n"; + } + if (has_past_) { - shader.MainFunctionBody() << "let past_sequence_length = uniforms.past_sequence_length;\n"; + shader.MainFunctionBody() << "let past_sequence_length = total_seq_length - uniforms.kv_sequence_length;\n"; if (past_present_share_buffer_) { shader.MainFunctionBody() << " let present_offset = " << present_key.IndicesToOffset("present_key_indices_t(batch, num_head_id, past_sequence_length + sequence_id, head_size_id)") << ";\n" << " let offset = " << key.IndicesToOffset(kv_BNSH_ ? "key_indices_t(batch, num_head_id, sequence_id, head_size_id)" : "key_indices_t(batch, sequence_id, num_head_id, head_size_id)") << ";\n" @@ -70,10 +93,12 @@ Status CopyKVCacheProgram::GenerateShaderCode(ShaderHelper& shader) const { Status CopyKVCache(onnxruntime::webgpu::ComputeContext& context, const WebgpuAttentionParameters& parameters, const Tensor* K, const Tensor* past_key, Tensor* present_key, - const Tensor* V, const Tensor* past_value, Tensor* present_value) { + const Tensor* V, const Tensor* past_value, Tensor* present_value, + uint32_t tile_size, const Tensor* seqlen_k, Tensor* indirect_buffer) { // CopyKVCache takes past key/value and current key/value and copies them to present key and value. 
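A minimal, self-contained sketch (illustrative names, not the ORT API) of the scoping pattern behind the SparseAttention change above: the objects that own the rotated Q/K buffers are declared in the enclosing scope, so the raw q_rotary/k_rotary pointers handed to the later attention call stay valid, and the attention call consumes the rotated data directly instead of re-reading the original Q/K tensors.

```cpp
#include <cstddef>
#include <vector>

void RotaryThenAttention(bool packed_qkv, std::size_t n) {
  std::vector<float> rotary_qkv;            // owners live for the whole function,
  std::vector<float> rotary_q, rotary_k;    // not just inside the branch below
  float* q_rotary = nullptr;
  float* k_rotary = nullptr;
  if (packed_qkv) {
    rotary_qkv.resize(3 * n);
    q_rotary = rotary_qkv.data();
    k_rotary = q_rotary + n;
  } else {
    rotary_q.resize(n);
    rotary_k.resize(n);
    q_rotary = rotary_q.data();
    k_rotary = rotary_k.data();
  }
  // ... rotary embedding writes into q_rotary / k_rotary ...
  // The attention step then reads the rotated buffers directly, e.g.
  // ApplyAttention(q_rotary, packed_qkv ? nullptr : k_rotary, ...);
  (void)q_rotary;
  (void)k_rotary;
}
```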
// This makes it so that FlashAttention only needs to look at present key and value, and saves // number of input buffers in the shader, which we run out of (<=8) without this optimization. + // If indirect_buffer is provided, also prepare indirect dispatch buffer for flash attention. const int components = parameters.head_size_ % 4 == 0 ? 4 : (parameters.head_size_ % 2 == 0 ? 2 : 1); bool has_past = (parameters.total_sequence_length_ - parameters.kv_sequence_length_) > 0; // parameters.total_sequence_length_ is past_sequence_length + kv_sequence_length. @@ -83,7 +108,12 @@ Status CopyKVCache(onnxruntime::webgpu::ComputeContext& context, const WebgpuAtt int copy_sequence_length = has_past && parameters.past_present_share_buffer_ ? parameters.kv_sequence_length_ : parameters.total_sequence_length_; TensorShape copy_kv_shape{parameters.batch_size_, num_heads, copy_sequence_length, parameters.head_size_ / components}; int64_t copy_size = copy_kv_shape.Size(); - CopyKVCacheProgram program{"CopyKVCache", has_past, parameters.qkv_format_ == Q_K_V_BSNH_BNSH_BNSH, parameters.past_present_share_buffer_}; + + // Determine if we need to prepare indirect dispatch + bool prepare_indirect_dispatch = (indirect_buffer != nullptr); + + CopyKVCacheProgram program{"CopyKVCache", has_past, parameters.qkv_format_ == Q_K_V_BSNH_BNSH_BNSH, parameters.past_present_share_buffer_, + prepare_indirect_dispatch}; if (parameters.qkv_format_ == Q_K_V_BSNH_BNSH_BNSH) { program.AddInputs({{K, ProgramTensorMetadataDependency::TypeAndRank, components}, {V, ProgramTensorMetadataDependency::TypeAndRank, components}}); @@ -94,20 +124,31 @@ Status CopyKVCache(onnxruntime::webgpu::ComputeContext& context, const WebgpuAtt program.AddInputs({{K, ProgramTensorMetadataDependency::TypeAndRank, reshaped_KV_shape, components}, {V, ProgramTensorMetadataDependency::TypeAndRank, reshaped_KV_shape, components}}); } + + if (prepare_indirect_dispatch) { + program.AddInput({seqlen_k, ProgramTensorMetadataDependency::None}); + } + if (has_past && !parameters.past_present_share_buffer_) { program.AddInputs({{past_key, ProgramTensorMetadataDependency::TypeAndRank, components}, {past_value, ProgramTensorMetadataDependency::TypeAndRank, components}}); } program.AddOutputs({{present_key, ProgramTensorMetadataDependency::Rank, components}, - {present_value, ProgramTensorMetadataDependency::Rank, components}}) - .AddIndices(std::move(copy_kv_shape)); + {present_value, ProgramTensorMetadataDependency::Rank, components}}); + + if (prepare_indirect_dispatch) { + program.AddOutput({indirect_buffer, ProgramTensorMetadataDependency::None}); + } + + program.AddIndices(std::move(copy_kv_shape)); program.SetDispatchGroupSize(static_cast((copy_size + 63) / 64)) .SetWorkgroupSize(64) - .CacheHint(has_past, parameters.qkv_format_, parameters.past_present_share_buffer_) + .CacheHint(has_past, parameters.qkv_format_, parameters.past_present_share_buffer_, prepare_indirect_dispatch) .AddUniformVariables({{static_cast(copy_size)}, - // Note that when parameters.past_present_share_buffer_ is true, parameters.past_sequence_length_ will become to - // max_sequence_length. To get a valid past_sequence_length, we use total_sequence_length - kv_sequence_length. 
- {static_cast(parameters.total_sequence_length_ - parameters.kv_sequence_length_)}}); + {static_cast(parameters.total_sequence_length_)}, + {static_cast(parameters.kv_sequence_length_)}, + {tile_size}, + {static_cast(parameters.num_heads_)}}); return context.RunProgram(program); } @@ -147,6 +188,9 @@ Status FlashAttentionProgram::GenerateShaderCode(ShaderHelper& shader) const { Status FlashAttentionDecodeQKTProgram::GenerateShaderCode(ShaderHelper& shader) const { shader.AddInput("q", ShaderUsage::UseUniform | ShaderUsage::UseValueTypeAlias | ShaderUsage::UseElementTypeAlias); shader.AddInput("present_key", ShaderUsage::UseUniform | ShaderUsage::UseValueTypeAlias); + if (use_indirect_dispatch_) { + shader.AddInput("seqlens_k", ShaderUsage::None); + } if (has_attention_bias_) { shader.AddInput("attention_bias", ShaderUsage::UseUniform); } @@ -159,23 +203,25 @@ Status FlashAttentionDecodeQKTProgram::GenerateShaderCode(ShaderHelper& shader) WGSL_TEMPLATE_PARAMETER(has_attention_bias, has_attention_bias_), WGSL_TEMPLATE_PARAMETER(sub_tile_count, sub_tile_count), WGSL_TEMPLATE_PARAMETER(tile_size, tile_size_), - WGSL_TEMPLATE_PARAMETER(tile_size_k_vec, tile_size_k_vec)); + WGSL_TEMPLATE_PARAMETER(tile_size_k_vec, tile_size_k_vec), + WGSL_TEMPLATE_PARAMETER(use_indirect_dispatch, use_indirect_dispatch_)); } Status ComputeFlashAttentionDecodeQKT(onnxruntime::webgpu::ComputeContext& context, const Tensor* Q, - const Tensor* attention_bias, Tensor* output, Tensor* present_key, Tensor* metadata, - const WebgpuAttentionParameters& parameters, uint32_t num_total_seq_length_tile, - uint32_t num_present_sequence_length_tile, uint32_t tile_size, - uint32_t present_sequence_length) { + const Tensor* attention_bias, Tensor* output, Tensor* present_key, Tensor* metadata, const Tensor* seqlen_k, + const WebgpuAttentionParameters& parameters, const Tensor* indirect_buffer, uint32_t num_total_seq_length_tile, uint32_t num_present_sequence_length_tile, uint32_t tile_size, bool use_indirect_dispatch, uint32_t present_sequence_length) { const float alpha = parameters.scale_ == 0.0f ? 
1.f / sqrt(static_cast(parameters.head_size_)) : parameters.scale_; const bool has_attention_bias = attention_bias != nullptr; const int components = 4; - FlashAttentionDecodeQKTProgram program{"FlashAttentionDecodeQKT", has_attention_bias, tile_size}; + FlashAttentionDecodeQKTProgram program{"FlashAttentionDecodeQKT", has_attention_bias, tile_size, use_indirect_dispatch}; program.AddInputs({{Q, ProgramTensorMetadataDependency::TypeAndRank, components}, {present_key, ProgramTensorMetadataDependency::TypeAndRank, components}}); + if (use_indirect_dispatch) { + program.AddInput({seqlen_k, ProgramTensorMetadataDependency::None}); + } if (has_attention_bias) { program.AddInput({attention_bias, ProgramTensorMetadataDependency::TypeAndRank}); } @@ -183,15 +229,18 @@ Status ComputeFlashAttentionDecodeQKT(onnxruntime::webgpu::ComputeContext& conte {metadata, ProgramTensorMetadataDependency::Rank, 2}}); const uint32_t vectorized_head_size = parameters.head_size_ / components; - program.SetDispatchGroupSize(parameters.num_heads_ * num_total_seq_length_tile) - .SetWorkgroupSize(64) - .CacheHint(tile_size, has_attention_bias) + if (use_indirect_dispatch) { + program.SetIndirectDispatchTensor(indirect_buffer); + } else { + program.SetDispatchGroupSize(parameters.num_heads_ * num_total_seq_length_tile); + } + program.SetWorkgroupSize(64) + .CacheHint(tile_size, has_attention_bias, use_indirect_dispatch) .AddUniformVariables({{static_cast(vectorized_head_size)}, {static_cast(parameters.total_sequence_length_)}, {static_cast(alpha)}, present_sequence_length, {static_cast(parameters.n_reps)}, - {num_total_seq_length_tile}, {num_present_sequence_length_tile}, {static_cast(parameters.num_heads_)}}); @@ -202,6 +251,9 @@ Status FlashAttentionDecodeSplitVxProgram::GenerateShaderCode(ShaderHelper& shad shader.AddInput("metadata", ShaderUsage::UseUniform); shader.AddInput("qk", ShaderUsage::UseUniform | ShaderUsage::UseValueTypeAlias); shader.AddInput("present_value", ShaderUsage::UseUniform | ShaderUsage::UseValueTypeAlias | ShaderUsage::UseElementTypeAlias); + if (use_indirect_dispatch_) { + shader.AddInput("seqlens_k", ShaderUsage::None); + } shader.AddOutput("out_split_vx", ShaderUsage::UseUniform); const uint32_t tile_size_k_vec = 8u; @@ -210,7 +262,8 @@ Status FlashAttentionDecodeSplitVxProgram::GenerateShaderCode(ShaderHelper& shad WGSL_TEMPLATE_PARAMETER(head_size_vec, head_size_vec_), WGSL_TEMPLATE_PARAMETER(sub_tile_count, WorkgroupSizeX() / tile_size_k_vec), WGSL_TEMPLATE_PARAMETER(tile_size, tile_size_), - WGSL_TEMPLATE_PARAMETER(tile_size_k_vec, tile_size_k_vec)); + WGSL_TEMPLATE_PARAMETER(tile_size_k_vec, tile_size_k_vec), + WGSL_TEMPLATE_PARAMETER(use_indirect_dispatch, use_indirect_dispatch_)); } Status ComputeFlashAttentionDecodeSplitVxScore(onnxruntime::webgpu::ComputeContext& context, @@ -218,26 +271,33 @@ Status ComputeFlashAttentionDecodeSplitVxScore(onnxruntime::webgpu::ComputeConte const Tensor* qk, Tensor* out_split_vx, Tensor* present_value, + const Tensor* seqlen_k, const WebgpuAttentionParameters& parameters, + const Tensor* indirect_buffer, uint32_t num_total_seq_length_tile, uint32_t num_present_sequence_length_tile, uint32_t tile_size, + bool use_indirect_dispatch, uint32_t present_sequence_length) { const int components = 4; int head_size_vec = parameters.v_head_size_ / components; - FlashAttentionDecodeSplitVxProgram program{"FlashAttentionDecodeSplitVx", tile_size, head_size_vec}; + FlashAttentionDecodeSplitVxProgram program{"FlashAttentionDecodeSplitVx", tile_size, head_size_vec, 
use_indirect_dispatch}; program.AddInputs({{metadata, ProgramTensorMetadataDependency::TypeAndRank, 2}, {qk, ProgramTensorMetadataDependency::TypeAndRank}, {present_value, ProgramTensorMetadataDependency::TypeAndRank, components}}); program.AddOutputs({{out_split_vx, ProgramTensorMetadataDependency::TypeAndRank, components}}); // [B, N, split_k, head_size] - program.SetDispatchGroupSize(parameters.num_heads_ * num_total_seq_length_tile) - .CacheHint(tile_size, head_size_vec) + if (use_indirect_dispatch) { + program.AddInput({seqlen_k, ProgramTensorMetadataDependency::None}) + .SetIndirectDispatchTensor(indirect_buffer); + } else { + program.SetDispatchGroupSize(parameters.num_heads_ * num_total_seq_length_tile); + } + program.CacheHint(tile_size, head_size_vec, use_indirect_dispatch) .SetWorkgroupSize(64) .AddUniformVariables({{static_cast(parameters.total_sequence_length_)}, {static_cast(head_size_vec)}, present_sequence_length, {static_cast(parameters.n_reps)}, - num_total_seq_length_tile, num_present_sequence_length_tile, {static_cast(parameters.num_heads_)}}); @@ -246,27 +306,38 @@ Status ComputeFlashAttentionDecodeSplitVxScore(onnxruntime::webgpu::ComputeConte Status FlashAttentionDecodeVxReduceProgram::GenerateShaderCode(ShaderHelper& shader) const { shader.AddInput("input", ShaderUsage::UseUniform); + if (use_indirect_dispatch_) { + shader.AddInput("seqlens_k", ShaderUsage::None); + } shader.AddOutput("output", ShaderUsage::UseUniform | ShaderUsage::UseValueTypeAlias); return WGSL_TEMPLATE_APPLY(shader, "bert/flash_attention_decode_vx_reduce.wgsl.template", - WGSL_TEMPLATE_PARAMETER(tile_size, tile_size_)); + WGSL_TEMPLATE_PARAMETER(seq_tile_size, seq_tile_size_), + WGSL_TEMPLATE_PARAMETER(tile_size, tile_size_), + WGSL_TEMPLATE_PARAMETER(use_indirect_dispatch, use_indirect_dispatch_)); } Status ComputeFlashAttentionDecodeVxReduce(onnxruntime::webgpu::ComputeContext& context, const Tensor* out_split_vx, Tensor* output, + const Tensor* seqlen_k, const WebgpuAttentionParameters& parameters, uint32_t num_total_seq_length_tile, - uint32_t num_present_sequence_length_tile) { + uint32_t num_present_sequence_length_tile, + uint32_t seq_tile_size, + bool use_indirect_dispatch) { const int components = 4; constexpr int tile_size = 8; int tile_head_size = tile_size * components; - FlashAttentionDecodeVxReduceProgram program{"FlashAttentionDecodeVxReduce", tile_size}; + FlashAttentionDecodeVxReduceProgram program{"FlashAttentionDecodeVxReduce", tile_size, seq_tile_size, use_indirect_dispatch}; program.AddInputs({{out_split_vx, ProgramTensorMetadataDependency::TypeAndRank, components}}); + if (use_indirect_dispatch) { + program.AddInput({seqlen_k, ProgramTensorMetadataDependency::None}); + } program.AddOutputs({{output, ProgramTensorMetadataDependency::TypeAndRank, components}}); const uint32_t num_head_size_tile = static_cast((parameters.v_head_size_ + tile_head_size - 1) / tile_head_size); program.SetDispatchGroupSize(parameters.num_heads_ * num_head_size_tile) - .CacheHint(tile_size) + .CacheHint(tile_size, seq_tile_size, use_indirect_dispatch) .SetWorkgroupSize(tile_size * tile_size) .AddUniformVariables({{static_cast(parameters.v_head_size_ / components)}, num_total_seq_length_tile, @@ -279,14 +350,15 @@ Status ComputeFlashAttentionDecodeVxReduce(onnxruntime::webgpu::ComputeContext& Status ApplyFlashAttention(const Tensor* Q, const Tensor* K, const Tensor* V, const Tensor* attention_bias, Tensor* output, const Tensor* past_key, Tensor* present_key, const Tensor* past_value, Tensor* 
present_value, - const WebgpuAttentionParameters& parameters, onnxruntime::webgpu::ComputeContext& context) { - ORT_RETURN_IF_ERROR(CopyKVCache(context, parameters, K, past_key, present_key, V, past_value, present_value)); - + const WebgpuAttentionParameters& parameters, onnxruntime::webgpu::ComputeContext& context, const Tensor* seqlen_k) { // Extract present_sequence_length directly from present_key tensor shape: // (batch_size, num_heads, total_sequence_length/max_sequence_length, head_size) const uint32_t present_sequence_length = static_cast(present_key->Shape()[2]); + if (parameters.sequence_length_ > 1) { const uint32_t tile_size = 64; + // For encode path, use the original CopyKVCache without indirect dispatch preparation + ORT_RETURN_IF_ERROR(CopyKVCache(context, parameters, K, past_key, present_key, V, past_value, present_value, tile_size, seqlen_k, nullptr)); bool has_attention_bias = attention_bias != nullptr; bool is_qualcomm = context.AdapterInfo().vendor == std::string_view{"qualcomm"}; bool is_nvidia = context.AdapterInfo().vendor == std::string_view{"nvidia"}; @@ -323,7 +395,7 @@ Status ApplyFlashAttention(const Tensor* Q, const Tensor* K, const Tensor* V, co return context.RunProgram(program); } - // Use present_sequence_length instead of total_sequence_length to make sure the |qk| buffer is static when static qv cache is enabled. + // For decode path (sequence_length == 1) const TensorShapeVector qk_dims({parameters.batch_size_, parameters.num_heads_, parameters.sequence_length_, present_sequence_length}); const TensorShape qk_shape(qk_dims); @@ -331,21 +403,48 @@ Status ApplyFlashAttention(const Tensor* Q, const Tensor* K, const Tensor* V, co constexpr uint32_t tile_size = 64; const uint32_t num_total_seq_length_tile = (parameters.total_sequence_length_ + tile_size - 1) / tile_size; const uint32_t num_present_sequence_length_tile = (present_sequence_length + tile_size - 1) / tile_size; + + // Determine if we should use indirect dispatch + const bool use_indirect_dispatch = parameters.past_present_share_buffer_ && + seqlen_k != nullptr && + context.IsGraphCaptureEnabled(); + + // Create indirect dispatch buffer if using indirect dispatch + Tensor* indirect_buffer_ptr = nullptr; + Tensor indirect_buffer; + if (use_indirect_dispatch) { + const TensorShape indirect_buffer_shape{3}; // 3 uint32 values for dispatch dimensions + indirect_buffer = context.CreateGPUTensor(DataTypeImpl::GetType(), indirect_buffer_shape); + indirect_buffer_ptr = &indirect_buffer; + // Use the fused CopyKVCache that also prepares the indirect dispatch buffer + ORT_RETURN_IF_ERROR(CopyKVCache(context, parameters, K, past_key, present_key, V, past_value, present_value, tile_size, seqlen_k, indirect_buffer_ptr)); + } else { + // Use the original CopyKVCache without indirect dispatch preparation + ORT_RETURN_IF_ERROR(CopyKVCache(context, parameters, K, past_key, present_key, V, past_value, present_value, tile_size, seqlen_k, nullptr)); + } + // The metadata is used to store the max and sum of each tile. 
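The decode path sizes its grid with a ceiling division over a fixed tile of 64 sequence rows, and when indirect dispatch is used the same quantity is derived on the GPU from seqlens_k and written into the 3 x uint32 indirect buffer created above. A minimal standalone sketch of that tile math and dispatch-argument layout, using hypothetical names (kTileSize, MakeDispatchArgs) rather than ONNX Runtime's:

    #include <array>
    #include <cstdint>

    constexpr uint32_t kTileSize = 64;  // matches the tile_size used by the decode programs

    // Ceiling division used for num_total_seq_length_tile / num_present_sequence_length_tile.
    constexpr uint32_t CeilDiv(uint32_t a, uint32_t b) { return (a + b - 1) / b; }

    // What a CopyKVCache-style pass conceptually writes so that an indirect dispatch
    // launches num_heads * num_total_seq_length_tile workgroups along x (y = z = 1).
    std::array<uint32_t, 3> MakeDispatchArgs(uint32_t total_sequence_length, uint32_t num_heads) {
      const uint32_t num_total_seq_length_tile = CeilDiv(total_sequence_length, kTileSize);
      return {num_heads * num_total_seq_length_tile, 1u, 1u};
    }

Computing the workgroup count on the GPU is what lets a captured graph stay valid as the KV cache grows, which appears to be why indirect dispatch is gated on past_present_share_buffer_, seqlen_k, and IsGraphCaptureEnabled().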
const TensorShapeVector metadata_dims({parameters.batch_size_, parameters.num_heads_, num_present_sequence_length_tile, 2}); const TensorShape metadata_shape(metadata_dims); Tensor metadata = context.CreateGPUTensor(DataTypeImpl::GetType(), metadata_shape); - ORT_RETURN_IF_ERROR(ComputeFlashAttentionDecodeQKT(context, Q, attention_bias, &qk, present_key, &metadata, - parameters, num_total_seq_length_tile, num_present_sequence_length_tile, tile_size, + ORT_RETURN_IF_ERROR(ComputeFlashAttentionDecodeQKT(context, Q, attention_bias, &qk, present_key, &metadata, seqlen_k, + parameters, indirect_buffer_ptr, num_total_seq_length_tile, + num_present_sequence_length_tile, tile_size, use_indirect_dispatch, present_sequence_length)); - const TensorShapeVector out_split_vx_dims({parameters.batch_size_, parameters.num_heads_, num_present_sequence_length_tile, parameters.head_size_}); + const TensorShapeVector out_split_vx_dims({parameters.batch_size_, parameters.num_heads_, + num_present_sequence_length_tile, parameters.head_size_}); const TensorShape out_split_vx_shape(out_split_vx_dims); Tensor out_split_vx = context.CreateGPUTensor(Q->DataType(), out_split_vx_shape); - ORT_RETURN_IF_ERROR(ComputeFlashAttentionDecodeSplitVxScore(context, &metadata, &qk, &out_split_vx, present_value, parameters, - num_total_seq_length_tile, num_present_sequence_length_tile, tile_size, present_sequence_length)); - ORT_RETURN_IF_ERROR(ComputeFlashAttentionDecodeVxReduce(context, &out_split_vx, output, parameters, num_total_seq_length_tile, num_present_sequence_length_tile)); + ORT_RETURN_IF_ERROR(ComputeFlashAttentionDecodeSplitVxScore(context, &metadata, &qk, &out_split_vx, present_value, + seqlen_k, parameters, indirect_buffer_ptr, + num_total_seq_length_tile, + num_present_sequence_length_tile, tile_size, + use_indirect_dispatch, present_sequence_length)); + ORT_RETURN_IF_ERROR(ComputeFlashAttentionDecodeVxReduce(context, &out_split_vx, output, seqlen_k, parameters, + num_total_seq_length_tile, + num_present_sequence_length_tile, tile_size, use_indirect_dispatch)); return Status::OK(); } diff --git a/onnxruntime/contrib_ops/webgpu/bert/flash_attention.h b/onnxruntime/contrib_ops/webgpu/bert/flash_attention.h index c75494df253c1..7d71dc0f4d42d 100644 --- a/onnxruntime/contrib_ops/webgpu/bert/flash_attention.h +++ b/onnxruntime/contrib_ops/webgpu/bert/flash_attention.h @@ -17,19 +17,24 @@ using namespace onnxruntime::webgpu; class CopyKVCacheProgram final : public Program { public: - CopyKVCacheProgram(const std::string& kernel_name, bool has_past, bool kv_BNSH, bool past_present_share_buffer) - : Program{kernel_name}, has_past_(has_past), kv_BNSH_(kv_BNSH), past_present_share_buffer_(past_present_share_buffer) { + CopyKVCacheProgram(const std::string& kernel_name, bool has_past, bool kv_BNSH, bool past_present_share_buffer, + bool prepare_indirect_dispatch = false) + : Program{kernel_name}, has_past_(has_past), kv_BNSH_(kv_BNSH), past_present_share_buffer_(past_present_share_buffer), prepare_indirect_dispatch_(prepare_indirect_dispatch) { } Status GenerateShaderCode(ShaderHelper& sh) const override; WEBGPU_PROGRAM_DEFINE_UNIFORM_VARIABLES({"copy_size", ProgramUniformVariableDataType::Uint32}, - {"past_sequence_length", ProgramUniformVariableDataType::Uint32}); + {"total_sequence_length", ProgramUniformVariableDataType::Uint32}, + {"kv_sequence_length", ProgramUniformVariableDataType::Uint32}, + {"tile_size", ProgramUniformVariableDataType::Uint32}, + {"num_heads", ProgramUniformVariableDataType::Uint32}); private: 
bool has_past_; bool kv_BNSH_; bool past_present_share_buffer_; + bool prepare_indirect_dispatch_; }; class FlashAttentionProgram final : public Program { @@ -75,8 +80,8 @@ class FlashAttentionProgram final : public Program { class FlashAttentionDecodeQKTProgram final : public Program { public: FlashAttentionDecodeQKTProgram(const std::string& kernel_name, - bool has_attention_bias, uint32_t tile_size) - : Program{kernel_name}, has_attention_bias_(has_attention_bias), tile_size_(tile_size) { + bool has_attention_bias, uint32_t tile_size, bool use_indirect_dispatch) + : Program{kernel_name}, has_attention_bias_(has_attention_bias), tile_size_(tile_size), use_indirect_dispatch_(use_indirect_dispatch) { } Status GenerateShaderCode(ShaderHelper& sh) const override; @@ -86,19 +91,19 @@ class FlashAttentionDecodeQKTProgram final : public Program { public: - FlashAttentionDecodeSplitVxProgram(const std::string& kernel_name, uint32_t tile_size, int head_size_vec) - : Program{kernel_name}, tile_size_(tile_size), head_size_vec_(head_size_vec) { + FlashAttentionDecodeSplitVxProgram(const std::string& kernel_name, uint32_t tile_size, int head_size_vec, bool use_indirect_dispatch) + : Program{kernel_name}, tile_size_(tile_size), head_size_vec_(head_size_vec), use_indirect_dispatch_(use_indirect_dispatch) { } Status GenerateShaderCode(ShaderHelper& sh) const override; @@ -107,19 +112,19 @@ class FlashAttentionDecodeSplitVxProgram final : public Program { public: - FlashAttentionDecodeVxReduceProgram(const std::string& kernel_name, uint32_t tile_size) - : Program{kernel_name}, tile_size_(tile_size) { + FlashAttentionDecodeVxReduceProgram(const std::string& kernel_name, uint32_t tile_size, uint32_t seq_tile_size, bool use_indirect_dispatch) + : Program{kernel_name}, tile_size_(tile_size), seq_tile_size_(seq_tile_size), use_indirect_dispatch_(use_indirect_dispatch) { } Status GenerateShaderCode(ShaderHelper& sh) const override; @@ -132,11 +137,13 @@ class FlashAttentionDecodeVxReduceProgram final : public Program tile_qk: array; $MAIN { let local_row = u32(local_idx / tile_size_k_vec); let local_col = local_idx % tile_size_k_vec; - let total_seq_offset = (workgroup_idx % uniforms.num_total_seq_length_tile) * tile_size; - let head_idx = u32(workgroup_idx / uniforms.num_total_seq_length_tile); +#if use_indirect_dispatch + let total_sequence_length = u32(seqlens_k[0]) + 1u; +#else + let total_sequence_length = uniforms.total_sequence_length; +#endif + let num_total_seq_length_tile = (total_sequence_length + tile_size - 1) / tile_size; + let total_seq_offset = (workgroup_idx % num_total_seq_length_tile) * tile_size; + let head_idx = u32(workgroup_idx / num_total_seq_length_tile); let q_offset = head_idx * uniforms.head_size_vec; - var total_sequence_length = uniforms.total_sequence_length; let present_offset = u32(head_idx / uniforms.n_reps) * uniforms.present_sequence_length * uniforms.head_size_vec; for (var k: u32 = 0u; k < uniforms.head_size_vec; k += tile_size_k_vec) { if (local_idx < tile_size_k_vec && k + local_idx < uniforms.head_size_vec) { @@ -95,7 +101,7 @@ $MAIN { for (var i = 0u; i < tile_size && (total_seq_offset + i) < total_sequence_length; i++) { l_sum += exp(f32(tile_qk[i]) - l_max); } - let meta_offset = head_idx * uniforms.num_present_sequence_length_tile + workgroup_idx % uniforms.num_total_seq_length_tile; + let meta_offset = head_idx * uniforms.num_present_sequence_length_tile + workgroup_idx % num_total_seq_length_tile; metadata[meta_offset] = metadata_value_t(l_max, l_sum); } } diff 
--git a/onnxruntime/contrib_ops/webgpu/bert/flash_attention_decode_split_vx.wgsl.template b/onnxruntime/contrib_ops/webgpu/bert/flash_attention_decode_split_vx.wgsl.template index c7593af311ce2..37cf7e8f11b1f 100644 --- a/onnxruntime/contrib_ops/webgpu/bert/flash_attention_decode_split_vx.wgsl.template +++ b/onnxruntime/contrib_ops/webgpu/bert/flash_attention_decode_split_vx.wgsl.template @@ -5,6 +5,7 @@ #param head_size_vec #param tile_size_k_vec #param sub_tile_count +#param use_indirect_dispatch // Note that this shader adopts similar algorithm with dp4a generation shader. // @@ -40,9 +41,14 @@ var qkv_values: array, $MAIN { let local_row = u32(local_idx / tile_size_k_vec); let local_col = local_idx % tile_size_k_vec; - let total_seq_offset = (workgroup_idx % uniforms.num_total_seq_length_tile) * tile_size; - let head_idx = u32(workgroup_idx / uniforms.num_total_seq_length_tile); - var total_sequence_length = uniforms.total_sequence_length; + #if use_indirect_dispatch + let total_sequence_length = u32(seqlens_k[0]) + 1u; + #else + let total_sequence_length = uniforms.total_sequence_length; + #endif + let num_total_seq_length_tile = (total_sequence_length + tile_size - 1) / tile_size; + let total_seq_offset = (workgroup_idx % num_total_seq_length_tile) * tile_size; + let head_idx = u32(workgroup_idx / num_total_seq_length_tile); let present_offset = u32(head_idx / uniforms.n_reps) * head_size_vec * uniforms.present_sequence_length; // Calculate the global max and sum in qk. @@ -50,12 +56,12 @@ $MAIN { { var g_max = f32(-3.402823e+38f); var g_sum = f32(0); - for (var i = 0u; i < uniforms.num_total_seq_length_tile; i++) + for (var i = 0u; i < num_total_seq_length_tile; i++) { let meta_offset = head_idx * uniforms.num_present_sequence_length_tile + i; g_max = max(g_max, metadata[meta_offset].x); } - for (var i = 0u; i < uniforms.num_total_seq_length_tile; i++) + for (var i = 0u; i < num_total_seq_length_tile; i++) { let meta_offset = head_idx * uniforms.num_present_sequence_length_tile + i; let m_value = metadata[meta_offset]; @@ -95,7 +101,7 @@ $MAIN { } for (var i = local_idx; i < head_size_vec; i += workgroup_size_x) { - let out_offset = head_idx * uniforms.num_present_sequence_length_tile * head_size_vec + (workgroup_idx % uniforms.num_total_seq_length_tile) * head_size_vec + i; + let out_offset = head_idx * uniforms.num_present_sequence_length_tile * head_size_vec + (workgroup_idx % num_total_seq_length_tile) * head_size_vec + i; out_split_vx[out_offset] = tile_output[i]; } } diff --git a/onnxruntime/contrib_ops/webgpu/bert/flash_attention_decode_vx_reduce.wgsl.template b/onnxruntime/contrib_ops/webgpu/bert/flash_attention_decode_vx_reduce.wgsl.template index a4381baa638ce..22f18655307de 100644 --- a/onnxruntime/contrib_ops/webgpu/bert/flash_attention_decode_vx_reduce.wgsl.template +++ b/onnxruntime/contrib_ops/webgpu/bert/flash_attention_decode_vx_reduce.wgsl.template @@ -1,7 +1,9 @@ // Copyright (c) Microsoft Corporation. All rights reserved. // Licensed under the MIT License. +#param seq_tile_size #param tile_size +#param use_indirect_dispatch // Inputs are splits of the GQA output, split into num_total_seq_length_tiles // rows. 
This shader needs to add these splits across the row dimension to @@ -23,10 +25,16 @@ $MAIN { var value = output_value_t(0); let local_row = u32(local_idx / tile_size); let local_col = local_idx % tile_size; + #if use_indirect_dispatch + let total_sequence_length = u32(seqlens_k[0]) + 1u; + let num_total_seq_length_tile = (total_sequence_length + seq_tile_size - 1) / seq_tile_size; + #else + let num_total_seq_length_tile = uniforms.num_total_seq_length_tile; + #endif if (head_size_offset + local_col < uniforms.head_size_vec) { - for (var r = 0u; r < uniforms.num_total_seq_length_tile; r += tile_size) { - if (r + local_row < uniforms.num_total_seq_length_tile) { + for (var r = 0u; r < num_total_seq_length_tile; r += tile_size) { + if (r + local_row < num_total_seq_length_tile) { value += input[in_offset + (r + local_row) * uniforms.head_size_vec + head_size_offset + local_col]; } } diff --git a/onnxruntime/contrib_ops/webgpu/bert/group_query_attention.cc b/onnxruntime/contrib_ops/webgpu/bert/group_query_attention.cc index 8b7b257dd2852..cb845061404f3 100644 --- a/onnxruntime/contrib_ops/webgpu/bert/group_query_attention.cc +++ b/onnxruntime/contrib_ops/webgpu/bert/group_query_attention.cc @@ -206,7 +206,7 @@ Status GroupQueryAttention::ComputeInternal(onnxruntime::webgpu::ComputeContext& !use_sliding_window && CanApplyFlashAttention(attention_bias, present_key, present_value, parameters, context)) { return ApplyFlashAttention(query, key, value, attention_bias, output, past_key, present_key, past_value, - present_value, parameters, context); + present_value, parameters, context, seqlen_k); } Tensor qSplit; diff --git a/onnxruntime/core/graph/graph.cc b/onnxruntime/core/graph/graph.cc index 3f6443aa73d4c..8b599dc86d997 100644 --- a/onnxruntime/core/graph/graph.cc +++ b/onnxruntime/core/graph/graph.cc @@ -2678,6 +2678,27 @@ class InferenceContextImpl : public ONNX_NAMESPACE::InferenceContext { // only return data if it's for a constant initializer. checks for outer scope initializers // if this is a subgraph and the name isn't found locally. 
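ONNX shape inference reads tensor data straight out of a TensorProto, so an initializer whose bytes live only in an OrtValue (in-memory external data) has to be copied back into a self-contained proto before inference can consume it, which is what the materialization path below does. A small illustrative sketch of packing a buffer into an inline TensorProto; this is a stand-in, not ORT's utils::TensorToTensorProto:

    #include <cstdint>
    #include <string>
    #include <vector>
    #include "onnx/onnx_pb.h"

    // Build a TensorProto whose data is held inline in raw_data, leaving no
    // external-data reference for shape inference to trip over.
    ONNX_NAMESPACE::TensorProto MaterializeInt64Tensor(const std::string& name,
                                                       const std::vector<int64_t>& values) {
      ONNX_NAMESPACE::TensorProto proto;
      proto.set_name(name);
      proto.set_data_type(ONNX_NAMESPACE::TensorProto_DataType_INT64);
      proto.add_dims(static_cast<int64_t>(values.size()));
      proto.set_raw_data(values.data(), values.size() * sizeof(int64_t));
      return proto;
    }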
const TensorProto* initializer = graph_.GetConstantInitializer(def->Name(), true); + if (initializer != nullptr) { + // Check if this is in-memory external data (data stored in OrtValue) + // ONNX shape inference cannot handle external data, so we need to materialize it + if (utils::HasExternalDataInMemory(*initializer)) { + // Try to get the OrtValue for this initializer + OrtValue ort_value; + if (graph_.GetOrtValueInitializer(def->Name(), ort_value, true)) { + // Create a temporary TensorProto with the actual data from the OrtValue + // This allows ONNX shape inference to access the data + const Tensor& tensor = ort_value.Get(); + auto temp_tensor_proto = utils::TensorToTensorProto(tensor, initializer->name(), /*use_tensor_buffer=*/false); + // Store the temporary proto so it outlives this call, keeping the pointers stable + temp_tensor_protos_.push_back(std::make_unique(std::move(temp_tensor_proto))); + return temp_tensor_protos_.back().get(); + } else { + // If we can't get the OrtValue, it is a bug + ORT_THROW("Initializer ", def->Name(), + " has in-memory external data but cannot get OrtValue during shape inference"); + } + } + } return initializer; } @@ -2717,6 +2738,11 @@ class InferenceContextImpl : public ONNX_NAMESPACE::InferenceContext { std::vector> graph_inferencers_; const Graph& graph_; const Graph::ResolveOptions& options_; + // Temporary TensorProtos created for in-memory external data during shape inference + // These need to outlive the shape inference call, so we store them here + // Inference is per node and the instance of this context is on the stack, + // so this is safe. + mutable InlinedVector> temp_tensor_protos_; }; Status Graph::InferAndVerifySubgraphTypes(const Node& node, Graph& subgraph, diff --git a/onnxruntime/core/mlas/lib/kleidiai/mlasi_kleidiai.h b/onnxruntime/core/mlas/lib/kleidiai/mlasi_kleidiai.h index 5136061c4769d..2e9c4574fd057 100644 --- a/onnxruntime/core/mlas/lib/kleidiai/mlasi_kleidiai.h +++ b/onnxruntime/core/mlas/lib/kleidiai/mlasi_kleidiai.h @@ -115,3 +115,37 @@ MlasConv( MLAS_THREADPOOL* ThreadPool ); } + +/*++ + +Routine Description: + + This routine determines if a wraparound will occur when multiplying two size_t variables. + Uses __builtin_mul_overflow if available on the current system and, if not, falls back + to a default implementation to check this wraparound. + +Arguments: + + a - Supplies the first number to be multiplied. + + b - Supplies the second number to be multiplied. + + out - Supplies a pointer to a size_t which receives the product on success. + +Return Value: + + Returns false if the operation was successful. + Returns true if wraparound of size_t was detected. + +--*/ +inline bool mul_overflow_size_t_builtin(size_t a, size_t b, size_t* out) { +#if defined(__has_builtin) +# if __has_builtin(__builtin_mul_overflow) + return __builtin_mul_overflow(a, b, out); +# endif +#endif + // Fall back to a manual check if the builtin is not available + if (b != 0 && a > SIZE_MAX / b) return true; + if (out) *out = a * b; + return false; +} diff --git a/onnxruntime/core/mlas/lib/kleidiai/sgemm_kleidiai.cpp b/onnxruntime/core/mlas/lib/kleidiai/sgemm_kleidiai.cpp index ea38f16205a7c..435ff1fb10017 100644 --- a/onnxruntime/core/mlas/lib/kleidiai/sgemm_kleidiai.cpp +++ b/onnxruntime/core/mlas/lib/kleidiai/sgemm_kleidiai.cpp @@ -14,6 +14,16 @@ #include "kai/ukernels/matmul/pack/kai_rhs_pack_nxk_f32p2vlx1biasf32_f32_f32_sme.h" #include "mlasi_kleidiai.h" + +// Thread-local reusable buffers to reduce allocation overhead across tiles.
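A short usage sketch for the overflow-checked multiply defined above. The helper is header-only, so the example below just exercises the same manual fallback logic in a standalone program (illustrative only):

    #include <cstddef>
    #include <cstdint>
    #include <cstdio>

    // Same detection logic as the manual fallback in mul_overflow_size_t_builtin.
    static bool checked_mul(std::size_t a, std::size_t b, std::size_t* out) {
      if (b != 0 && a > SIZE_MAX / b) return true;  // wraparound would occur
      if (out) *out = a * b;
      return false;
    }

    int main() {
      std::size_t bytes = 0;
      if (checked_mul(SIZE_MAX / 2 + 1, 2, &bytes)) {
        std::puts("overflow detected: fall back to the unpacked MLAS path");
      } else {
        std::printf("allocation size: %zu bytes\n", bytes);
      }
      return 0;
    }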
+struct KaiTlsBuffers { + std::vector output_tile; + std::vector bias_zero; + std::vector rhs_packed; + std::vector lhs_packed; +}; +static thread_local KaiTlsBuffers g_kai_tls; + size_t MLASCALL ArmKleidiAI::MlasGemmPackBSize( @@ -51,7 +61,6 @@ Return Value: // Compute the number of bytes required to hold the packed buffer. // size_t bytes = 0; - if (TransA == CblasNoTrans) { switch (TransB) { case CblasNoTrans: @@ -125,15 +134,15 @@ Return Value: const size_t sr = UseSME2 ? kai_get_sr_matmul_clamp_f32_f32p2vlx1_f32p2vlx1biasf32_sme2_mopa() : kai_get_sr_matmul_clamp_f32_f32p2vlx1_f32p2vlx1b_2vlx2vl_sme_mopa(); - // pass zeroed bias values - const std::vector bias(N); + // Ensure size and zero the used span. + g_kai_tls.bias_zero.resize(N, 0.0f); switch (TransB) { case CblasNoTrans: - kai_run_rhs_pack_kxn_f32p2vlx1biasf32_f32_f32_sme(1, N, K, nr, kr, sr, ldb * sizeof(float), B, bias.data(), nullptr, PackedB, 0, nullptr); + kai_run_rhs_pack_kxn_f32p2vlx1biasf32_f32_f32_sme(1, N, K, nr, kr, sr, ldb * sizeof(float), B, g_kai_tls.bias_zero.data(), nullptr, PackedB, 0, nullptr); break; case CblasTrans: - kai_run_rhs_pack_nxk_f32p2vlx1biasf32_f32_f32_sme(1, N, K, nr, kr, sr, ldb * sizeof(float), B, bias.data(), nullptr, PackedB, 0, nullptr); + kai_run_rhs_pack_nxk_f32p2vlx1biasf32_f32_f32_sme(1, N, K, nr, kr, sr, ldb * sizeof(float), B, g_kai_tls.bias_zero.data(), nullptr, PackedB, 0, nullptr); break; default: return false; @@ -225,22 +234,29 @@ Return Value: size_t n_step = UseSME2 ? kai_get_n_step_matmul_clamp_f32_f32p2vlx1_f32p2vlx1biasf32_sme2_mopa() : kai_get_n_step_matmul_clamp_f32_f32p2vlx1_f32p2vlx1b_2vlx2vl_sme_mopa(); - if (M < m_step && N < n_step && !Data->BIsPacked) { + if ((M < m_step || N < n_step) && !Data->BIsPacked) { // Fallback to MLAS return false; } - std::vector KaiPackedData; - KaiPackedData.resize(BatchSize); - size_t LhsPackedStride = 0; std::byte* LhsPackedData = nullptr; LhsPackedStride = kai_get_lhs_packed_size_lhs_pack_f32p2vlx1_f32_sme(M, K, mr, kr, sr); - auto LhsPacked = std::make_unique(LhsPackedStride * BatchSize); - LhsPackedData = LhsPacked.get(); - std::unique_ptr RhsPacked{nullptr}; + size_t lhs_resize = 0; + if(mul_overflow_size_t_builtin(LhsPackedStride, BatchSize, &lhs_resize)) + { + // size_t wraparound detected for LhsPackedStride, fallback to MLAS + return false; + } + + g_kai_tls.lhs_packed.resize(lhs_resize); + LhsPackedData = g_kai_tls.lhs_packed.data(); + + // RHS packed buffer: use TLS reusable vector to minimize allocations + size_t RhsPackedStride = 0; + std::byte* RhsPackedData = nullptr; // It is assumed all B batches require packing or not if (Data[0].BIsPacked) { @@ -248,36 +264,31 @@ Return Value: MlasTrySimpleParallel(ThreadPool, BatchSize, [&](ptrdiff_t batch_idx) { std::byte* LhsPackedPtr = &(LhsPackedData[LhsPackedStride * batch_idx]); kai_run_lhs_pack_f32p2vlx1_f32_sme(M, K, mr, kr, sr, 0, Data[batch_idx].A, Data[batch_idx].lda * sizeof(float), LhsPackedPtr); - KaiPackedData[batch_idx].A = reinterpret_cast(LhsPackedPtr); - KaiPackedData[batch_idx].B = Data[batch_idx].B; }); } else { // Multithread pack lhs and rhs - size_t RhsPackedStride = 0; - std::byte* RhsPackedData = nullptr; - RhsPackedStride = ArmKleidiAI::MlasGemmPackBSize(TransA, TransB, N, K); - RhsPacked = std::make_unique(RhsPackedStride * BatchSize); - RhsPackedData = RhsPacked.get(); + size_t rhs_resize = 0; + if (mul_overflow_size_t_builtin(RhsPackedStride, BatchSize, &rhs_resize)) + { + // size_t wraparound detected for RhsPackedStride, fallback to MLAS + return false; 
+ } + + g_kai_tls.rhs_packed.resize(rhs_resize); + RhsPackedData = g_kai_tls.rhs_packed.data(); MlasTrySimpleParallel(ThreadPool, BatchSize * 2, [&](ptrdiff_t batch_idx) { - // lhs odd, rhs even if (batch_idx & 0x1) { batch_idx >>= 1; - std::byte* LhsPackedPtr = &(LhsPackedData[LhsPackedStride * batch_idx]); - kai_run_lhs_pack_f32p2vlx1_f32_sme(M, K, mr, kr, sr, 0, Data[batch_idx].A, Data[batch_idx].lda * sizeof(float), LhsPackedPtr); - - KaiPackedData[batch_idx].A = reinterpret_cast(LhsPackedPtr); } else { batch_idx >>= 1; - std::byte* RhsPackedPtr = &(RhsPackedData[RhsPackedStride * batch_idx]); - - ArmKleidiAI::MlasGemmPackB(TransA, TransB, N, K, reinterpret_cast(Data[batch_idx].B), Data[batch_idx].ldb, RhsPackedPtr); - - KaiPackedData[batch_idx].B = reinterpret_cast(RhsPackedPtr); + ArmKleidiAI::MlasGemmPackB(TransA, TransB, N, K, + reinterpret_cast(Data[batch_idx].B), + Data[batch_idx].ldb, RhsPackedPtr); } }); } @@ -303,6 +314,14 @@ Return Value: dim[1] = MlasDivRoundup(M, m_step); dim[2] = MlasDivRoundup(N, n_step); + // Pre-check maximum tile size to avoid per-iteration overflow inside the parallel loop. + // Any TileSizeM/TileSizeN used below will be <= m_step/n_step respectively. + size_t max_tile_elems = 0; + if (mul_overflow_size_t_builtin(m_step, n_step, &max_tile_elems)) { + // size_t wraparound detected for tile size, fallback to MLAS + return false; + } + MlasTrySimpleParallel(ThreadPool, static_cast(dim[0] * dim[1] * dim[2]), [=](ptrdiff_t tid) { // compute B,M,N index from iteration index ptrdiff_t BIdx = tid / (dim[1] * dim[2]); @@ -314,18 +333,18 @@ Return Value: UseSME2 ? kai_get_rhs_packed_offset_matmul_clamp_f32_f32p2vlx1_f32p2vlx1biasf32_sme2_mopa(NIdx * n_step, K) : kai_get_rhs_packed_offset_matmul_clamp_f32_f32p2vlx1_f32p2vlx1b_2vlx2vl_sme_mopa(NIdx * n_step, K); - auto BTile = reinterpret_cast( - reinterpret_cast(KaiPackedData[BIdx].B) + rhs_packed_offset - ); + const std::byte* B_base = Data[0].BIsPacked + ? reinterpret_cast(Data[BIdx].B) + : (RhsPackedData + RhsPackedStride * BIdx); + auto BTile = reinterpret_cast(B_base + rhs_packed_offset); // Get lhs tile, A const size_t lhs_packed_offset = UseSME2 ? kai_get_lhs_packed_offset_matmul_clamp_f32_f32p2vlx1_f32p2vlx1biasf32_sme2_mopa(MIdx * m_step, K) : kai_get_lhs_packed_offset_matmul_clamp_f32_f32p2vlx1_f32p2vlx1b_2vlx2vl_sme_mopa(MIdx * m_step, K); - auto ATile = reinterpret_cast( - reinterpret_cast(KaiPackedData[BIdx].A) + lhs_packed_offset - ); + const std::byte* A_base = LhsPackedData + LhsPackedStride * BIdx; + auto ATile = reinterpret_cast(A_base + lhs_packed_offset); auto TileSizeM = (MIdx + 1) * m_step > M ? (M - MIdx * m_step) : m_step; auto TileSizeN = (NIdx + 1) * n_step > N ? 
(N - NIdx * n_step) : n_step; @@ -336,9 +355,14 @@ Return Value: MIdx * m_step * Data[BIdx].ldc * sizeof(float) + NIdx * n_step * sizeof(float) ); - // Allocate temporary buffer for raw A*B result - std::vector OutputTile(TileSizeM * TileSizeN, 0.0f); - float* temp_tile = OutputTile.data(); + // Allocate temporary buffer for raw A*B result (TLS reusable buffer) + size_t tile_elems = TileSizeM * TileSizeN; + + // resize the tile to the required size + g_kai_tls.output_tile.resize(tile_elems); + + float* temp_tile = g_kai_tls.output_tile.data(); + std::fill_n(temp_tile, tile_elems, 0.0f); if (UseSME2) { kai_run_matmul_clamp_f32_f32p2vlx1_f32p2vlx1biasf32_sme2_mopa( diff --git a/onnxruntime/core/providers/migraphx/migraphx_execution_provider.cc b/onnxruntime/core/providers/migraphx/migraphx_execution_provider.cc index a59347841be95..55f901164bdac 100644 --- a/onnxruntime/core/providers/migraphx/migraphx_execution_provider.cc +++ b/onnxruntime/core/providers/migraphx/migraphx_execution_provider.cc @@ -268,6 +268,7 @@ static bool IsTypeSupported(const NodeArg* node_arg) { case ONNX_NAMESPACE::TensorProto_DataType::TensorProto_DataType_FLOAT16: case ONNX_NAMESPACE::TensorProto_DataType::TensorProto_DataType_BFLOAT16: case ONNX_NAMESPACE::TensorProto_DataType::TensorProto_DataType_FLOAT: + case ONNX_NAMESPACE::TensorProto_DataType::TensorProto_DataType_FLOAT4E2M1: case ONNX_NAMESPACE::TensorProto_DataType::TensorProto_DataType_FLOAT8E4M3FN: case ONNX_NAMESPACE::TensorProto_DataType::TensorProto_DataType_FLOAT8E4M3FNUZ: case ONNX_NAMESPACE::TensorProto_DataType::TensorProto_DataType_FLOAT8E5M2: @@ -318,6 +319,9 @@ static bool getMIGraphXType(ONNXTensorElementDataType type, case ONNX_TENSOR_ELEMENT_DATA_TYPE_FLOAT8E5M2FNUZ: mgx_type = migraphx_shape_fp8e5m2fnuz_type; break; + case ONNX_TENSOR_ELEMENT_DATA_TYPE_FLOAT4E2M1: + mgx_type = migraphx_shape_fp4x2_type; + break; case ONNX_TENSOR_ELEMENT_DATA_TYPE_INT4: mgx_type = migraphx_shape_int8_type; break; @@ -949,6 +953,8 @@ GetUnsupportedNodeIndices(const GraphViewer& graph_viewer, "QLinearAdd", "QLinearConv", "QLinearMatMul", + "QLinearAveragePool", + "QLinearGlobalAveragePool", "QuantizeLinear", "QuickGelu", "DynamicQuantizeLinear", diff --git a/onnxruntime/core/providers/tensorrt/tensorrt_execution_provider.cc b/onnxruntime/core/providers/tensorrt/tensorrt_execution_provider.cc index 508d932459bf9..cd0c0e4bffdb5 100644 --- a/onnxruntime/core/providers/tensorrt/tensorrt_execution_provider.cc +++ b/onnxruntime/core/providers/tensorrt/tensorrt_execution_provider.cc @@ -3976,6 +3976,10 @@ Status TensorrtExecutionProvider::CreateNodeComputeInfoFromGraph(const GraphView // Destroy the IExecutionContext objects before destroying an engine object, otherwise it will lead to undefined behavior. trt_state->context->reset(); trt_state->engine->reset(); + + // Clear dds output allocator map since the engine and context will be recreated. 
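One detail behind the reusable thread_local output tile above: std::vector::resize() value-initializes only newly created elements, so a scratch buffer that shrinks and grows across tiles can retain stale values from a previous tile unless the span in use is cleared, hence the explicit std::fill_n. Minimal illustration with a hypothetical AcquireZeroedTile helper:

    #include <algorithm>
    #include <cstddef>
    #include <vector>

    thread_local std::vector<float> scratch;

    // Returns a pointer to `elems` floats that are guaranteed to be zero.
    float* AcquireZeroedTile(std::size_t elems) {
      scratch.resize(elems);                      // does not re-zero elements that already existed
      std::fill_n(scratch.data(), elems, 0.0f);   // so the used span is cleared explicitly
      return scratch.data();
    }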
+ dds_output_allocator_map.clear(); + auto trt_config = std::unique_ptr(trt_builder->createBuilderConfig()); if (max_workspace_size_ > 0) { trt_config->setMemoryPoolLimit(nvinfer1::MemoryPoolType::kWORKSPACE, max_workspace_size_); diff --git a/onnxruntime/core/providers/webgpu/compute_context.h b/onnxruntime/core/providers/webgpu/compute_context.h index 315115390ff23..c4a88754deffe 100644 --- a/onnxruntime/core/providers/webgpu/compute_context.h +++ b/onnxruntime/core/providers/webgpu/compute_context.h @@ -8,6 +8,7 @@ #include #include "core/framework/execution_provider.h" +#include "core/providers/webgpu/webgpu_execution_provider.h" #include "core/providers/webgpu/program.h" #include "core/providers/webgpu/webgpu_context.h" @@ -16,7 +17,6 @@ namespace onnxruntime { class Tensor; -class WebGpuExecutionProvider; namespace webgpu { @@ -42,6 +42,9 @@ class ComputeContext { inline bool HasFeature(wgpu::FeatureName feature) const { return webgpu_context_.DeviceHasFeature(feature); } + inline bool IsGraphCaptureEnabled() const { + return ep_.IsGraphCaptureEnabled(); + } #if !defined(__wasm__) inline const wgpu::AdapterPropertiesSubgroupMatrixConfigs& SubgroupMatrixConfigs() const { return webgpu_context_.SubgroupMatrixConfigs(); diff --git a/onnxruntime/core/providers/webgpu/shader_variable.cc b/onnxruntime/core/providers/webgpu/shader_variable.cc index 5998c22a0d2ca..aa1f6c9a0ec0b 100644 --- a/onnxruntime/core/providers/webgpu/shader_variable.cc +++ b/onnxruntime/core/providers/webgpu/shader_variable.cc @@ -378,7 +378,7 @@ std::string ShaderVariableHelper::SetByOffsetImpl(std::string_view offset, std:: ORT_THROW("Invalid type"); break; case onnxruntime::webgpu::ProgramVariableDataType::Int64: - ss << name_ << "[" << offset << "]=vec2(u32(" << value << "), select(0u, 0xFFFFFFFFu, " << value << " < 0));"; + ss << name_ << "[" << offset << "]=vec2(u32(" << value << "), select(0u, 0xFFFFFFFFu, i32(" << value << ") < 0));"; break; case onnxruntime::webgpu::ProgramVariableDataType::Uint64: ss << name_ << "[" << offset << "]=vec2(u32(" << value << "), 0u);"; diff --git a/onnxruntime/core/providers/webgpu/tensor/cast.cc b/onnxruntime/core/providers/webgpu/tensor/cast.cc index 313a96ba25509..daf4aa323c12e 100644 --- a/onnxruntime/core/providers/webgpu/tensor/cast.cc +++ b/onnxruntime/core/providers/webgpu/tensor/cast.cc @@ -11,75 +11,29 @@ namespace onnxruntime { namespace webgpu { namespace { -const std::vector& CastOpTypeConstraints() { - // currently support boolean, integer and float types that explicitly allowed in WGSL: +const std::vector& CastOpTypeConstraints(bool enable_graph_capture) { + // Base types that are always supported - boolean, integer and float types that explicitly allowed in WGSL: // https://gpuweb.github.io/gpuweb/wgsl/#plain-types-section - // - static std::vector types{ + static std::vector base_types{ DataTypeImpl::GetTensorType(), DataTypeImpl::GetTensorType(), DataTypeImpl::GetTensorType(), DataTypeImpl::GetTensorType(), DataTypeImpl::GetTensorType()}; - return types; + + if (enable_graph_capture) { + static std::vector types_with_int64 = []() { + auto types = base_types; + types.push_back(DataTypeImpl::GetTensorType()); + return types; + }(); + return types_with_int64; + } else { + return base_types; + } } } // namespace -ONNX_OPERATOR_VERSIONED_KERNEL_EX( - Cast, - kOnnxDomain, - 6, 8, - kWebGpuExecutionProvider, - (*KernelDefBuilder::Create()) - .TypeConstraint("T1", CastOpTypeConstraints()) - .TypeConstraint("T2", CastOpTypeConstraints()), - Cast); 
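WGSL has no 64-bit integer type, so an int64 element is emulated as two 32-bit words: the low word is the value bit-cast to u32 and the high word is all ones when the (32-bit) value is negative, which is the sign extension the corrected select() above produces. The same packing in a standalone C++ sketch:

    #include <cstdint>
    #include <utility>

    // Pack a 32-bit signed value into the (low, high) u32 pair used to emulate int64 storage.
    std::pair<uint32_t, uint32_t> PackAsInt64Words(int32_t value) {
      const uint32_t low = static_cast<uint32_t>(value);
      const uint32_t high = value < 0 ? 0xFFFFFFFFu : 0u;  // sign-extend into the high word
      return {low, high};
    }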
-ONNX_OPERATOR_VERSIONED_KERNEL_EX( - Cast, - kOnnxDomain, - 9, 12, - kWebGpuExecutionProvider, - (*KernelDefBuilder::Create()) - .TypeConstraint("T1", CastOpTypeConstraints()) - .TypeConstraint("T2", CastOpTypeConstraints()), - Cast); -ONNX_OPERATOR_VERSIONED_KERNEL_EX( - Cast, - kOnnxDomain, - 13, 18, - kWebGpuExecutionProvider, - (*KernelDefBuilder::Create()) - .TypeConstraint("T1", CastOpTypeConstraints()) - .TypeConstraint("T2", CastOpTypeConstraints()), - Cast); -ONNX_OPERATOR_VERSIONED_KERNEL_EX( - Cast, - kOnnxDomain, - 19, 20, - kWebGpuExecutionProvider, - (*KernelDefBuilder::Create()) - .TypeConstraint("T1", CastOpTypeConstraints()) - .TypeConstraint("T2", CastOpTypeConstraints()), - Cast); -ONNX_OPERATOR_VERSIONED_KERNEL_EX( - Cast, - kOnnxDomain, - 21, 22, - kWebGpuExecutionProvider, - (*KernelDefBuilder::Create()) - .TypeConstraint("T1", CastOpTypeConstraints()) - .TypeConstraint("T2", CastOpTypeConstraints()), - Cast); -ONNX_OPERATOR_KERNEL_EX( - Cast, - kOnnxDomain, - 23, - kWebGpuExecutionProvider, - (*KernelDefBuilder::Create()) - .TypeConstraint("T1", CastOpTypeConstraints()) - .TypeConstraint("T2", CastOpTypeConstraints()), - Cast); - Status Cast::ComputeInternal(ComputeContext& context) const { const auto* input_tensor = context.Input(0); auto* output_tensor = context.Output(0, input_tensor->Shape()); @@ -87,12 +41,17 @@ Status Cast::ComputeInternal(ComputeContext& context) const { if (size == 0) { return Status::OK(); } + bool is_from_int64 = input_tensor->DataType() == DataTypeImpl::GetType(); + const int in_components = is_from_int64 ? 1 : 4; + const int out_components = to_ == ONNX_NAMESPACE::TensorProto_DataType_INT64 ? 1 : 4; uint32_t vec_size = onnxruntime::narrow((size + 3) / 4); + uint32_t in_vec_size = onnxruntime::narrow(in_components == 1 ? size : vec_size); + uint32_t out_vec_size = onnxruntime::narrow(out_components == 1 ? 
size : vec_size); - CastProgram program{to_}; + CastProgram program{to_, is_from_int64}; program - .AddInput({input_tensor, ProgramTensorMetadataDependency::Type, {vec_size}, 4}) - .AddOutput({output_tensor, ProgramTensorMetadataDependency::None, {vec_size}, 4}) + .AddInput({input_tensor, ProgramTensorMetadataDependency::Type, {in_vec_size}, in_components}) + .AddOutput({output_tensor, ProgramTensorMetadataDependency::None, {out_vec_size}, out_components}) .SetDispatchGroupSize((vec_size + WORKGROUP_SIZE - 1) / WORKGROUP_SIZE) .AddUniformVariables({ {static_cast(vec_size)}, @@ -121,15 +80,78 @@ Status CastProgram::GenerateShaderCode(ShaderHelper& sh) const { case ONNX_NAMESPACE::TensorProto_DataType_BOOL: expression = "vec4(a)"; break; + case ONNX_NAMESPACE::TensorProto_DataType_INT64: + expression = "int32(a)"; + break; default: ORT_NOT_IMPLEMENTED("Cast to type ", to_, " is not supported."); } - sh.MainFunctionBody() << sh.GuardAgainstOutOfBoundsWorkgroupSizes("uniforms.vec_size") - << " let a = " << input.GetByOffset("global_idx") << ";\n " - << output.SetByOffset("global_idx", expression); + + sh.MainFunctionBody() << sh.GuardAgainstOutOfBoundsWorkgroupSizes("uniforms.vec_size"); + if (is_from_int64_) { + sh.MainFunctionBody() << " let a0 = " << input.GetByOffset("global_idx * 4") << ";\n" + << " let a1 = " << input.GetByOffset("global_idx * 4 + 1") << ";\n" + << " let a2 = " << input.GetByOffset("global_idx * 4 + 2") << ";\n" + << " let a3 = " << input.GetByOffset("global_idx * 4 + 3") << ";\n" + << " let a = vec4(a0, a1, a2, a3);\n"; + } else { + sh.MainFunctionBody() << " let a = " << input.GetByOffset("global_idx") << ";\n"; + } + if (to_ == ONNX_NAMESPACE::TensorProto_DataType_INT64) { + sh.MainFunctionBody() << output.SetByOffset("global_idx * 4", "a.x") << "\n" + << output.SetByOffset("global_idx * 4 + 1", "a.y") << "\n" + << output.SetByOffset("global_idx * 4 + 2", "a.z") << "\n" + << output.SetByOffset("global_idx * 4 + 3", "a.w") << "\n"; + } else { + sh.MainFunctionBody() << output.SetByOffset("global_idx", expression); + } return Status::OK(); } +template +KernelCreateInfo CreateCastKernelInfo(bool enable_graph_capture) { + const auto& type_constraints = CastOpTypeConstraints(enable_graph_capture); + + KernelCreateFn kernel_create_fn = [](FuncManager&, const OpKernelInfo& info, std::unique_ptr& out) -> Status { + out = std::make_unique(info); + return Status::OK(); + }; + + if constexpr (StartVersion == EndVersion) { + // Non-versioned kernel + return { + KernelDefBuilder() + .SetName("Cast") + .SetDomain(kOnnxDomain) + .SinceVersion(StartVersion) + .Provider(kWebGpuExecutionProvider) + .TypeConstraint("T1", type_constraints) + .TypeConstraint("T2", type_constraints) + .Build(), + kernel_create_fn}; + } else { + // Versioned kernel + return { + KernelDefBuilder() + .SetName("Cast") + .SetDomain(kOnnxDomain) + .SinceVersion(StartVersion, EndVersion) + .Provider(kWebGpuExecutionProvider) + .TypeConstraint("T1", type_constraints) + .TypeConstraint("T2", type_constraints) + .Build(), + kernel_create_fn}; + } +} + +// Explicit template instantiations +template KernelCreateInfo CreateCastKernelInfo<6, 8>(bool); +template KernelCreateInfo CreateCastKernelInfo<9, 12>(bool); +template KernelCreateInfo CreateCastKernelInfo<13, 18>(bool); +template KernelCreateInfo CreateCastKernelInfo<19, 20>(bool); +template KernelCreateInfo CreateCastKernelInfo<21, 22>(bool); +template KernelCreateInfo CreateCastKernelInfo<23>(bool); + } // namespace webgpu } // namespace onnxruntime diff --git 
a/onnxruntime/core/providers/webgpu/tensor/cast.h b/onnxruntime/core/providers/webgpu/tensor/cast.h index 925cd200f0aba..7dfb50e3241c8 100644 --- a/onnxruntime/core/providers/webgpu/tensor/cast.h +++ b/onnxruntime/core/providers/webgpu/tensor/cast.h @@ -3,6 +3,8 @@ #pragma once +#include "core/framework/kernel_registry.h" +#include "core/framework/op_kernel.h" #include "core/providers/webgpu/webgpu_kernel.h" namespace onnxruntime { @@ -10,7 +12,7 @@ namespace webgpu { class CastProgram final : public Program { public: - CastProgram(int32_t to) : Program{"Cast"}, to_{to} {} + CastProgram(int32_t to, bool is_from_int64) : Program{"Cast"}, to_{to}, is_from_int64_{is_from_int64} {} Status GenerateShaderCode(ShaderHelper& sh) const override; @@ -18,6 +20,7 @@ class CastProgram final : public Program { private: int32_t to_; + bool is_from_int64_; }; class Cast final : public WebGpuKernel { @@ -37,5 +40,9 @@ class Cast final : public WebGpuKernel { int32_t to_; }; +// Create Cast kernel info with appropriate type constraints based on graph capture support +template +KernelCreateInfo CreateCastKernelInfo(bool enable_graph_capture); + } // namespace webgpu } // namespace onnxruntime diff --git a/onnxruntime/core/providers/webgpu/tensor/gather_nd.cc b/onnxruntime/core/providers/webgpu/tensor/gather_nd.cc index 7c3aced3f0295..cab1dc03848b9 100644 --- a/onnxruntime/core/providers/webgpu/tensor/gather_nd.cc +++ b/onnxruntime/core/providers/webgpu/tensor/gather_nd.cc @@ -43,7 +43,7 @@ Status GatherNDProgram::GenerateShaderCode(ShaderHelper& shader) const { data_dim += indices_innerest_dim_; for (uint32_t i = 0; i < static_cast(data.Rank() - data_dim); i++) { - shader.MainFunctionBody() << " " << data.IndicesSet("data_indices", data_dim, output.IndicesGet("output_indices", indices.Rank() - 1 + i)) << "\n"; + shader.MainFunctionBody() << " " << data.IndicesSet("data_indices", data_dim + i, output.IndicesGet("output_indices", indices.Rank() - 1 + i)) << "\n"; } shader.MainFunctionBody() << " " << output.SetByOffset("global_idx", data.GetByIndices("data_indices")); diff --git a/onnxruntime/core/providers/webgpu/webgpu_execution_provider.cc b/onnxruntime/core/providers/webgpu/webgpu_execution_provider.cc index bbb3fbdd221d3..0f7607ac1dbfe 100644 --- a/onnxruntime/core/providers/webgpu/webgpu_execution_provider.cc +++ b/onnxruntime/core/providers/webgpu/webgpu_execution_provider.cc @@ -28,6 +28,7 @@ #include "core/providers/webgpu/data_transfer.h" #include "core/providers/webgpu/external_data_loader.h" #include "core/providers/webgpu/webgpu_profiler.h" +#include "core/providers/webgpu/tensor/cast.h" namespace onnxruntime { @@ -417,7 +418,7 @@ class ONNX_OPERATOR_VERSIONED_KERNEL_CLASS_NAME(kWebGpuExecutionProvider, kOnnxD class ONNX_OPERATOR_VERSIONED_KERNEL_CLASS_NAME(kWebGpuExecutionProvider, kOnnxDomain, 16, 17, ScatterND); class ONNX_OPERATOR_KERNEL_CLASS_NAME(kWebGpuExecutionProvider, kOnnxDomain, 18, ScatterND); -std::unique_ptr RegisterKernels() { +std::unique_ptr RegisterKernels(bool enable_graph_capture = false) { auto kernel_registry = std::make_unique(); static const BuildKernelCreateInfoFn function_table[] = { @@ -464,13 +465,6 @@ std::unique_ptr RegisterKernels() { KERNEL_CREATE_INFO(13, Tanh), KERNEL_CREATE_INFO(1, Not), - KERNEL_CREATE_INFO_VERSIONED(6, 8, Cast), - KERNEL_CREATE_INFO_VERSIONED(9, 12, Cast), - KERNEL_CREATE_INFO_VERSIONED(13, 18, Cast), - KERNEL_CREATE_INFO_VERSIONED(19, 20, Cast), - KERNEL_CREATE_INFO_VERSIONED(21, 22, Cast), - KERNEL_CREATE_INFO(23, Cast), - // // activations 
BuildKernelCreateInfo, BuildKernelCreateInfo, @@ -771,6 +765,14 @@ std::unique_ptr RegisterKernels() { } } + // Register Cast kernels with conditional int64 support based on graph capture + ORT_THROW_IF_ERROR(kernel_registry->Register(CreateCastKernelInfo<6, 8>(enable_graph_capture))); + ORT_THROW_IF_ERROR(kernel_registry->Register(CreateCastKernelInfo<9, 12>(enable_graph_capture))); + ORT_THROW_IF_ERROR(kernel_registry->Register(CreateCastKernelInfo<13, 18>(enable_graph_capture))); + ORT_THROW_IF_ERROR(kernel_registry->Register(CreateCastKernelInfo<19, 20>(enable_graph_capture))); + ORT_THROW_IF_ERROR(kernel_registry->Register(CreateCastKernelInfo<21, 22>(enable_graph_capture))); + ORT_THROW_IF_ERROR(kernel_registry->Register(CreateCastKernelInfo<23>(enable_graph_capture))); + #ifndef DISABLE_CONTRIB_OPS Status status = ::onnxruntime::contrib::webgpu::RegisterWebGpuContribKernels(*kernel_registry); ORT_ENFORCE(status.IsOK(), "Failed to register WebGPU contrib kernels: " + status.ErrorMessage()); @@ -869,9 +871,13 @@ std::vector> WebGpuExecutionProvider::GetCapa } std::shared_ptr WebGpuExecutionProvider::GetKernelRegistry() const { - static std::shared_ptr registry = webgpu::RegisterKernels(); - - return registry; + if (enable_graph_capture_) { + static std::shared_ptr registry = webgpu::RegisterKernels(true); + return registry; + } else { + static std::shared_ptr registry = webgpu::RegisterKernels(false); + return registry; + } } std::unique_ptr WebGpuExecutionProvider::GetDataTransfer() const { diff --git a/onnxruntime/core/providers/webnn/builders/helper.h b/onnxruntime/core/providers/webnn/builders/helper.h index baedb98a34c28..fbabc23504636 100644 --- a/onnxruntime/core/providers/webnn/builders/helper.h +++ b/onnxruntime/core/providers/webnn/builders/helper.h @@ -38,7 +38,7 @@ WebnnDeviceType DeviceTypeFromString(const std::string_view& device_type); // Collects all the initializer tensors in the subGraph and its ancestor graphs. InitializedTensorSet CollectAllInitializedTensors(const GraphViewer& graph_viewer); -inline std::vector HandleNegativeAxes(const std::vector& axes, size_t input_size) { +inline std::vector HandleNegativeAxes(const gsl::span axes, size_t input_size) { std::vector new_axes(axes.size()); for (size_t i = 0; i < axes.size(); ++i) { new_axes[i] = HandleNegativeAxis(axes[i], input_size); diff --git a/onnxruntime/core/providers/webnn/builders/impl/gemm_op_builder.cc b/onnxruntime/core/providers/webnn/builders/impl/gemm_op_builder.cc index 0ea927967d989..5a80f01c17236 100644 --- a/onnxruntime/core/providers/webnn/builders/impl/gemm_op_builder.cc +++ b/onnxruntime/core/providers/webnn/builders/impl/gemm_op_builder.cc @@ -250,29 +250,6 @@ bool GemmOpBuilder::IsOpSupportedImpl(const GraphViewer&, std::vector c_shape; if (!GetShape(*input_defs[c_idx], c_shape, logger)) return false; - - size_t c_dim = c_shape.size(); - - if (c_dim > 1) { - // TODO: Supports other shape of C. - // Currently WebNN implementation in Chromium only supports 1-D C. - return false; - } - if (c_dim == 0) { - LOGS(logger, VERBOSE) << "C of Gemm is a scalar"; - } else { - auto c_size = c_shape[c_dim - 1]; - NodeAttrHelper helper(node); - const auto transB = helper.Get("transB", 0); - if (c_size != (transB == 0 ? b_shape[1] : b_shape[0])) { - LOGS(logger, VERBOSE) << "C of Gemm must be a vector of b_shape[" - << (transB == 0 ? 
"1" : "0") << "]" - << " b_shape: [" << b_shape[0] << ", " << b_shape[1] << "]" - << " c_size: " << c_size; - - return false; - } - } } } diff --git a/onnxruntime/core/providers/webnn/builders/impl/reduction_op_builder.cc b/onnxruntime/core/providers/webnn/builders/impl/reduction_op_builder.cc index 6ea9b0a440d93..d07e636d578b1 100644 --- a/onnxruntime/core/providers/webnn/builders/impl/reduction_op_builder.cc +++ b/onnxruntime/core/providers/webnn/builders/impl/reduction_op_builder.cc @@ -19,6 +19,8 @@ namespace webnn { class ReductionOpBuilder : public BaseOpBuilder { // Add operator related. public: + // Allow axes potentially being empty inputs that are ignored during processing. + ReductionOpBuilder() : BaseOpBuilder(/*allow empty inputs*/ true) {} void AddInitializersToSkip(ModelBuilder& model_builder, const Node& node) const override; // Add operator related. @@ -37,6 +39,7 @@ void ReductionOpBuilder::AddInitializersToSkip(ModelBuilder& model_builder, cons const auto& input_defs = node.InputDefs(); if (input_defs.size() > 1) { model_builder.AddInitializerToSkip(input_defs[1]->Name()); // axes + model_builder.AddInputToSkip(input_defs[1]->Name()); // axes } } @@ -53,71 +56,50 @@ Status ReductionOpBuilder::AddToModelBuilderImpl(ModelBuilder& model_builder, NodeAttrHelper helper(node); const auto keep_dims = helper.Get("keepdims", 1); + emscripten::val options = emscripten::val::object(); options.set("label", node.Name()); options.set("keepDimensions", keep_dims == 1); - std::vector axes_data; - - emscripten::val output = emscripten::val::object(); + std::vector axes_data; const auto opset = node.SinceVersion(); const auto& op_type = node.OpType(); if (opset >= 18 || (op_type == "ReduceSum" && opset >= 13)) { // 'axes' is an optional input. - const auto noop_with_empty_axes = helper.Get("noop_with_empty_axes", 0); - if (!GetTensorName(input_defs, 1).empty()) { - // Optional input axes is provided, use axes initializer data. - const auto& initializers(model_builder.GetInitializerTensors()); - const auto& axes_tensor = *initializers.at(input_defs[1]->Name()); - Initializer axes_initializer(axes_tensor); - const auto axes_data_span = axes_initializer.DataAsSpan(); - std::transform( - axes_data_span.begin(), axes_data_span.end(), std::back_inserter(axes_data), - [input_rank](int64_t axis) -> int32_t { return SafeInt(HandleNegativeAxis(axis, input_rank)); }); - } else { - if (noop_with_empty_axes) { - // When axes is empty and this attribute is set to true, input tensor will not be reduced. - output = input; - model_builder.AddOperand(node.OutputDefs()[0]->Name(), std::move(output)); - return Status::OK(); + std::vector axes_shape; + if (TensorExists(input_defs, 1)) { + ORT_RETURN_IF_NOT(GetShape(*input_defs[1], axes_shape, logger), "Cannot get shape of input axes"); + if (axes_shape[0] != 0) { + // Optional input axes is provided and we already ensure it is an initializer. + // Use that initializer data. 
+ const auto& initializers(model_builder.GetInitializerTensors()); + const auto& axes_tensor = *initializers.at(input_defs[1]->Name()); + Initializer axes_initializer(axes_tensor); + const auto axes_data_span = axes_initializer.DataAsSpan(); + axes_data = HandleNegativeAxes(axes_data_span, input_rank); } } } else { if (helper.HasAttr("axes")) { - auto axes = helper.Get("axes", std::vector{}); - std::transform( - axes.begin(), axes.end(), std::back_inserter(axes_data), - [input_rank](int64_t axis) -> int32_t { return SafeInt(HandleNegativeAxis(axis, input_rank)); }); + axes_data = GetResolvedAxes(helper, input_rank); } } - if (axes_data.size() > 0) { - options.set("axes", emscripten::val::array(axes_data)); - } - if (op_type == "ReduceL1") { - output = model_builder.GetBuilder().call("reduceL1", input, options); - } else if (op_type == "ReduceL2") { - output = model_builder.GetBuilder().call("reduceL2", input, options); - } else if (op_type == "ReduceLogSum") { - output = model_builder.GetBuilder().call("reduceLogSum", input, options); - } else if (op_type == "ReduceLogSumExp") { - output = model_builder.GetBuilder().call("reduceLogSumExp", input, options); - } else if (op_type == "ReduceMax") { - output = model_builder.GetBuilder().call("reduceMax", input, options); - } else if (op_type == "ReduceMean") { - output = model_builder.GetBuilder().call("reduceMean", input, options); - } else if (op_type == "ReduceMin") { - output = model_builder.GetBuilder().call("reduceMin", input, options); - } else if (op_type == "ReduceProd") { - output = model_builder.GetBuilder().call("reduceProduct", input, options); - } else if (op_type == "ReduceSum") { - output = model_builder.GetBuilder().call("reduceSum", input, options); - } else if (op_type == "ReduceSumSquare") { - output = model_builder.GetBuilder().call("reduceSumSquare", input, options); - } else { - return ORT_MAKE_STATUS(ONNXRUNTIME, INVALID_ARGUMENT, "ReductionOpBuilder, unknown op: ", op_type); + // When axes is not provided or is empty, check the 'noop_with_empty_axes' attribute: + // - If it is false, perform reduction over all dimensions. + // (In WebNN, this means the 'axes' option is not set.) + // - If it is true, no reduction is applied, but other operations are still performed. + // (In WebNN, this requires setting 'axes' to an empty array.) + if (!axes_data.empty() || helper.Get("noop_with_empty_axes", 0) == 1) { + options.set("axes", emscripten::val::array(GetNarrowedIntFromInt64(axes_data))); } + const std::string_view webnn_op_type = GetWebNNOpType(op_type); + ORT_RETURN_IF(webnn_op_type.empty(), "Cannot get WebNN op type"); + + emscripten::val output = model_builder.GetBuilder().call( + std::string(webnn_op_type).c_str(), input, options); + model_builder.AddOperand(node.OutputDefs()[0]->Name(), std::move(output)); return Status::OK(); } @@ -128,11 +110,25 @@ bool ReductionOpBuilder::IsOpSupportedImpl(const GraphViewer& graph_viewer, const WebnnDeviceType /* device_type */, const logging::Logger& logger) const { const auto& input_defs = node.InputDefs(); - const std::string axes_name = GetTensorName(input_defs, 1); - // If the optional input 'axes' is provided, it must be an initializer. 
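HandleNegativeAxes maps each negative axis into the [0, rank) range by adding the input rank, which is what the span-based overload applied to the axes initializer data above relies on. A small equivalent sketch (hypothetical NormalizeAxes, not the ORT helper, and without any validation of out-of-range axes):

    #include <cstddef>
    #include <cstdint>
    #include <vector>

    std::vector<int64_t> NormalizeAxes(const std::vector<int64_t>& axes, std::size_t rank) {
      std::vector<int64_t> normalized;
      normalized.reserve(axes.size());
      for (int64_t axis : axes) {
        // A negative axis counts from the end, so -1 becomes rank - 1, and so on.
        normalized.push_back(axis < 0 ? axis + static_cast<int64_t>(rank) : axis);
      }
      return normalized;
    }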
- if (!axes_name.empty() && !graph_viewer.GetConstantInitializer(axes_name)) { - LOGS(logger, VERBOSE) << "Input axes of " << node.OpType() << " must be a constant"; - return false; + + if (TensorExists(input_defs, 1)) { + std::vector axes_shape; + if (!GetShape(*input_defs[1], axes_shape, logger)) { + LOGS(logger, VERBOSE) << "Cannot get shape of input axes"; + return false; + } + + if (axes_shape.size() != 1) { + LOGS(logger, VERBOSE) << "Input axes of " << node.OpType() << " must be 1D"; + return false; + } + + const std::string axes_name = GetTensorName(input_defs, 1); + // If the optional input 'axes' is provided and not empty, it must be an initializer. + if (axes_shape[0] != 0 && !graph_viewer.GetConstantInitializer(axes_name)) { + LOGS(logger, VERBOSE) << "Input axes of " << node.OpType() << " must be a constant"; + return false; + } } return true; diff --git a/onnxruntime/python/onnxruntime_validation.py b/onnxruntime/python/onnxruntime_validation.py index 4a72916d3e485..6912d19897d67 100644 --- a/onnxruntime/python/onnxruntime_validation.py +++ b/onnxruntime/python/onnxruntime_validation.py @@ -23,9 +23,9 @@ def check_distro_info(): __my_distro__ = __my_system__ __my_distro_ver__ = platform.release().lower() - if __my_distro_ver__ not in ["10", "11"]: + if __my_distro_ver__ not in ["10", "11", "2016server", "2019server", "2022server", "2025server"]: warnings.warn( - f"Unsupported Windows version ({__my_distro_ver__}). ONNX Runtime supports Windows 10 and above, only." + f"Unsupported Windows version ({__my_distro_ver__}). ONNX Runtime supports Windows 10 and above, or Windows Server 2016 and above." ) elif __my_system__ == "linux": """Although the 'platform' python module for getting Distro information works well on standard OS images diff --git a/onnxruntime/python/tools/tensorrt/perf/benchmark.py b/onnxruntime/python/tools/tensorrt/perf/benchmark.py index d6b39a6b2aeb4..66ab0c44f8814 100644 --- a/onnxruntime/python/tools/tensorrt/perf/benchmark.py +++ b/onnxruntime/python/tools/tensorrt/perf/benchmark.py @@ -613,7 +613,7 @@ def validate(all_ref_outputs, all_outputs, rtol, atol, percent_mismatch): for ref_o, o in zip(ref_output, output, strict=False): # abs(desired-actual) < rtol * abs(desired) + atol try: - np.testing.assert_allclose(ref_o, o, rtol, atol) + np.testing.assert_allclose(o, ref_o, rtol, atol) except Exception as e: if percentage_in_allowed_threshold(e, percent_mismatch): continue diff --git a/onnxruntime/test/contrib_ops/gather_block_quantized_op_test.cc b/onnxruntime/test/contrib_ops/gather_block_quantized_op_test.cc index 574ec49da67ea..3bf37ea193245 100644 --- a/onnxruntime/test/contrib_ops/gather_block_quantized_op_test.cc +++ b/onnxruntime/test/contrib_ops/gather_block_quantized_op_test.cc @@ -82,7 +82,7 @@ void CheckDataAndShape(const std::vector& data, const std::vector& s ORT_ENFORCE(static_cast(data.size()) == total_elements, "Data size does not match the shape", "Data size: ", data.size(), ", Expected size: ", total_elements, - ", Shape: ", VectorToString(shape), " Name:", name, " Type:", typeid(T).name()); + ", Shape: ", VectorToString(shape), " Name:", name); } // Combinations: types, gather_axis, quantize_axis, block_size, indices, scale shape vs data shape diff --git a/onnxruntime/test/ir/graph_test.cc b/onnxruntime/test/ir/graph_test.cc index 4fd9830440846..7371ad5cf0ded 100644 --- a/onnxruntime/test/ir/graph_test.cc +++ b/onnxruntime/test/ir/graph_test.cc @@ -2,13 +2,17 @@ // Licensed under the MIT License. 
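The benchmark argument swap above matters because the tolerance test behind np.testing.assert_allclose is asymmetric: it checks |actual - desired| <= atol + rtol * |desired|, so the reference output has to be passed as the desired argument for the relative term to be scaled by the reference values. The equivalent scalar check, sketched in C++ for illustration:

    #include <cmath>

    // Mirrors the per-element closeness criterion noted in the benchmark comment.
    bool AllClose(double actual, double desired, double rtol, double atol) {
      return std::fabs(actual - desired) <= atol + rtol * std::fabs(desired);
    }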
#include +#include #include "core/common/inlined_containers.h" #include "core/common/span_utils.h" #include "core/framework/tensorprotoutils.h" #include "core/graph/graph_viewer.h" #include "core/graph/model.h" #include "core/graph/op.h" +#include "core/session/inference_session.h" +#include "core/session/environment.h" #include "test/providers/provider_test_utils.h" +#include "test/test_environment.h" #include "gtest/gtest.h" #include "gmock/gmock.h" #include "onnx/defs/function.h" @@ -2573,5 +2577,259 @@ TEST_F(GraphTest, GraphConstruction_MemoryEfficientTopologicalSort_SubgraphGener #endif +// Test for shape inference with in-memory external data (issue #26261) +// This tests the fix for a regression where Constant nodes with large tensors (>127 bytes) +// stored as in-memory external data would cause shape inference to fail +TEST_F(GraphTest, ShapeInferenceWithInMemoryExternalData) { + // Create a model with a Constant node that produces a tensor larger than kSmallTensorExternalDataThreshold (127 bytes) + // This will trigger the in-memory externalization path + ModelProto model_proto; + model_proto.set_ir_version(ONNX_NAMESPACE::Version::IR_VERSION); + auto* opset = model_proto.add_opset_import(); + opset->set_version(17); + + auto* graph_proto = model_proto.mutable_graph(); + graph_proto->set_name("test_graph"); + + // Create a Constant node with a tensor of 16 INT64 values (128 bytes, just over the 127 threshold) + auto* constant_node = graph_proto->add_node(); + constant_node->set_op_type("Constant"); + constant_node->set_name("const_node"); + constant_node->add_output("const_output"); + + // Add the value attribute with a tensor + auto* attr = constant_node->add_attribute(); + attr->set_name("value"); + attr->set_type(ONNX_NAMESPACE::AttributeProto_AttributeType_TENSOR); + auto* tensor = attr->mutable_t(); + tensor->set_data_type(ONNX_NAMESPACE::TensorProto_DataType_INT64); + tensor->add_dims(16); // 16 elements * 8 bytes = 128 bytes + // Each split will be size 1, totaling 16 + for (int64_t i = 0; i < 16; ++i) { + tensor->add_int64_data(1); + } + + // Create a Split node that uses the constant as input + // Split requires constant input for the 'split' parameter, which triggers shape inference + auto* split_node = graph_proto->add_node(); + split_node->set_op_type("Split"); + split_node->set_name("split_node"); + split_node->add_input("input_data"); + split_node->add_input("const_output"); // Use constant as split sizes + for (int i = 0; i < 16; ++i) { + split_node->add_output("split_output_" + std::to_string(i)); + } + + // Add axis attribute + auto* axis_attr = split_node->add_attribute(); + axis_attr->set_name("axis"); + axis_attr->set_type(ONNX_NAMESPACE::AttributeProto_AttributeType_INT); + axis_attr->set_i(0); + + // Add graph input + auto* input = graph_proto->add_input(); + input->set_name("input_data"); + auto* input_type = input->mutable_type()->mutable_tensor_type(); + input_type->set_elem_type(ONNX_NAMESPACE::TensorProto_DataType_FLOAT); + input_type->mutable_shape()->add_dim()->set_dim_value(16); + input_type->mutable_shape()->add_dim()->set_dim_value(10); + + // Add graph outputs + for (int i = 0; i < 16; ++i) { + auto* output = graph_proto->add_output(); + output->set_name("split_output_" + std::to_string(i)); + } + + // Load the model - this should succeed with the fix + // Before the fix, this would fail with: + // "Cannot parse data from external tensors. 
Please load external data into raw data for tensor" + std::shared_ptr model; + ASSERT_STATUS_OK(Model::Load(std::move(model_proto), model, nullptr, *logger_)); + + // Verify the graph was properly constructed + Graph& graph = model->MainGraph(); + ASSERT_STATUS_OK(graph.Resolve()); + + // Verify the constant node was converted to an initializer + const ONNX_NAMESPACE::TensorProto* initializer = nullptr; + ASSERT_TRUE(graph.GetInitializedTensor("const_output", initializer)); + ASSERT_NE(initializer, nullptr); + + // Verify the Split node can access the constant data during shape inference + const Node* split_node_ptr = nullptr; + for (const auto& node : graph.Nodes()) { + if (node.Name() == "split_node") { + split_node_ptr = &node; + break; + } + } + ASSERT_NE(split_node_ptr, nullptr); + + // Verify outputs are properly shaped + ASSERT_EQ(split_node_ptr->OutputDefs().size(), 16u); +} + +// Test for shape inference with in-memory external data using InferenceSession +// This test more accurately reproduces the issue by going through the full session initialization +// which includes graph optimizations that trigger the in-memory externalization +TEST_F(GraphTest, ShapeInferenceWithInMemoryExternalDataViaSession) { + // Create the same model as above + ModelProto model_proto; + model_proto.set_ir_version(ONNX_NAMESPACE::Version::IR_VERSION); + auto* opset = model_proto.add_opset_import(); + opset->set_version(17); + + auto* graph_proto = model_proto.mutable_graph(); + graph_proto->set_name("test_graph"); + + // Create a Constant node with a tensor of 16 INT64 values (128 bytes) + auto* constant_node = graph_proto->add_node(); + constant_node->set_op_type("Constant"); + constant_node->set_name("const_node"); + constant_node->add_output("const_output"); + + auto* attr = constant_node->add_attribute(); + attr->set_name("value"); + attr->set_type(ONNX_NAMESPACE::AttributeProto_AttributeType_TENSOR); + auto* tensor = attr->mutable_t(); + tensor->set_data_type(ONNX_NAMESPACE::TensorProto_DataType_INT64); + tensor->add_dims(16); + for (int64_t i = 0; i < 16; ++i) { + tensor->add_int64_data(1); + } + + // Create a Split node + auto* split_node = graph_proto->add_node(); + split_node->set_op_type("Split"); + split_node->set_name("split_node"); + split_node->add_input("input_data"); + split_node->add_input("const_output"); + for (int i = 0; i < 16; ++i) { + split_node->add_output("split_output_" + std::to_string(i)); + } + + auto* axis_attr = split_node->add_attribute(); + axis_attr->set_name("axis"); + axis_attr->set_type(ONNX_NAMESPACE::AttributeProto_AttributeType_INT); + axis_attr->set_i(0); + + // Add graph input + auto* input = graph_proto->add_input(); + input->set_name("input_data"); + auto* input_type = input->mutable_type()->mutable_tensor_type(); + input_type->set_elem_type(ONNX_NAMESPACE::TensorProto_DataType_FLOAT); + input_type->mutable_shape()->add_dim()->set_dim_value(16); + input_type->mutable_shape()->add_dim()->set_dim_value(10); + + // Add graph outputs + for (int i = 0; i < 16; ++i) { + auto* output = graph_proto->add_output(); + output->set_name("split_output_" + std::to_string(i)); + } + + // Save to a temporary file + const std::string model_path = "test_in_memory_external_data.onnx"; + { + std::ofstream file(model_path, std::ios::binary); + ASSERT_TRUE(file.is_open()); + ASSERT_TRUE(model_proto.SerializeToOstream(&file)); + } + + // Test with ORT_DISABLE_ALL optimization which should trigger the bug without the fix + SessionOptions so; + so.graph_optimization_level = 
TransformerLevel::Default; // This triggers the issue + so.session_logid = "GraphTest.ShapeInferenceWithInMemoryExternalDataViaSession"; + + InferenceSession session_object{so, GetEnvironment()}; + + // This should succeed with the fix, fail without it + ASSERT_STATUS_OK(session_object.Load(model_path)); + ASSERT_STATUS_OK(session_object.Initialize()); + + // Clean up + std::remove(model_path.c_str()); +} + +// Test that explicitly triggers the in-memory externalization and then shape inference +// This test directly reproduces the bug scenario +TEST_F(GraphTest, ShapeInferenceAfterInitializerExternalization) { + // Create a model with a Split node that depends on a constant initializer + ModelProto model_proto; + model_proto.set_ir_version(ONNX_NAMESPACE::Version::IR_VERSION); + auto* opset = model_proto.add_opset_import(); + opset->set_version(17); + + auto* graph_proto = model_proto.mutable_graph(); + graph_proto->set_name("test_graph"); + + // Create initializer directly (not as Constant node) with 128 bytes + auto* initializer = graph_proto->add_initializer(); + initializer->set_name("split_sizes"); + initializer->set_data_type(ONNX_NAMESPACE::TensorProto_DataType_INT64); + initializer->add_dims(16); // 16 * 8 = 128 bytes + for (int64_t i = 0; i < 16; ++i) { + initializer->add_int64_data(1); + } + + // Create a Split node that uses this initializer + auto* split_node = graph_proto->add_node(); + split_node->set_op_type("Split"); + split_node->set_name("split_node"); + split_node->add_input("input_data"); + split_node->add_input("split_sizes"); // Uses the large initializer + for (int i = 0; i < 16; ++i) { + split_node->add_output("split_output_" + std::to_string(i)); + } + + auto* axis_attr = split_node->add_attribute(); + axis_attr->set_name("axis"); + axis_attr->set_type(ONNX_NAMESPACE::AttributeProto_AttributeType_INT); + axis_attr->set_i(0); + + // Add graph input + auto* input = graph_proto->add_input(); + input->set_name("input_data"); + auto* input_type = input->mutable_type()->mutable_tensor_type(); + input_type->set_elem_type(ONNX_NAMESPACE::TensorProto_DataType_FLOAT); + input_type->mutable_shape()->add_dim()->set_dim_value(16); + input_type->mutable_shape()->add_dim()->set_dim_value(10); + + // Add graph outputs + for (int i = 0; i < 16; ++i) { + auto* output = graph_proto->add_output(); + output->set_name("split_output_" + std::to_string(i)); + } + + // Load model + std::shared_ptr model; + ASSERT_STATUS_OK(Model::Load(std::move(model_proto), model, nullptr, *logger_)); + + Graph& graph = model->MainGraph(); + // First resolve should succeed + ASSERT_STATUS_OK(graph.Resolve()); + + // Now trigger the in-memory externalization + // This converts initializers > 127 bytes to OrtValues with external data references + Status convert_status = graph.ConvertInitializersIntoOrtValues(); + ASSERT_TRUE(convert_status.IsOK()) << "ConvertInitializersIntoOrtValues failed: " << convert_status.ErrorMessage(); + + // Check if the initializer was actually externalized + const ONNX_NAMESPACE::TensorProto* initializer_after = nullptr; + ASSERT_TRUE(graph.GetInitializedTensor("split_sizes", initializer_after)); + ASSERT_NE(initializer_after, nullptr); + // Debug: verify it was externalized + ASSERT_TRUE(utils::HasExternalDataInMemory(*initializer_after)) + << "Initializer was not externalized to in-memory external data"; + + // Mark the graph as needing resolve to force shape inference to run again + graph.SetGraphResolveNeeded(); + + // Resolve again - this should trigger shape inference with 
the externalized initializer + // Without the fix, this will fail with "Cannot parse data from external tensors" + // With the fix, getInputData() materializes the external data for shape inference + Status second_resolve = graph.Resolve(); + ASSERT_TRUE(second_resolve.IsOK()) << "Second resolve failed: " << second_resolve.ErrorMessage(); +} + } // namespace test } // namespace onnxruntime diff --git a/onnxruntime/test/onnx/TestCase.cc b/onnxruntime/test/onnx/TestCase.cc index 6df98ff505fa1..cbb25bb9b629e 100644 --- a/onnxruntime/test/onnx/TestCase.cc +++ b/onnxruntime/test/onnx/TestCase.cc @@ -1435,9 +1435,22 @@ std::unique_ptr> GetBrokenTests(const std::string& provider broken_tests->insert({"scatter_elements_with_negative_indices", "unknown version"}); // Fails since ONNX==1.19.0 broken_tests->insert({"l2normalization_axis_0", "unknown version"}); + broken_tests->insert({"attention_3d_gqa", "unknown version"}); + broken_tests->insert({"attention_3d_gqa_attn_mask", "unknown version"}); + broken_tests->insert({"attention_3d_gqa_causal", "unknown version"}); + broken_tests->insert({"attention_3d_gqa_scaled", "unknown version"}); + broken_tests->insert({"attention_3d_gqa_softcap", "unknown version"}); + broken_tests->insert({"attention_3d_gqa_with_past_and_present", "unknown version"}); + broken_tests->insert({"attention_4d_gqa", "unknown version"}); + broken_tests->insert({"attention_4d_gqa_attn_mask", "unknown version"}); + broken_tests->insert({"attention_4d_gqa_causal", "unknown version"}); + broken_tests->insert({"attention_4d_gqa_scaled", "unknown version"}); + broken_tests->insert({"attention_4d_gqa_softcap", "unknown version"}); + broken_tests->insert({"attention_4d_gqa_with_past_and_present", "unknown version"}); + broken_tests->insert({"attention_4d_gqa_with_past_and_present_fp16", "unknown version"}); + broken_tests->insert({"attention_4d_with_past_and_present_qk_matmul_bias_3d_mask_causal", "unknown version"}); + broken_tests->insert({"attention_4d_with_past_and_present_qk_matmul_bias_4d_mask_causal", "unknown version"}); broken_tests->insert({"attention_4d_diff_heads_mask4d_padded_kv", "need nonpad_kv_seqlen "}); - broken_tests->insert({"attention_4d_with_past_and_present_qk_matmul_bias_3d_mask_causal", "attention op implementation is wrong"}); - broken_tests->insert({"attention_4d_with_past_and_present_qk_matmul_bias_4d_mask_causal", "attention op implementation is wrong"}); } #ifdef DISABLE_CONTRIB_OPS diff --git a/onnxruntime/test/onnx/main.cc b/onnxruntime/test/onnx/main.cc index b6f2cb2683677..463634b370d4c 100644 --- a/onnxruntime/test/onnx/main.cc +++ b/onnxruntime/test/onnx/main.cc @@ -795,24 +795,6 @@ select from 'TF8', 'TF16', 'UINT8', 'FLOAT', 'ITENSOR'. 
\n)"); // Please make no more changes to the list static const ORTCHAR_T* immutable_broken_tests[] = { - // pending ONNX update - ORT_TSTR("attention_3d_gqa"), - ORT_TSTR("attention_3d_gqa_attn_mask"), - ORT_TSTR("attention_3d_gqa_causal"), - ORT_TSTR("attention_3d_gqa_scaled"), - ORT_TSTR("attention_3d_gqa_softcap"), - ORT_TSTR("attention_3d_gqa_with_past_and_present"), - ORT_TSTR("attention_4d_gqa"), - ORT_TSTR("attention_4d_gqa_attn_mask"), - ORT_TSTR("attention_4d_gqa_causal"), - ORT_TSTR("attention_4d_gqa_scaled"), - ORT_TSTR("attention_4d_gqa_softcap"), - ORT_TSTR("attention_4d_gqa_with_past_and_present"), - ORT_TSTR("attention_4d_diff_heads_mask4d_padded_kv"), - ORT_TSTR("attention_4d_gqa_with_past_and_present_fp16"), - ORT_TSTR("attention_4d_with_past_and_present_qk_matmul_bias_3d_mask_causal"), - ORT_TSTR("attention_4d_with_past_and_present_qk_matmul_bias_4d_mask_causal"), - // unsupported case ORT_TSTR("AvgPool1d"), ORT_TSTR("AvgPool1d_stride"), ORT_TSTR("AvgPool2d"), diff --git a/onnxruntime/test/providers/cpu/model_tests.cc b/onnxruntime/test/providers/cpu/model_tests.cc index cf49601e6c671..ca1a3104e0bed 100644 --- a/onnxruntime/test/providers/cpu/model_tests.cc +++ b/onnxruntime/test/providers/cpu/model_tests.cc @@ -678,7 +678,14 @@ ::std::vector<::std::basic_string> GetParameterStrings() { ORT_TSTR("fp16_coreml_FNS-Candy"), ORT_TSTR("fp16_test_tiny_yolov2"), ORT_TSTR("fp16_test_shufflenet"), - ORT_TSTR("keras2coreml_SimpleRNN_ImageNet")}; + ORT_TSTR("keras2coreml_SimpleRNN_ImageNet"), + // models from model zoo. #26274: cuDNN frontend no valid engine + ORT_TSTR("YOLOv3"), + ORT_TSTR("YOLOv3-12"), + ORT_TSTR("YOLOv4"), + ORT_TSTR("SSD-MobilenetV1"), + ORT_TSTR("SSD-MobilenetV1-12")}; + // For ROCm EP, also disable the following tests due to flakiness, // mainly with precision issue and random memory access fault. 
static const ORTCHAR_T* rocm_disabled_tests[] = {ORT_TSTR("bvlc_alexnet"), diff --git a/onnxruntime/test/providers/cpu/tensor/cast_op_test.cc b/onnxruntime/test/providers/cpu/tensor/cast_op_test.cc index 8f2eac2d05792..ed67b531ef394 100644 --- a/onnxruntime/test/providers/cpu/tensor/cast_op_test.cc +++ b/onnxruntime/test/providers/cpu/tensor/cast_op_test.cc @@ -1480,7 +1480,7 @@ template void CastOpTestFloatFloat4(std::vector<int64_t> shape, std::vector<float> float_data, bool is_fp4_input = false) { - size_t num_pairs = float_data.size() / 2; + int num_pairs = static_cast<int>(float_data.size()) / 2; int num_fp4_elements = static_cast<int>((float_data.size() + 1) / 2); bool is_odd_count = (float_data.size() % 2 != 0); diff --git a/onnxruntime/test/providers/tensorrt/tensorrt_basic_test.cc b/onnxruntime/test/providers/tensorrt/tensorrt_basic_test.cc index 327dfab96c2d1..a746493d779f8 100644 --- a/onnxruntime/test/providers/tensorrt/tensorrt_basic_test.cc +++ b/onnxruntime/test/providers/tensorrt/tensorrt_basic_test.cc @@ -713,6 +713,52 @@ TEST(TensorrtExecutionProviderTest, TRTPluginsCustomOpTest) { ASSERT_TRUE(status.IsOK()); } +TEST(TensorrtExecutionProviderTest, DDSOutputTest) { + PathString model_name = ORT_TSTR("testdata/ort_github_issue_26272_dds.onnx"); + SessionOptions so; + so.session_logid = "TensorrtExecutionProviderRunWithDDSOutput"; + RunOptions run_options; + run_options.run_tag = so.session_logid; + InferenceSession session_object{so, GetEnvironment()}; + auto cuda_provider = DefaultCudaExecutionProvider(); + auto cuda_allocator = cuda_provider->CreatePreferredAllocators()[1]; + std::vector<int64_t> dims_op_x = {3, 4}; + std::vector<float> values_op_x(12, 0.f); // 12=3*4 + OrtValue ml_value_x; + CreateMLValue(cuda_allocator, dims_op_x, values_op_x, &ml_value_x); + + NameMLValMap feeds; + feeds.insert(std::make_pair("data", ml_value_x)); + + // prepare outputs + std::vector<std::string> output_names; + output_names.push_back("output"); + std::vector<OrtValue> fetches; + + OrtTensorRTProviderOptionsV2 params; + std::unique_ptr<IExecutionProvider> execution_provider = TensorrtExecutionProviderWithOptions(&params); + EXPECT_TRUE(session_object.RegisterExecutionProvider(std::move(execution_provider)).IsOK()); + auto status = session_object.Load(model_name); + ASSERT_TRUE(status.IsOK()); + status = session_object.Initialize(); + ASSERT_TRUE(status.IsOK()); + + // First pass run + status = session_object.Run(run_options, feeds, output_names, &fetches); + ASSERT_TRUE(status.IsOK()); + + // Second pass run with new shape + dims_op_x = {6, 4}; + values_op_x.resize(24, 0.f); // 24=6*4 + CreateMLValue(cuda_allocator, dims_op_x, values_op_x, &ml_value_x); + feeds.clear(); + + feeds.insert(std::make_pair("data", ml_value_x)); + + status = session_object.Run(run_options, feeds, output_names, &fetches); + ASSERT_TRUE(status.IsOK()); +} + TEST_P(TensorrtExecutionProviderCacheTest, Run) { // GetParam() returns the parameter of following format: // ##cache type##_##input shape type## diff --git a/onnxruntime/test/python/onnx_backend_test_series.py b/onnxruntime/test/python/onnx_backend_test_series.py index 72c6a5664f395..d2e9557f633b0 100644 --- a/onnxruntime/test/python/onnx_backend_test_series.py +++ b/onnxruntime/test/python/onnx_backend_test_series.py @@ -43,13 +43,13 @@ def assert_similar_outputs(cls, ref_outputs, outputs, rtol, atol, model_dir=None """ def assert_similar_array(ref_output, output): - np.testing.assert_equal(ref_output.dtype, output.dtype) + np.testing.assert_equal(output.dtype, ref_output.dtype) if ref_output.dtype == object: -
np.testing.assert_array_equal(ref_output, output) + np.testing.assert_array_equal(output, ref_output) else: - np.testing.assert_allclose(ref_output, output, rtol=rtol, atol=atol) + np.testing.assert_allclose(output, ref_output, rtol=rtol, atol=atol) - np.testing.assert_equal(len(ref_outputs), len(outputs)) + np.testing.assert_equal(len(outputs), len(ref_outputs)) for i in range(len(outputs)): # pylint: disable=consider-using-enumerate if isinstance(outputs[i], list): for j in range(len(outputs[i])): diff --git a/onnxruntime/test/python/onnxruntime_test_python.py b/onnxruntime/test/python/onnxruntime_test_python.py index e44adcdb9827f..7f003453add89 100644 --- a/onnxruntime/test/python/onnxruntime_test_python.py +++ b/onnxruntime/test/python/onnxruntime_test_python.py @@ -54,7 +54,7 @@ def run_model(self, session_object, run_options): input_name = session_object.get_inputs()[0].name res = session_object.run([], {input_name: x}, run_options=run_options) output_expected = np.array([[1.0, 4.0], [9.0, 16.0], [25.0, 36.0]], dtype=np.float32) - np.testing.assert_allclose(output_expected, res[0], rtol=1e-05, atol=1e-08) + np.testing.assert_allclose(res[0], output_expected, rtol=1e-05, atol=1e-08) def run_model_with_input(self, session_object, input_name, input_value, iter_num, queue): for _ in range(iter_num): @@ -714,7 +714,7 @@ def test_run_model(self): res = sess.run([outputs[0].name], {inputs[0].name: x}) output_expected = np.array([[1.0, 4.0], [9.0, 16.0], [25.0, 36.0]], dtype=np.float32) - np.testing.assert_allclose(output_expected, res[0], rtol=1e-05, atol=1e-08) + np.testing.assert_allclose(res[0], output_expected, rtol=1e-05, atol=1e-08) def test_run_async(self): event = threading.Event() @@ -733,7 +733,7 @@ def callback(res: np.ndarray, data: MyData, err: str) -> None: self.assertEqual(len(err), 0) self.assertEqual(len(res), 1) self.assertEqual(data.get_id(), 123456) - np.testing.assert_allclose(output_expected, res[0], rtol=1e-05, atol=1e-08) + np.testing.assert_allclose(res[0], output_expected, rtol=1e-05, atol=1e-08) event.set() so = onnxrt.SessionOptions() @@ -762,7 +762,7 @@ def test_run_model_from_bytes(self): self.assertEqual(output_shape, [3, 2]) res = sess.run([output_name], {input_name: x}) output_expected = np.array([[1.0, 4.0], [9.0, 16.0], [25.0, 36.0]], dtype=np.float32) - np.testing.assert_allclose(output_expected, res[0], rtol=1e-05, atol=1e-08) + np.testing.assert_allclose(res[0], output_expected, rtol=1e-05, atol=1e-08) def test_run_model2(self): sess = onnxrt.InferenceSession(get_name("matmul_1.onnx"), providers=onnxrt.get_available_providers()) @@ -777,7 +777,7 @@ def test_run_model2(self): self.assertEqual(output_shape, [3, 1]) res = sess.run([output_name], {input_name: x}) output_expected = np.array([[5.0], [11.0], [17.0]], dtype=np.float32) - np.testing.assert_allclose(output_expected, res[0], rtol=1e-05, atol=1e-08) + np.testing.assert_allclose(res[0], output_expected, rtol=1e-05, atol=1e-08) def test_run_model2_contiguous(self): sess = onnxrt.InferenceSession(get_name("matmul_1.onnx"), providers=onnxrt.get_available_providers()) @@ -792,10 +792,10 @@ def test_run_model2_contiguous(self): self.assertEqual(output_shape, [3, 1]) res = sess.run([output_name], {input_name: x}) output_expected = np.array([[5.0], [11.0], [17.0]], dtype=np.float32) - np.testing.assert_allclose(output_expected, res[0], rtol=1e-05, atol=1e-08) + np.testing.assert_allclose(res[0], output_expected, rtol=1e-05, atol=1e-08) xcontiguous = np.ascontiguousarray(x) rescontiguous = 
sess.run([output_name], {input_name: xcontiguous}) - np.testing.assert_allclose(output_expected, rescontiguous[0], rtol=1e-05, atol=1e-08) + np.testing.assert_allclose(rescontiguous[0], output_expected, rtol=1e-05, atol=1e-08) def test_run_model_multiple_threads(self): # Skip this test for a "pure" DML onnxruntime python wheel. @@ -860,14 +860,14 @@ def test_list_as_input(self): input_name = sess.get_inputs()[0].name res = sess.run([], {input_name: x.tolist()}) output_expected = np.array([[1.0, 4.0], [9.0, 16.0], [25.0, 36.0]], dtype=np.float32) - np.testing.assert_allclose(output_expected, res[0], rtol=1e-05, atol=1e-08) + np.testing.assert_allclose(res[0], output_expected, rtol=1e-05, atol=1e-08) def test_string_list_as_input(self): sess = onnxrt.InferenceSession(get_name("identity_string.onnx"), providers=available_providers_without_tvm) x = np.array(["this", "is", "identity", "test"], dtype=str).reshape((2, 2)) x_name = sess.get_inputs()[0].name res = sess.run([], {x_name: x.tolist()}) - np.testing.assert_equal(x, res[0]) + np.testing.assert_equal(res[0], x) def test_run_device(self): device = onnxrt.get_device() @@ -888,7 +888,7 @@ def test_run_model_symbolic_input(self): self.assertEqual(output_shape, ["None", 1]) res = sess.run([output_name], {input_name: x}) output_expected = np.array([[5.0], [11.0], [17.0]], dtype=np.float32) - np.testing.assert_allclose(output_expected, res[0], rtol=1e-05, atol=1e-08) + np.testing.assert_allclose(res[0], output_expected, rtol=1e-05, atol=1e-08) def test_boolean_inputs(self): sess = onnxrt.InferenceSession(get_name("logicaland.onnx"), providers=available_providers) @@ -920,7 +920,7 @@ def test_boolean_inputs(self): output_expected = np.array([[True, False], [False, False]], dtype=bool) res = sess.run([output_name], {a_name: a, b_name: b}) - np.testing.assert_equal(output_expected, res[0]) + np.testing.assert_equal(res[0], output_expected) def test_string_input1(self): sess = onnxrt.InferenceSession(get_name("identity_string.onnx"), providers=available_providers_without_tvm) @@ -941,7 +941,7 @@ def test_string_input1(self): self.assertEqual(output_type, "tensor(string)") res = sess.run([output_name], {x_name: x}) - np.testing.assert_equal(x, res[0]) + np.testing.assert_equal(res[0], x) def test_string_input2(self): sess = onnxrt.InferenceSession(get_name("identity_string.onnx"), providers=available_providers_without_tvm) @@ -962,7 +962,7 @@ def test_string_input2(self): self.assertEqual(output_type, "tensor(string)") res = sess.run([output_name], {x_name: x}) - np.testing.assert_equal(x, res[0]) + np.testing.assert_equal(res[0], x) def test_input_bytes(self): sess = onnxrt.InferenceSession(get_name("identity_string.onnx"), providers=available_providers_without_tvm) @@ -983,7 +983,7 @@ def test_input_bytes(self): self.assertEqual(output_type, "tensor(string)") res = sess.run([output_name], {x_name: x}) - np.testing.assert_equal(x, res[0].astype("|S8")) + np.testing.assert_equal(res[0].astype("|S8"), x) def test_input_object(self): sess = onnxrt.InferenceSession(get_name("identity_string.onnx"), providers=available_providers_without_tvm) @@ -1004,7 +1004,7 @@ def test_input_object(self): self.assertEqual(output_type, "tensor(string)") res = sess.run([output_name], {x_name: x}) - np.testing.assert_equal(x, res[0]) + np.testing.assert_equal(res[0], x) def test_input_void(self): sess = onnxrt.InferenceSession(get_name("identity_string.onnx"), providers=available_providers_without_tvm) @@ -1029,7 +1029,7 @@ def test_input_void(self): res = 
sess.run([output_name], {x_name: x}) expr = np.array([["must", "have"], ["same", "size"]], dtype=object) - np.testing.assert_equal(expr, res[0]) + np.testing.assert_equal(res[0], expr) def test_raise_wrong_num_inputs(self): with self.assertRaises(ValueError) as context: @@ -1164,7 +1164,7 @@ def test_sequence_construct(self): }, ) - np.testing.assert_array_equal(output_expected, res[0]) + np.testing.assert_array_equal(res[0], output_expected) def test_sequence_insert(self): opt = onnxrt.SessionOptions() @@ -1194,7 +1194,7 @@ def test_sequence_insert(self): "input_seq": [], }, ) - np.testing.assert_array_equal(output_expected, res[0]) + np.testing.assert_array_equal(res[0], output_expected) def test_ort_execution_mode(self): opt = onnxrt.SessionOptions() @@ -1375,7 +1375,7 @@ def test_register_custom_ops_library(self): input_1 = np.zeros((3, 5)).astype(np.float32) res = sess1.run([output_name], {input_name_0: input_0, input_name_1: input_1}) output_expected = np.ones((3, 5)).astype(np.float32) - np.testing.assert_allclose(output_expected, res[0], rtol=1e-05, atol=1e-08) + np.testing.assert_allclose(res[0], output_expected, rtol=1e-05, atol=1e-08) # Create an alias of SessionOptions instance # We will use this alias to construct another InferenceSession @@ -1969,7 +1969,7 @@ def test_adater_export_read(self): self.assertTrue(value.is_tensor()) self.assertEqual(expected_val.element_type(), value.element_type()) self.assertEqual(expected_val.shape(), value.shape()) - np.testing.assert_allclose(expected_val.numpy(), value.numpy()) + np.testing.assert_allclose(value.numpy(), expected_val.numpy()) def test_run_with_adapter(self): model_path = get_name("lora/two_params_lora_model.onnx") diff --git a/onnxruntime/test/python/onnxruntime_test_python_autoep.py b/onnxruntime/test/python/onnxruntime_test_python_autoep.py index d66951bd66f3d..a24269a312e9b 100644 --- a/onnxruntime/test/python/onnxruntime_test_python_autoep.py +++ b/onnxruntime/test/python/onnxruntime_test_python_autoep.py @@ -66,7 +66,7 @@ def test_cuda_ep_register_and_inference(self): input_name = sess.get_inputs()[0].name res = sess.run([], {input_name: x}) output_expected = np.array([[1.0, 4.0], [9.0, 16.0], [25.0, 36.0]], dtype=np.float32) - np.testing.assert_allclose(output_expected, res[0], rtol=1e-05, atol=1e-08) + np.testing.assert_allclose(res[0], output_expected, rtol=1e-05, atol=1e-08) del sess # Delete session before unregistering library self.unregister_execution_provider_library(ep_name) @@ -98,7 +98,7 @@ def test_cuda_prefer_gpu_and_inference(self): input_name = sess.get_inputs()[0].name res = sess.run([], {input_name: x}) output_expected = np.array([[1.0, 4.0], [9.0, 16.0], [25.0, 36.0]], dtype=np.float32) - np.testing.assert_allclose(output_expected, res[0], rtol=1e-05, atol=1e-08) + np.testing.assert_allclose(res[0], output_expected, rtol=1e-05, atol=1e-08) del sess # Delete session before unregistering library self.unregister_execution_provider_library(ep_name) @@ -146,7 +146,7 @@ def my_delegate( input_name = sess.get_inputs()[0].name res = sess.run([], {input_name: x}) output_expected = np.array([[1.0, 4.0], [9.0, 16.0], [25.0, 36.0]], dtype=np.float32) - np.testing.assert_allclose(output_expected, res[0], rtol=1e-05, atol=1e-08) + np.testing.assert_allclose(res[0], output_expected, rtol=1e-05, atol=1e-08) del sess # Delete session before unregistering library self.unregister_execution_provider_library(ep_name) @@ -249,7 +249,7 @@ def test_example_plugin_ep_devices(self): input_name = sess.get_inputs()[0].name res = 
sess.run([], {input_name: x}) output_expected = np.array([[1.0, 4.0], [9.0, 16.0], [25.0, 36.0]], dtype=np.float32) - np.testing.assert_allclose(output_expected, res[0], rtol=1e-05, atol=1e-08) + np.testing.assert_allclose(res[0], output_expected, rtol=1e-05, atol=1e-08) del sess # Delete session before unregistering library self.unregister_execution_provider_library(ep_name) @@ -282,11 +282,11 @@ def test_example_plugin_ep_data_transfer(self): gpu_value = onnxrt.OrtValue.ortvalue_from_numpy(data, "gpu", 0, 0xBE57) # copy back to CPU cpu_data = gpu_value.numpy() - np.testing.assert_equal(data, cpu_data) + np.testing.assert_equal(cpu_data, data) gpu_value.update_inplace(data2) # update the fake GPU data cpu_data_2 = gpu_value.numpy() # copy back to CPU - np.testing.assert_equal(data2, cpu_data_2) + np.testing.assert_equal(cpu_data_2, data2) gpu_value = None # Delete OrtValue before unregistering library as the allocator will be destroyed. @@ -336,8 +336,8 @@ def test_copy_tensors(self): del b_device # Verify the contents - np.testing.assert_array_equal(a, a_cpu_copy.numpy()) - np.testing.assert_array_equal(b, b_cpu_copy.numpy()) + np.testing.assert_array_equal(a_cpu_copy.numpy(), a) + np.testing.assert_array_equal(b_cpu_copy.numpy(), b) self.unregister_execution_provider_library(ep_name) diff --git a/onnxruntime/test/python/onnxruntime_test_python_backend.py b/onnxruntime/test/python/onnxruntime_test_python_backend.py index 6ed7dfe59b1f6..416d9b6edecd1 100644 --- a/onnxruntime/test/python/onnxruntime_test_python_backend.py +++ b/onnxruntime/test/python/onnxruntime_test_python_backend.py @@ -19,7 +19,7 @@ def test_run_model(self): x = np.array([[1.0, 2.0], [3.0, 4.0], [5.0, 6.0]], dtype=np.float32) res = rep.run(x) output_expected = np.array([[1.0, 4.0], [9.0, 16.0], [25.0, 36.0]], dtype=np.float32) - np.testing.assert_allclose(output_expected, res[0], rtol=1e-05, atol=1e-08) + np.testing.assert_allclose(res[0], output_expected, rtol=1e-05, atol=1e-08) def test_allocation_plan_works_with_only_execute_path_to_fetches_option(self): """ diff --git a/onnxruntime/test/python/onnxruntime_test_python_backend_mlops.py b/onnxruntime/test/python/onnxruntime_test_python_backend_mlops.py index c245699e211d4..9e3c1acbc923b 100644 --- a/onnxruntime/test/python/onnxruntime_test_python_backend_mlops.py +++ b/onnxruntime/test/python/onnxruntime_test_python_backend_mlops.py @@ -23,8 +23,8 @@ def check_list_of_map_to_float(testcase, expected_rows, actual_rows): for i in range(num_rows): # use np.testing.assert_allclose so we can specify the tolerance np.testing.assert_allclose( - [expected_rows[i][key] for key in sorted_keys], [actual_rows[i][key] for key in sorted_keys], + [expected_rows[i][key] for key in sorted_keys], rtol=1e-05, atol=1e-07, ) @@ -37,7 +37,7 @@ def test_run_model_non_tensor(self): x = {0: 25.0, 1: 5.13, 2: 0.0, 3: 0.453, 4: 5.966} res = rep.run(x) output_expected = np.array([[49.752754]], dtype=np.float32) - np.testing.assert_allclose(output_expected, res[0], rtol=1e-05, atol=1e-08) + np.testing.assert_allclose(res[0], output_expected, rtol=1e-05, atol=1e-08) def test_run_model_proto(self): name = datasets.get_example("logreg_iris.onnx") @@ -47,7 +47,7 @@ def test_run_model_proto(self): x = np.array([[1.0, 2.0], [3.0, 4.0], [5.0, 6.0]], dtype=np.float32) res = rep.run(x) output_expected = np.array([0, 0, 0], dtype=np.float32) - np.testing.assert_allclose(output_expected, res[0], rtol=1e-05, atol=1e-08) + np.testing.assert_allclose(res[0], output_expected, rtol=1e-05, atol=1e-08) 
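Illustrative sketch (not part of the patch): the argument swaps throughout these Python test files follow from numpy's convention that np.testing.assert_allclose(actual, desired, rtol, atol) checks abs(actual - desired) <= atol + rtol * abs(desired), so the relative tolerance is scaled by the second ("desired") argument and the failure message labels each side correctly only when the reference output is passed there. A minimal demonstration of that asymmetry:

    import numpy as np

    expected = np.array([105.0])  # reference output; scales rtol when passed as "desired"
    actual = np.array([100.0])    # output under test

    # |100 - 105| = 5 <= atol + rtol * |105| = 5.04 -> passes with the reference second
    np.testing.assert_allclose(actual, expected, rtol=0.048, atol=0.0)

    # Swapped order: 5 <= rtol * |100| = 4.8 -> raises AssertionError
    try:
        np.testing.assert_allclose(expected, actual, rtol=0.048, atol=0.0)
    except AssertionError:
        pass  # tolerance is now scaled by the array under test, so the same data fails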
output_expected = [ {0: 0.950599730014801, 1: 0.027834169566631317, 2: 0.02156602405011654}, { @@ -72,7 +72,7 @@ def test_run_model_proto_api(self): outputs = ort_backend.run_model(model, inputs) output_expected = np.array([0, 0, 0], dtype=np.float32) - np.testing.assert_allclose(output_expected, outputs[0], rtol=1e-05, atol=1e-08) + np.testing.assert_allclose(outputs[0], output_expected, rtol=1e-05, atol=1e-08) output_expected = [ {0: 0.950599730014801, 1: 0.027834169566631317, 2: 0.02156602405011654}, { diff --git a/onnxruntime/test/python/onnxruntime_test_python_cudagraph.py b/onnxruntime/test/python/onnxruntime_test_python_cudagraph.py index 5ab2fe8939f6a..d6c1dd9cff3f3 100644 --- a/onnxruntime/test/python/onnxruntime_test_python_cudagraph.py +++ b/onnxruntime/test/python/onnxruntime_test_python_cudagraph.py @@ -63,18 +63,18 @@ class TestInferenceSessionWithCudaGraph(unittest.TestCase): def test_ort_value_update_in_place(self): x0 = np.array([[1.0, 2.0], [3.0, 4.0], [5.0, 6.0]], dtype=np.float32) ortvalue_cpu = onnxrt.OrtValue.ortvalue_from_numpy(x0) - np.testing.assert_allclose(x0, ortvalue_cpu.numpy()) + np.testing.assert_allclose(ortvalue_cpu.numpy(), x0) x1 = np.array([[10.0, 20.0], [30.0, 40.0], [50.0, 60.0]], dtype=np.float32) ortvalue_cpu.update_inplace(x1) - np.testing.assert_allclose(x1, ortvalue_cpu.numpy()) + np.testing.assert_allclose(ortvalue_cpu.numpy(), x1) if "CUDAExecutionProvider" in onnxrt.get_available_providers(): ortvalue_gpu = onnxrt.OrtValue.ortvalue_from_numpy(x0, "cuda", 0) - np.testing.assert_allclose(x0, ortvalue_gpu.numpy()) + np.testing.assert_allclose(ortvalue_gpu.numpy(), x0) ortvalue_gpu.update_inplace(x1) - np.testing.assert_allclose(x1, ortvalue_gpu.numpy()) + np.testing.assert_allclose(ortvalue_gpu.numpy(), x1) def test_select_ep_to_run_cuda_graph(self): if "TensorrtExecutionProvider" in onnxrt.get_available_providers(): @@ -105,11 +105,11 @@ def run_model_with_cuda_graph(self, providers): # One regular run for the necessary memory allocation and cuda graph capturing session.run_with_iobinding(io_binding, ro) expected_y = np.array([[5.0], [11.0], [17.0]] * INPUT_SIZE, dtype=np.float32) - np.testing.assert_allclose(expected_y, y_ortvalue.numpy(), rtol=1e-05, atol=1e-05) + np.testing.assert_allclose(y_ortvalue.numpy(), expected_y, rtol=1e-05, atol=1e-05) # After capturing, CUDA graph replay happens from this Run onwards session.run_with_iobinding(io_binding, ro) - np.testing.assert_allclose(expected_y, y_ortvalue.numpy(), rtol=1e-05, atol=1e-05) + np.testing.assert_allclose(y_ortvalue.numpy(), expected_y, rtol=1e-05, atol=1e-05) # Update input and then replay CUDA graph x_ortvalue.update_inplace( @@ -120,8 +120,8 @@ def run_model_with_cuda_graph(self, providers): ) session.run_with_iobinding(io_binding, ro) np.testing.assert_allclose( - np.array([[50.0], [110.0], [170.0]] * INPUT_SIZE, dtype=np.float32), y_ortvalue.numpy(), + np.array([[50.0], [110.0], [170.0]] * INPUT_SIZE, dtype=np.float32), rtol=1e-05, atol=1e-05, ) @@ -162,7 +162,7 @@ def run_model_with_cuda_graph_annotation(self, providers): session.run_with_iobinding(io_bindings[i], ro) io_bindings[i].synchronize_outputs() expected_y = np.array(expected_y_base[: i + 1][:] * INPUT_SIZE, dtype=np.float32) - np.testing.assert_allclose(expected_y, y_ortvalues[i].numpy(), rtol=1e-05, atol=1e-05) + np.testing.assert_allclose(y_ortvalues[i].numpy(), expected_y, rtol=1e-05, atol=1e-05) del ro ro = onnxrt.RunOptions() @@ -176,7 +176,7 @@ def run_model_with_cuda_graph_annotation(self, providers): 
session.run_with_iobinding(io_bindings[i], ro) io_bindings[i].synchronize_outputs() expected_y = np.array(expected_y_base_mul_10[: i + 1][:] * INPUT_SIZE, dtype=np.float32) - np.testing.assert_allclose(expected_y, y_ortvalues[i].numpy(), rtol=1e-05, atol=1e-05) + np.testing.assert_allclose(y_ortvalues[i].numpy(), expected_y, rtol=1e-05, atol=1e-05) def test_arena_with_cuda_graph(self): if "CUDAExecutionProvider" in onnxrt.get_available_providers(): @@ -214,7 +214,7 @@ def test_arena_with_cuda_graph(self): session.run_with_iobinding(io_binding) output = cuda_graph_helper.get_output("softmaxout_1") - np.testing.assert_allclose(expected_output, output, rtol=1e-02, atol=1e-02) + np.testing.assert_allclose(output, expected_output, rtol=1e-02, atol=1e-02) if __name__ == "__main__": diff --git a/onnxruntime/test/python/onnxruntime_test_python_dmlgraph.py b/onnxruntime/test/python/onnxruntime_test_python_dmlgraph.py index 033eae1cb4c8d..4a6aa7b63d9c3 100644 --- a/onnxruntime/test/python/onnxruntime_test_python_dmlgraph.py +++ b/onnxruntime/test/python/onnxruntime_test_python_dmlgraph.py @@ -63,18 +63,18 @@ class TestInferenceSessionWithDmlGraph(unittest.TestCase): def test_ort_value_update_in_place(self): x0 = np.array([[1.0, 2.0], [3.0, 4.0], [5.0, 6.0]], dtype=np.float32) ortvalue_cpu = onnxrt.OrtValue.ortvalue_from_numpy(x0) - np.testing.assert_allclose(x0, ortvalue_cpu.numpy()) + np.testing.assert_allclose(ortvalue_cpu.numpy(), x0) x1 = np.array([[10.0, 20.0], [30.0, 40.0], [50.0, 60.0]], dtype=np.float32) ortvalue_cpu.update_inplace(x1) - np.testing.assert_allclose(x1, ortvalue_cpu.numpy()) + np.testing.assert_allclose(ortvalue_cpu.numpy(), x1) if "DmlExecutionProvider" in onnxrt.get_available_providers(): ortvalue_gpu = onnxrt.OrtValue.ortvalue_from_numpy(x0, "dml", 0) - np.testing.assert_allclose(x0, ortvalue_gpu.numpy()) + np.testing.assert_allclose(ortvalue_gpu.numpy(), x0) ortvalue_gpu.update_inplace(x1) - np.testing.assert_allclose(x1, ortvalue_gpu.numpy()) + np.testing.assert_allclose(ortvalue_gpu.numpy(), x1) def test_select_ep_to_run_dml_graph(self): if "DmlExecutionProvider" in onnxrt.get_available_providers(): @@ -104,11 +104,11 @@ def run_model_with_dml_graph(self, providers): # One regular run for the necessary memory allocation and dml graph capturing session.run_with_iobinding(io_binding, ro) expected_y = np.array([[5.0], [11.0], [17.0]] * INPUT_SIZE, dtype=np.float32) - np.testing.assert_allclose(expected_y, y_ortvalue.numpy(), rtol=1e-05, atol=1e-05) + np.testing.assert_allclose(y_ortvalue.numpy(), expected_y, rtol=1e-05, atol=1e-05) # After capturing, DML graph replay happens from this Run onwards session.run_with_iobinding(io_binding, ro) - np.testing.assert_allclose(expected_y, y_ortvalue.numpy(), rtol=1e-05, atol=1e-05) + np.testing.assert_allclose(y_ortvalue.numpy(), expected_y, rtol=1e-05, atol=1e-05) # Update input and then replay DML graph x_ortvalue.update_inplace( @@ -119,8 +119,8 @@ def run_model_with_dml_graph(self, providers): ) session.run_with_iobinding(io_binding, ro) np.testing.assert_allclose( - np.array([[50.0], [110.0], [170.0]] * INPUT_SIZE, dtype=np.float32), y_ortvalue.numpy(), + np.array([[50.0], [110.0], [170.0]] * INPUT_SIZE, dtype=np.float32), rtol=1e-05, atol=1e-05, ) @@ -163,7 +163,7 @@ def run_model_with_dml_graph_annotation(self, providers): session.run_with_iobinding(io_bindings[i], ro) io_bindings[i].synchronize_outputs() expected_y = np.array(expected_y_base[: i + 1][:] * INPUT_SIZE, dtype=np.float32) - np.testing.assert_allclose(expected_y, 
y_ortvalues[i].numpy(), rtol=1e-05, atol=1e-05) + np.testing.assert_allclose(y_ortvalues[i].numpy(), expected_y, rtol=1e-05, atol=1e-05) del ro ro = onnxrt.RunOptions() @@ -177,7 +177,7 @@ def run_model_with_dml_graph_annotation(self, providers): session.run_with_iobinding(io_bindings[i], ro) io_bindings[i].synchronize_outputs() expected_y = np.array(expected_y_base_mul_10[: i + 1][:] * INPUT_SIZE, dtype=np.float32) - np.testing.assert_allclose(expected_y, y_ortvalues[i].numpy(), rtol=1e-05, atol=1e-05) + np.testing.assert_allclose(y_ortvalues[i].numpy(), expected_y, rtol=1e-05, atol=1e-05) if __name__ == "__main__": diff --git a/onnxruntime/test/python/onnxruntime_test_python_mlops.py b/onnxruntime/test/python/onnxruntime_test_python_mlops.py index 8b6b029c57752..70b8c0fc0b980 100644 --- a/onnxruntime/test/python/onnxruntime_test_python_mlops.py +++ b/onnxruntime/test/python/onnxruntime_test_python_mlops.py @@ -80,7 +80,7 @@ def test_dict_vectorizer(self): x = {0: 25.0, 1: 5.13, 2: 0.0, 3: 0.453, 4: 5.966} res = sess.run([output_name], {input_name: x}) output_expected = np.array([[49.752754]], dtype=np.float32) - np.testing.assert_allclose(output_expected, res[0], rtol=1e-05, atol=1e-08) + np.testing.assert_allclose(res[0], output_expected, rtol=1e-05, atol=1e-08) xwrong = x.copy() xwrong["a"] = 5.6 @@ -96,17 +96,17 @@ def test_dict_vectorizer(self): x = {np.int64(k): np.float32(v) for k, v in x.items()} res = sess.run([output_name], {input_name: x}) output_expected = np.array([[49.752754]], dtype=np.float32) - np.testing.assert_allclose(output_expected, res[0], rtol=1e-05, atol=1e-08) + np.testing.assert_allclose(res[0], output_expected, rtol=1e-05, atol=1e-08) x = {np.int64(k): np.float64(v) for k, v in x.items()} res = sess.run([output_name], {input_name: x}) output_expected = np.array([[49.752754]], dtype=np.float32) - np.testing.assert_allclose(output_expected, res[0], rtol=1e-05, atol=1e-08) + np.testing.assert_allclose(res[0], output_expected, rtol=1e-05, atol=1e-08) x = {np.int32(k): np.float64(v) for k, v in x.items()} res = sess.run([output_name], {input_name: x}) output_expected = np.array([[49.752754]], dtype=np.float32) - np.testing.assert_allclose(output_expected, res[0], rtol=1e-05, atol=1e-08) + np.testing.assert_allclose(res[0], output_expected, rtol=1e-05, atol=1e-08) def test_label_encoder(self): sess = onnxrt.InferenceSession(get_name("LabelEncoder.onnx"), providers=onnxrt.get_available_providers()) @@ -127,18 +127,18 @@ def test_label_encoder(self): x = np.array([["4"]]) res = sess.run([output_name], {input_name: x}) output_expected = np.array([[3]], dtype=np.int64) - np.testing.assert_allclose(output_expected, res[0], rtol=1e-05, atol=1e-08) + np.testing.assert_allclose(res[0], output_expected, rtol=1e-05, atol=1e-08) # Python type x = np.array(["4"], ndmin=2) res = sess.run([output_name], {input_name: x}) output_expected = np.array([3], ndmin=2, dtype=np.int64) - np.testing.assert_allclose(output_expected, res[0], rtol=1e-05, atol=1e-08) + np.testing.assert_allclose(res[0], output_expected, rtol=1e-05, atol=1e-08) x = np.array(["4"], ndmin=2, dtype=object) res = sess.run([output_name], {input_name: x}) output_expected = np.array([3], ndmin=2, dtype=np.int64) - np.testing.assert_allclose(output_expected, res[0], rtol=1e-05, atol=1e-08) + np.testing.assert_allclose(res[0], output_expected, rtol=1e-05, atol=1e-08) def test_run_model_mlnet(self): available_providers = onnxrt.get_available_providers() diff --git 
a/onnxruntime/test/python/onnxruntime_test_python_nv_tensorrt_rtx_ep_tests.py b/onnxruntime/test/python/onnxruntime_test_python_nv_tensorrt_rtx_ep_tests.py index d5c80a4a1f4ba..034f0288e2508 100644 --- a/onnxruntime/test/python/onnxruntime_test_python_nv_tensorrt_rtx_ep_tests.py +++ b/onnxruntime/test/python/onnxruntime_test_python_nv_tensorrt_rtx_ep_tests.py @@ -99,7 +99,7 @@ def test_nv_tensorrt_rtx_ep_register_and_inference(self): input_name = sess.get_inputs()[0].name res = sess.run([], {input_name: x}) output_expected = np.array([[1.0, 4.0], [9.0, 16.0], [25.0, 36.0]], dtype=np.float32) - np.testing.assert_allclose(output_expected, res[0], rtol=1e-05, atol=1e-08) + np.testing.assert_allclose(res[0], output_expected, rtol=1e-05, atol=1e-08) def test_nv_tensorrt_rtx_ep_prefer_gpu_and_inference(self): """ @@ -117,7 +117,7 @@ def test_nv_tensorrt_rtx_ep_prefer_gpu_and_inference(self): input_name = sess.get_inputs()[0].name res = sess.run([], {input_name: x}) output_expected = np.array([[1.0, 4.0], [9.0, 16.0], [25.0, 36.0]], dtype=np.float32) - np.testing.assert_allclose(output_expected, res[0], rtol=1e-05, atol=1e-08) + np.testing.assert_allclose(res[0], output_expected, rtol=1e-05, atol=1e-08) def test_nv_tensorrt_rtx_ep_selection_delegate_and_inference(self): """ @@ -152,7 +152,7 @@ def my_delegate( input_name = sess.get_inputs()[0].name res = sess.run([], {input_name: x}) output_expected = np.array([[1.0, 4.0], [9.0, 16.0], [25.0, 36.0]], dtype=np.float32) - np.testing.assert_allclose(output_expected, res[0], rtol=1e-05, atol=1e-08) + np.testing.assert_allclose(res[0], output_expected, rtol=1e-05, atol=1e-08) def test_bind_input_only(self): """ diff --git a/onnxruntime/test/python/quantization/test_fusions.py b/onnxruntime/test/python/quantization/test_fusions.py index bea110e566fb9..f02f4da4eb0fb 100644 --- a/onnxruntime/test/python/quantization/test_fusions.py +++ b/onnxruntime/test/python/quantization/test_fusions.py @@ -34,8 +34,8 @@ def check_fused_model_correctness(self, orig_model, fused_model, inputs, rtol=1e for idx, expected_output in enumerate(orig_results): actual_output = fused_results[idx] np.testing.assert_allclose( - expected_output, actual_output, + expected_output, rtol=rtol, atol=atol, err_msg=f"Fused model output {idx} differs", diff --git a/onnxruntime/test/python/quantization/test_qdq_loss_debug.py b/onnxruntime/test/python/quantization/test_qdq_loss_debug.py index 5d70641547eae..20b40fc157c16 100644 --- a/onnxruntime/test/python/quantization/test_qdq_loss_debug.py +++ b/onnxruntime/test/python/quantization/test_qdq_loss_debug.py @@ -156,7 +156,7 @@ def test_saved_tensors_match_internal_tensors(self): for expected, actual in zip(model_outputs, test_outputs, strict=False): exp = expected.reshape(-1) act = actual.reshape(-1) - np.testing.assert_equal(exp, act) + np.testing.assert_equal(act, exp) def test_create_activation_matching_present(self): float_model_path = str(Path(self._tmp_model_dir.name) / "float_model2.onnx") diff --git a/onnxruntime/test/python/quantization/test_quantizeblockwise_bnb4.py b/onnxruntime/test/python/quantization/test_quantizeblockwise_bnb4.py index a8f7591186766..906bf7aab8698 100644 --- a/onnxruntime/test/python/quantization/test_quantizeblockwise_bnb4.py +++ b/onnxruntime/test/python/quantization/test_quantizeblockwise_bnb4.py @@ -131,8 +131,8 @@ def test_quantize_blockwise_bnb4(self): matrix_float = np.random.uniform(-1, 1, (k, n)).astype(type) quant_value_ref, absmax_ref = quantize_blockwise_bnb4_ref(matrix_float, block_size, 
quant_type) quant_value, absmax = quantize_blockwise_bnb4_target(matrix_float, block_size, quant_type) - np.testing.assert_allclose(quant_value_ref, quant_value) - np.testing.assert_allclose(absmax_ref, absmax) + np.testing.assert_allclose(quant_value, quant_value_ref) + np.testing.assert_allclose(absmax, absmax_ref) if __name__ == "__main__": diff --git a/onnxruntime/test/testdata/custom_op_local_function/custom_op_test_local_function.py b/onnxruntime/test/testdata/custom_op_local_function/custom_op_test_local_function.py index 7916d93c3e531..1dedc475c9962 100644 --- a/onnxruntime/test/testdata/custom_op_local_function/custom_op_test_local_function.py +++ b/onnxruntime/test/testdata/custom_op_local_function/custom_op_test_local_function.py @@ -40,7 +40,7 @@ def test_basic_all(self): x = np.arange(2**2).reshape((2,) * 2).astype(np.float32) t = np.arange(8).reshape((2, 4)).astype(np.float32) got = sess.run(None, {"X": x})[0] - np.testing.assert_allclose(t, got, atol=1e-5) + np.testing.assert_allclose(got, t, atol=1e-5) if __name__ == "__main__": diff --git a/onnxruntime/test/testdata/onnx_backend_test_series_filters.jsonc b/onnxruntime/test/testdata/onnx_backend_test_series_filters.jsonc index f5f6a3ae3bc39..0558d008a2275 100644 --- a/onnxruntime/test/testdata/onnx_backend_test_series_filters.jsonc +++ b/onnxruntime/test/testdata/onnx_backend_test_series_filters.jsonc @@ -31,23 +31,12 @@ "current_failing_tests": [ "^test_adagrad", "^test_adagrad_multiple", - "^test_attention_4d_diff_heads_mask4d_padded_kv*", // pending onnx update - "^test_attention_3d_gqa*", // pending onnx update - "^test_attention_3d_gqa_causal", // pending onnx update - "^test_attention_3d_gqa_scaled", // pending onnx update - "^test_attention_3d_gqa_softcap", // pending onnx update - "^test_attention_3d_gqa_with_past_and_present", // pending onnx update - "^test_attention_4d_gqa*", // pending onnx update - "^test_attention_4d_gqa_causal", // pending onnx update - "^test_attention_4d_gqa_scaled", // pending onnx update - "^test_attention_4d_gqa_softcap", // pending onnx update - "^test_attention_4d_gqa_with_past_and_present", // pending onnx update - "^test_attention_*causal*", // pending onnx update - "^test_attention_4d_with_past_and_present_qk_matmul_bias_3d_mask_causal*", // pending onnx update - "^test_attention_4d_with_past_and_present_qk_matmul_bias_4d_mask_causal*", // pending onnx update - "^test_attention_4d_attn_mask_3d_causal_expanded*", // pending onnx update "^test_attention_4d_fp16*", // precision issue: 1 / 192 mismatched elements "^test_attention_4d_fp16_expanded*", // precision issue: 3 / 192 mismatched elements + "^test_attention_4d_gqa_with_past_and_present_fp16_expanded*", // webgpu mismatched elements 38 / 576 + "^test_attention_4d_with_past_and_present_qk_matmul_bias_3d_mask_causal_expanded*", // webgpu + "^test_attention_4d_attn_mask_3d_causal_expanded*", // webgpu + "^test_attention_4d_diff_heads_mask4d_padded_kv*", // Need nonpad_kv_seqlen "^test_l2normalization*", // LpNormalization(22) not implemented "^test_l1normalization*", // LpNormalization(22) not implemented "^test_lpnormalization*", // LpNormalization(22) not implemented @@ -123,13 +112,9 @@ "^test_if_opt", "^test_loop16_seq_none", "^test_identity_opt", - // rotary dim should be fixed in onnx==1.19.1 - "^test_rotary_embedding_no_position_ids_rotary_dim", - "^test_rotary_embedding_with_interleaved_rotary_dim", - "^test_rotary_embedding_with_rotary_dim", - "^test_rotary_embedding_3d_input_expanded", - 
"^test_rotary_embedding_interleaved_expanded", - "^test_rotary_embedding_no_position_ids_interleaved_expanded", + "^test_rotary_embedding_3d_input_expanded", // win cuda fail + "^test_rotary_embedding_interleaved_expanded", // win cuda fail + "^test_rotary_embedding_no_position_ids_interleaved_expanded", // win cuda fail "^test_rotary_embedding_expanded", //webgpu "^test_rotary_embedding_no_position_ids_expanded", //webgpu // Following tests are for opset 16 ops and are not yet implemented in ORT diff --git a/onnxruntime/test/testdata/ort_github_issue_26272.py b/onnxruntime/test/testdata/ort_github_issue_26272.py new file mode 100644 index 0000000000000..fa381e5df1094 --- /dev/null +++ b/onnxruntime/test/testdata/ort_github_issue_26272.py @@ -0,0 +1,26 @@ +import onnx +from onnx import TensorProto, helper + +# Create a simple ONNX model with DDS output +input = helper.make_tensor_value_info("data", TensorProto.FLOAT, ["d1", "d2"]) +output = helper.make_tensor_value_info("output", TensorProto.FLOAT, ["nzr"]) + +nonzeros_node = helper.make_node("NonZero", ["data"], ["nonzeros"], "nonzeros_node") +transpose_node = helper.make_node("Transpose", ["nonzeros"], ["nonzeros_t"], "transpose_node") +gathernd_node = helper.make_node("GatherND", ["data", "nonzeros_t"], ["output"], "gathernd_node") + +value_info = [ + helper.make_tensor_value_info("nonzeros", TensorProto.INT64, [2, "nzr"]), + helper.make_tensor_value_info("nonzeros_t", TensorProto.INT64, ["nzr", 2]), +] + +graph = helper.make_graph( + [nonzeros_node, transpose_node, gathernd_node], + "test_graph", + [input], + [output], + value_info=value_info, +) + +model = helper.make_model(graph) +onnx.save(model, "ort_github_issue_26272_dds.onnx") diff --git a/onnxruntime/test/testdata/ort_github_issue_26272_dds.onnx b/onnxruntime/test/testdata/ort_github_issue_26272_dds.onnx new file mode 100644 index 0000000000000..371f99c537898 --- /dev/null +++ b/onnxruntime/test/testdata/ort_github_issue_26272_dds.onnx @@ -0,0 +1,28 @@ + +:“ +( +datanonzeros nonzeros_node"NonZero +1 +nonzeros +nonzeros_ttranspose_node" Transpose +3 +data + +nonzeros_toutput gathernd_node"GatherND +test_graphZ +data + +d1 +d2b +output +  +nzrj +nonzeros + + +nzrj + +nonzeros_t + +nzr +B \ No newline at end of file diff --git a/tools/ci_build/build.py b/tools/ci_build/build.py index 327caf83c7850..591be538ac873 100644 --- a/tools/ci_build/build.py +++ b/tools/ci_build/build.py @@ -1015,6 +1015,9 @@ def generate_build_tree( if path_to_protoc_exe: cmake_args += [f"-DONNX_CUSTOM_PROTOC_EXECUTABLE={path_to_protoc_exe}"] + if args.cmake_deps_mirror_dir: + cmake_args += [f"-Donnxruntime_CMAKE_DEPS_MIRROR_DIR={args.cmake_deps_mirror_dir}"] + if args.fuzz_testing: if not ( args.build_shared_lib @@ -1330,7 +1333,7 @@ def build_targets(args, cmake_path, build_dir, configs, num_parallel_jobs, targe cmd_args.extend(["--target", *targets]) build_tool_args = [] - if num_parallel_jobs != 1: + if num_parallel_jobs != 0: if is_windows() and args.cmake_generator != "Ninja" and not args.build_wasm: # https://github.com/Microsoft/checkedc-clang/wiki/Parallel-builds-of-clang-on-Windows suggests # not maxing out CL_MPCount @@ -1748,7 +1751,7 @@ def run_onnxruntime_tests(args, source_dir, ctest_path, build_dir, configs): # Install cpu only version of torch when cuda is not enabled in Linux. 
extra = [] if args.use_cuda and is_linux() else ["--index-url", "https://download.pytorch.org/whl/cpu"] run_subprocess( - [sys.executable, "-m", "pip", "install", "torch", *extra], + [sys.executable, "-m", "pip", "install", "torch==2.8.0", "torchvision==0.23.0", *extra], cwd=cwd, dll_path=dll_path, python_path=python_path, diff --git a/tools/ci_build/build_args.py b/tools/ci_build/build_args.py index c5454903474d1..05d5052067b2e 100644 --- a/tools/ci_build/build_args.py +++ b/tools/ci_build/build_args.py @@ -204,6 +204,7 @@ def add_testing_args(parser: argparse.ArgumentParser) -> None: help="Run onnx_test_runner against test data. Only used in ONNX Runtime's CI pipelines", ) parser.add_argument("--path_to_protoc_exe", help="Path to protoc executable.") + parser.add_argument("--cmake_deps_mirror_dir", help="Path to the local mirror of cmake dependencies.") parser.add_argument("--fuzz_testing", action="store_true", help="Enable Fuzz testing.") parser.add_argument( "--enable_symbolic_shape_infer_tests", diff --git a/tools/ci_build/github/azure-pipelines/build-perf-test-binaries-pipeline.yml b/tools/ci_build/github/azure-pipelines/build-perf-test-binaries-pipeline.yml index 53b62762319ba..e54216fe4ef4e 100644 --- a/tools/ci_build/github/azure-pipelines/build-perf-test-binaries-pipeline.yml +++ b/tools/ci_build/github/azure-pipelines/build-perf-test-binaries-pipeline.yml @@ -31,5 +31,5 @@ stages: machine_pool: 'onnxruntime-Ubuntu2404-AMD-CPU' extra_build_arg: '' cmake_build_type: Release - cuda_version: 12.2 + cuda_version: 12.8 docker_base_image: onnxruntimebuildcache.azurecr.io/internal/azureml/onnxruntime/build/cuda12_x64_almalinux8_gcc12:20250714.2 \ No newline at end of file diff --git a/tools/ci_build/github/azure-pipelines/c-api-noopenmp-packaging-pipelines.yml b/tools/ci_build/github/azure-pipelines/c-api-noopenmp-packaging-pipelines.yml index 91736752e22d4..086d65c93062b 100644 --- a/tools/ci_build/github/azure-pipelines/c-api-noopenmp-packaging-pipelines.yml +++ b/tools/ci_build/github/azure-pipelines/c-api-noopenmp-packaging-pipelines.yml @@ -73,12 +73,12 @@ variables: - name: ReleaseVersionSuffix value: '' - name: win_trt_version - value: 12.2 + value: 12.8 - name: win_trt_home value: $(Agent.TempDirectory)\${{ variables.win_trt_folder_cuda12 }} - name: win_cuda_home - value: $(Agent.TempDirectory)\v12.2 + value: $(Agent.TempDirectory)\v12.8 extends: # The pipeline extends the 1ES PT which will inject different SDL and compliance tasks. # For non-production pipelines, use "Unofficial" as defined below. 
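Illustrative sketch (not part of the patch): the new --cmake_deps_mirror_dir option registered in build_args.py above is simply threaded through build.py into a CMake cache define, as the build.py hunk shows. Roughly, the pattern is the following, where the mirror path is a made-up example:

    import argparse

    parser = argparse.ArgumentParser()
    parser.add_argument("--cmake_deps_mirror_dir", help="Path to the local mirror of cmake dependencies.")
    args = parser.parse_args(["--cmake_deps_mirror_dir", "/data/deps_mirror"])  # hypothetical path

    cmake_args = []
    if args.cmake_deps_mirror_dir:
        cmake_args += [f"-Donnxruntime_CMAKE_DEPS_MIRROR_DIR={args.cmake_deps_mirror_dir}"]
    print(cmake_args)  # ['-Donnxruntime_CMAKE_DEPS_MIRROR_DIR=/data/deps_mirror']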
@@ -142,7 +142,7 @@ extends:
       - template: stages/nuget-combine-cuda-stage.yml
         parameters:
-          CudaVersion: 12.2
+          CudaVersion: 12.8
           RunOnnxRuntimeTests: ${{ parameters.RunOnnxRuntimeTests }}
           UseIncreasedTimeoutForTests: ${{ parameters.UseIncreasedTimeoutForTests }}
           win_trt_home: ${{ variables.win_trt_home }}
diff --git a/tools/ci_build/github/azure-pipelines/c-api-noopenmp-test-pipelines.yml b/tools/ci_build/github/azure-pipelines/c-api-noopenmp-test-pipelines.yml
index 46363c07b3e3e..7e107c33ed8c0 100644
--- a/tools/ci_build/github/azure-pipelines/c-api-noopenmp-test-pipelines.yml
+++ b/tools/ci_build/github/azure-pipelines/c-api-noopenmp-test-pipelines.yml
@@ -127,7 +127,7 @@ stages:
       NugetPackageName: 'Microsoft.ML.OnnxRuntime.Gpu'
       ArtifactSuffix: 'GPU'
       StageSuffix: 'GPU'
-      CudaVersion: 12.2
+      CudaVersion: 12.8
   - template: nuget/templates/test_win.yml
     parameters:
@@ -136,7 +136,7 @@ stages:
      ArtifactSuffix: 'GPU'
      StageSuffix: 'GPU'
      MoreSuffix: '_Windows'
-     CudaVersion: 12.2
+     CudaVersion: 12.8
   - template: nuget/templates/test_linux.yml
     parameters:
@@ -144,7 +144,7 @@ stages:
      ArtifactSuffix: 'GPU'
      StageSuffix: 'GPU'
      NugetPackageName: 'Microsoft.ML.OnnxRuntime.Gpu'
-     CudaVersion: 12.2
+     CudaVersion: 12.8
   - template: nuget/templates/test_linux.yml
     parameters:
@@ -153,7 +153,7 @@ stages:
      StageSuffix: 'GPU'
      MoreSuffix: '_Linux'
      NugetPackageName: 'Microsoft.ML.OnnxRuntime.Gpu.Linux'
-     CudaVersion: 12.2
+     CudaVersion: 12.8
@@ -202,7 +202,7 @@ stages:
     - template: templates/jobs/download_win_gpu_library.yml
       parameters:
-        CudaVersion: 12.2
+        CudaVersion: 12.8
         DownloadCUDA: true
         DownloadTRT: true
@@ -257,7 +257,7 @@ stages:
     - template: templates/jobs/download_win_gpu_library.yml
       parameters:
-        CudaVersion: 12.2
+        CudaVersion: 12.8
         DownloadCUDA: true
         DownloadTRT: true
diff --git a/tools/ci_build/github/azure-pipelines/cuda-packaging-pipeline.yml b/tools/ci_build/github/azure-pipelines/cuda-packaging-pipeline.yml
index 5535d7b4f264d..d7fc0efbf45ea 100644
--- a/tools/ci_build/github/azure-pipelines/cuda-packaging-pipeline.yml
+++ b/tools/ci_build/github/azure-pipelines/cuda-packaging-pipeline.yml
@@ -48,9 +48,9 @@ parameters:
 - name: CudaVersion
   displayName: CUDA version
   type: string
-  default: '12.2'
+  default: '12.8'
   values:
-    - 12.2
+    - 12.8
 variables:
   - template: templates/common-variables.yml
@@ -59,13 +59,13 @@ variables:
   - name: win_trt_home
     ${{ if eq(parameters.CudaVersion, '11.8') }}:
       value: $(Agent.TempDirectory)\${{ variables.win_trt_folder_cuda11 }}
-    ${{ if eq(parameters.CudaVersion, '12.2') }}:
+    ${{ if eq(parameters.CudaVersion, '12.8') }}:
       value: $(Agent.TempDirectory)\${{ variables.win_trt_folder_cuda12 }}
   - name: win_cuda_home
     ${{ if eq(parameters.CudaVersion, '11.8') }}:
       value: $(Agent.TempDirectory)\v11.8
-    ${{ if eq(parameters.CudaVersion, '12.2') }}:
-      value: $(Agent.TempDirectory)\v12.2
+    ${{ if eq(parameters.CudaVersion, '12.8') }}:
+      value: $(Agent.TempDirectory)\v12.8
 resources:
   repositories:
diff --git a/tools/ci_build/github/azure-pipelines/custom-nuget-packaging-pipeline.yml b/tools/ci_build/github/azure-pipelines/custom-nuget-packaging-pipeline.yml
index 1ad6f411d9848..5ce6ec278b1e7 100644
--- a/tools/ci_build/github/azure-pipelines/custom-nuget-packaging-pipeline.yml
+++ b/tools/ci_build/github/azure-pipelines/custom-nuget-packaging-pipeline.yml
@@ -1,7 +1,7 @@
 parameters:
 - name: CudaVersion
   type: string
-  default: '12.2'
+  default: '12.8'
 - name: QnnSdk
   displayName: QNN SDK Version
@@ -40,8 +40,8 @@ variables:
   - name: win_cuda_home
     ${{ if eq(parameters.CudaVersion, '11.8') }}:
       value: $(Agent.TempDirectory)\v11.8
-    ${{ if eq(parameters.CudaVersion, '12.2') }}:
-      value: $(Agent.TempDirectory)\v12.2
+    ${{ if eq(parameters.CudaVersion, '12.8') }}:
+      value: $(Agent.TempDirectory)\v12.8
 resources:
   repositories:
@@ -178,9 +178,6 @@ extends:
             inputs:
               targetType: 'inline'
               script: |
-                mkdir -p $(Build.BinariesDirectory)/osx-x64
-                Move-Item -Path $(Build.BinariesDirectory)/osx/onnxruntime-osx-x86_64* -Destination $(Build.BinariesDirectory)/osx-x64
-
                 mkdir -p $(Build.BinariesDirectory)/osx-arm64
                 Move-Item -Path $(Build.BinariesDirectory)/osx/onnxruntime-osx-arm64* -Destination $(Build.BinariesDirectory)/osx-arm64
@@ -200,12 +197,6 @@ extends:
                 foreach ($dir in $dirs) {
                   Write-Host "Directory: $($dir.FullName)"
                 }
-                $osx_x64_archive = Get-ChildItem -Path $(Build.BinariesDirectory)/osx-x64 -Filter onnxruntime-osx-x86_64*
-                if ($osx_x64_archive.Count -eq 0) {
-                  Write-Host "No osx-x64 archive found."
-                } else {
-                  Write-Host "osx-x64 archive found: $($osx_x64_archive[0].FullName)"
-                }
                 $osx_arm64_archive = Get-ChildItem -Path $(Build.BinariesDirectory)/osx-arm64 -Filter onnxruntime-osx-arm64*
                 if ($osx_arm64_archive.Count -eq 0) {
                   Write-Host "No osx-arm64 archive found."
@@ -233,13 +224,10 @@ extends:
               script: |
                 Expand-Archive -Path $(Build.BinariesDirectory)/win-x64/onnxruntime-win-x64-cuda*.zip -DestinationPath $(Build.BinariesDirectory)/win-x64
                 Expand-Archive -Path $(Build.BinariesDirectory)/win-arm64/onnxruntime-win-arm64x-qnn*.zip -DestinationPath $(Build.BinariesDirectory)/win-arm64
-                $osx_x64_archive = (Get-ChildItem -Path $(Build.BinariesDirectory)/osx-x64 -Filter onnxruntime-osx-x86_64*)[0].FullName
                 $osx_arm64_archive = (Get-ChildItem -Path $(Build.BinariesDirectory)/osx-arm64 -Filter onnxruntime-osx-arm64*)[0].FullName
-                tar -xzf $osx_x64_archive -C $(Build.BinariesDirectory)/osx-x64 2>$null
                 tar -xzf $osx_arm64_archive -C $(Build.BinariesDirectory)/osx-arm64 2>$null
                 $win_x64 = (Get-ChildItem -Path $(Build.BinariesDirectory)/win-x64 -Filter onnxruntime-win-x64-cuda*)[0].FullName
                 $win_arm64 = (Get-ChildItem -Path $(Build.BinariesDirectory)/win-arm64 -Filter onnxruntime-win-arm64x-qnn*)[0].FullName
-                $osx_x64 = (Get-ChildItem -Path $(Build.BinariesDirectory)/osx-x64 -Filter onnxruntime-osx-x86_64*)[0].FullName
                 $osx_arm64 = (Get-ChildItem -Path $(Build.BinariesDirectory)/osx-arm64 -Filter onnxruntime-osx-arm64*)[0].FullName
                 Write-Host "##vso[task.setvariable variable=win_x64;]$win_x64"
                 Write-Host "##vso[task.setvariable variable=win_arm64;]$win_arm64"
diff --git a/tools/ci_build/github/azure-pipelines/jar_package_testing.yml b/tools/ci_build/github/azure-pipelines/jar_package_testing.yml
index d387c07d6dc6e..463c02203e21a 100644
--- a/tools/ci_build/github/azure-pipelines/jar_package_testing.yml
+++ b/tools/ci_build/github/azure-pipelines/jar_package_testing.yml
@@ -40,7 +40,7 @@ stages:
     - template: templates/jobs/download_win_gpu_library.yml
       parameters:
-        CudaVersion: 12.2
+        CudaVersion: 12.8
         DownloadCUDA: true
         DownloadTRT: true
@@ -105,7 +105,7 @@ stages:
     - name: runCodesignValidationInjection
       value: false
     - name: docker_base_image
-      value: onnxruntimebuildcache.azurecr.io/internal/azureml/onnxruntime/build/cuda12_x64_almalinux8_gcc12:20250724.1
+      value: onnxruntimebuildcache.azurecr.io/internal/azureml/onnxruntime/build/cuda12_x64_almalinux8_gcc12:20251008.2
     timeoutInMinutes: 60
     steps:
     - checkout: self
diff --git a/tools/ci_build/github/azure-pipelines/linux-gpu-tensorrt-cuda-minimal-ci-pipeline.yml b/tools/ci_build/github/azure-pipelines/linux-gpu-tensorrt-cuda-minimal-ci-pipeline.yml
index 0410001d77d13..5e6671e3797ce 100644
--- a/tools/ci_build/github/azure-pipelines/linux-gpu-tensorrt-cuda-minimal-ci-pipeline.yml
+++ b/tools/ci_build/github/azure-pipelines/linux-gpu-tensorrt-cuda-minimal-ci-pipeline.yml
@@ -31,21 +31,21 @@ parameters:
 - name: CudaVersion
   displayName: CUDA version
   type: string
-  default: '12.2'
+  default: '12.8'
   values:
-    - 12.2
+    - 12.8
 variables:
   - template: templates/common-variables.yml
   - name: docker_base_image
     ${{ if eq(parameters.CudaVersion, '11.8') }}:
-      value: onnxruntimebuildcache.azurecr.io/internal/azureml/onnxruntime/build/cuda11_x64_almalinux8_gcc11:20250724.1
-    ${{ if eq(parameters.CudaVersion, '12.2') }}:
-      value: onnxruntimebuildcache.azurecr.io/internal/azureml/onnxruntime/build/cuda12_x64_almalinux8_gcc12:20250724.1
+      value: onnxruntimebuildcache.azurecr.io/internal/azureml/onnxruntime/build/cuda11_x64_almalinux8_gcc11:20251008.2
+    ${{ if eq(parameters.CudaVersion, '12.8') }}:
+      value: onnxruntimebuildcache.azurecr.io/internal/azureml/onnxruntime/build/cuda12_x64_almalinux8_gcc12:20251008.2
   - name: linux_trt_version
     ${{ if eq(parameters.CudaVersion, '11.8') }}:
       value: ${{ variables.linux_trt_version_cuda11 }}
-    ${{ if eq(parameters.CudaVersion, '12.2') }}:
+    ${{ if eq(parameters.CudaVersion, '12.8') }}:
       value: ${{ variables.linux_trt_version_cuda12 }}
 jobs:
diff --git a/tools/ci_build/github/azure-pipelines/nuget/templates/test_linux.yml b/tools/ci_build/github/azure-pipelines/nuget/templates/test_linux.yml
index 89ce3f3c86727..b60ef7576184e 100644
--- a/tools/ci_build/github/azure-pipelines/nuget/templates/test_linux.yml
+++ b/tools/ci_build/github/azure-pipelines/nuget/templates/test_linux.yml
@@ -58,9 +58,9 @@ stages:
         parameters:
           Dockerfile: tools/ci_build/github/linux/docker/Dockerfile.package_ubuntu_2004_gpu
           Context: tools/ci_build/github/linux/docker/
-          ${{ if eq(parameters.CudaVersion, '12.2') }}:
+          ${{ if eq(parameters.CudaVersion, '12.8') }}:
            DockerBuildArgs: "
-             --build-arg BASEIMAGE=nvidia/cuda:12.2.2-devel-ubuntu20.04
+             --build-arg BASEIMAGE=nvidia/cuda:12.8.1-cudnn-devel-ubuntu20.04
              --build-arg TRT_VERSION=${{ replace(variables.linux_trt_version_cuda12, '-1.', '-1+') }}
              --build-arg BUILD_UID=$( id -u )
              "
@@ -107,4 +107,4 @@ stages:
           DisableContribOps: $(DisableContribOps)
           DisableMlOps: $(DisableMlOps)
           IsReleaseBuild: $(IsReleaseBuild)
-          PACKAGENAME: ${{ parameters.NugetPackageName }}
\ No newline at end of file
+          PACKAGENAME: ${{ parameters.NugetPackageName }}
diff --git a/tools/ci_build/github/azure-pipelines/post-merge-jobs.yml b/tools/ci_build/github/azure-pipelines/post-merge-jobs.yml
index deb8b84bf19b8..fdfafd4d9a179 100644
--- a/tools/ci_build/github/azure-pipelines/post-merge-jobs.yml
+++ b/tools/ci_build/github/azure-pipelines/post-merge-jobs.yml
@@ -2,16 +2,16 @@ parameters:
 - name: CudaVersion
   displayName: CUDA version
   type: string
-  default: '12.2'
+  default: '12.8'
   values:
-    - 12.2
+    - 12.8
 variables:
   - template: templates/common-variables.yml
   - name: win_trt_folder
     ${{ if eq(parameters.CudaVersion, '11.8') }}:
       value: ${{ variables.win_trt_folder_cuda11 }}
-    ${{ if eq(parameters.CudaVersion, '12.2') }}:
+    ${{ if eq(parameters.CudaVersion, '12.8') }}:
       value: ${{ variables.win_trt_folder_cuda12 }}
 stages:
diff --git a/tools/ci_build/github/azure-pipelines/py-cuda-package-test-pipeline.yml b/tools/ci_build/github/azure-pipelines/py-cuda-package-test-pipeline.yml
index c2c89686a077e..02b6a6df76611 100644
--- a/tools/ci_build/github/azure-pipelines/py-cuda-package-test-pipeline.yml
+++ b/tools/ci_build/github/azure-pipelines/py-cuda-package-test-pipeline.yml
@@ -18,8 +18,8 @@ stages:
       machine_pool: 'Onnxruntime-Linux-GPU'
       python_wheel_suffix: '_gpu'
       timeout: 480
-      docker_base_image: onnxruntimebuildcache.azurecr.io/internal/azureml/onnxruntime/build/cuda12_x64_almalinux8_gcc12:20250724.1
-      cuda_version: '12.2'
+      docker_base_image: onnxruntimebuildcache.azurecr.io/internal/azureml/onnxruntime/build/cuda12_x64_almalinux8_gcc12:20251008.2
+      cuda_version: '12.8'
   - stage: Republish_Wheels
     dependsOn:
diff --git a/tools/ci_build/github/azure-pipelines/py-cuda-packaging-pipeline.yml b/tools/ci_build/github/azure-pipelines/py-cuda-packaging-pipeline.yml
index 4c536bad45368..290af4a3e4449 100644
--- a/tools/ci_build/github/azure-pipelines/py-cuda-packaging-pipeline.yml
+++ b/tools/ci_build/github/azure-pipelines/py-cuda-packaging-pipeline.yml
@@ -49,4 +49,4 @@ extends:
     - template: stages/py-gpu-packaging-stage.yml
       parameters:
         cmake_build_type: ${{ parameters.cmake_build_type }}
-        cuda_version: '12.2'
+        cuda_version: '12.8'
diff --git a/tools/ci_build/github/azure-pipelines/stages/jobs/py-linux-cuda-package-test-job.yml b/tools/ci_build/github/azure-pipelines/stages/jobs/py-linux-cuda-package-test-job.yml
index 858de4d173484..b53aee639372d 100644
--- a/tools/ci_build/github/azure-pipelines/stages/jobs/py-linux-cuda-package-test-job.yml
+++ b/tools/ci_build/github/azure-pipelines/stages/jobs/py-linux-cuda-package-test-job.yml
@@ -2,9 +2,9 @@ parameters:
 - name: CudaVersion
   displayName: 'CUDA version'
   type: string
-  default: '12.2'
+  default: '12.8'
   values:
-    - 12.2
+    - 12.8
 - name: machine_pool
   type: string
@@ -44,13 +44,13 @@ jobs:
     - template: ../../templates/common-variables.yml
     - name: docker_base_image
       ${{ if eq(parameters.CudaVersion, '11.8') }}:
-        value: onnxruntimebuildcache.azurecr.io/internal/azureml/onnxruntime/build/cuda11_x64_almalinux8_gcc11:20250724.1
-      ${{ if eq(parameters.CudaVersion, '12.2') }}:
-        value: onnxruntimebuildcache.azurecr.io/internal/azureml/onnxruntime/build/cuda12_x64_almalinux8_gcc12:20250724.1
+        value: onnxruntimebuildcache.azurecr.io/internal/azureml/onnxruntime/build/cuda11_x64_almalinux8_gcc11:20251008.2
+      ${{ if eq(parameters.CudaVersion, '12.8') }}:
+        value: onnxruntimebuildcache.azurecr.io/internal/azureml/onnxruntime/build/cuda12_x64_almalinux8_gcc12:20251008.2
     - name: linux_trt_version
       ${{ if eq(parameters.CudaVersion, '11.8') }}:
         value: ${{ variables.linux_trt_version_cuda11 }}
-      ${{ if eq(parameters.CudaVersion, '12.2') }}:
+      ${{ if eq(parameters.CudaVersion, '12.8') }}:
         value: ${{ variables.linux_trt_version_cuda12 }}
   pool: ${{ parameters.machine_pool }}
   steps:
@@ -105,4 +105,4 @@ jobs:
       inputs:
         targetType: filePath
         filePath: tools/ci_build/github/linux/run_python_dockertest.sh
-        arguments: -d GPU -c ${{parameters.cmake_build_type}} -i onnxruntimecuda${{ replace(parameters.CudaVersion, '.', '') }}xtrt86buildx86_64 -u 12.2
+        arguments: -d GPU -c ${{parameters.cmake_build_type}} -i onnxruntimecuda${{ replace(parameters.CudaVersion, '.', '') }}xtrt86buildx86_64 -u 12.8
diff --git a/tools/ci_build/github/azure-pipelines/stages/nodejs-linux-packaging-stage.yml b/tools/ci_build/github/azure-pipelines/stages/nodejs-linux-packaging-stage.yml
index bca95a4a2fd02..8cbb81ba89c12 100644
--- a/tools/ci_build/github/azure-pipelines/stages/nodejs-linux-packaging-stage.yml
+++ b/tools/ci_build/github/azure-pipelines/stages/nodejs-linux-packaging-stage.yml
@@ -1,7 +1,7 @@
 parameters:
 - name: CudaVersion
   type: string
-  default: '12.2'
+  default: '12.8'
 stages:
 - stage: Linux_Nodejs_Packaging_x64
@@ -20,14 +20,14 @@ stages:
     - name: CUDA_VERSION_MAJOR
       ${{ if eq(parameters.CudaVersion, '11.8') }}:
         value: '11'
-      ${{ if eq(parameters.CudaVersion, '12.2') }}:
+      ${{ if eq(parameters.CudaVersion, '12.8') }}:
         value: '12'
     - name: CUDA_VERSION
       value: ${{ parameters.CudaVersion }}
     - name: linux_trt_version
       ${{ if eq(parameters.CudaVersion, '11.8') }}:
         value: ${{ variables.linux_trt_version_cuda11 }}
-      ${{ if eq(parameters.CudaVersion, '12.2') }}:
+      ${{ if eq(parameters.CudaVersion, '12.8') }}:
         value: ${{ variables.linux_trt_version_cuda12 }}
     steps:
     - checkout: self
diff --git a/tools/ci_build/github/azure-pipelines/stages/nuget-linux-cuda-packaging-stage.yml b/tools/ci_build/github/azure-pipelines/stages/nuget-linux-cuda-packaging-stage.yml
index 121e80fca1021..b1e5f541b90e0 100644
--- a/tools/ci_build/github/azure-pipelines/stages/nuget-linux-cuda-packaging-stage.yml
+++ b/tools/ci_build/github/azure-pipelines/stages/nuget-linux-cuda-packaging-stage.yml
@@ -1,7 +1,7 @@
 parameters:
 - name: CudaVersion
   type: string
-  default: '12.2'
+  default: '12.8'
 - name: buildJava
   type: boolean
 - name: buildNodejs
@@ -22,7 +22,7 @@ stages:
     - name: CUDA_VERSION_MAJOR
       ${{ if eq(parameters.CudaVersion, '11.8') }}:
         value: '11'
-      ${{ if eq(parameters.CudaVersion, '12.2') }}:
+      ${{ if eq(parameters.CudaVersion, '12.8') }}:
         value: '12'
     - name: CUDA_VERSION
       value: ${{ parameters.CudaVersion }}
@@ -74,14 +74,14 @@ stages:
     - name: CUDA_VERSION_MAJOR
       ${{ if eq(parameters.CudaVersion, '11.8') }}:
         value: '11'
-      ${{ if eq(parameters.CudaVersion, '12.2') }}:
+      ${{ if eq(parameters.CudaVersion, '12.8') }}:
         value: '12'
     - name: CUDA_VERSION
       value: ${{ parameters.CudaVersion }}
     - name: linux_trt_version
       ${{ if eq(parameters.CudaVersion, '11.8') }}:
         value: ${{ variables.linux_trt_version_cuda11 }}
-      ${{ if eq(parameters.CudaVersion, '12.2') }}:
+      ${{ if eq(parameters.CudaVersion, '12.8') }}:
         value: ${{ variables.linux_trt_version_cuda12 }}
     steps:
     - checkout: self
@@ -140,12 +140,12 @@ stages:
     - name: CUDA_VERSION_MAJOR
       ${{ if eq(parameters.CudaVersion, '11.8') }}:
         value: '11'
-      ${{ if eq(parameters.CudaVersion, '12.2') }}:
+      ${{ if eq(parameters.CudaVersion, '12.8') }}:
         value: '12'
     - name: linux_trt_version
       ${{ if eq(parameters.CudaVersion, '11.8') }}:
         value: ${{ variables.linux_trt_version_cuda11 }}
-      ${{ if eq(parameters.CudaVersion, '12.2') }}:
+      ${{ if eq(parameters.CudaVersion, '12.8') }}:
         value: ${{ variables.linux_trt_version_cuda12 }}
     steps:
     - checkout: self # due to checkout multiple repos, the root directory is $(Build.SourcesDirectory)/onnxruntime
diff --git a/tools/ci_build/github/azure-pipelines/stages/py-gpu-packaging-stage.yml b/tools/ci_build/github/azure-pipelines/stages/py-gpu-packaging-stage.yml
index d3d4b8f5b64d5..3c5cf591039e0 100644
--- a/tools/ci_build/github/azure-pipelines/stages/py-gpu-packaging-stage.yml
+++ b/tools/ci_build/github/azure-pipelines/stages/py-gpu-packaging-stage.yml
@@ -19,9 +19,9 @@ parameters:
 - name: cuda_version
   type: string
   displayName: 'CUDA version. Windows Only.'
-  default: '12.2'
+  default: '12.8'
   values:
-    - 12.2
+    - 12.8
 - name: PythonVersions
   type: object
@@ -48,4 +48,4 @@ stages:
       extra_build_arg: ${{ parameters.build_py_parameters }}
       cmake_build_type: ${{ parameters.cmake_build_type }}
       cuda_version: ${{ parameters.cuda_version }}
-      docker_base_image: onnxruntimebuildcache.azurecr.io/internal/azureml/onnxruntime/build/cuda12_x64_almalinux8_gcc12:20250724.1
+      docker_base_image: onnxruntimebuildcache.azurecr.io/internal/azureml/onnxruntime/build/cuda12_x64_almalinux8_gcc12:20251008.2
diff --git a/tools/ci_build/github/azure-pipelines/stages/py-linux-gpu-stage.yml b/tools/ci_build/github/azure-pipelines/stages/py-linux-gpu-stage.yml
index 715470eb9f012..ab1fb919af413 100644
--- a/tools/ci_build/github/azure-pipelines/stages/py-linux-gpu-stage.yml
+++ b/tools/ci_build/github/azure-pipelines/stages/py-linux-gpu-stage.yml
@@ -22,9 +22,9 @@ parameters:
 - name: cuda_version
   type: string
-  default: '12.2'
+  default: '12.8'
   values:
-    - 12.2
+    - 12.8
 stages:
 - stage: Linux_py_GPU_Wheels_${{ parameters.arch }}
@@ -55,7 +55,7 @@ stages:
     - name: trt_version
       ${{ if eq(parameters.cuda_version, '11.8') }}:
         value: ${{ variables.linux_trt_version_cuda11 }}
-      ${{ if eq(parameters.cuda_version, '12.2') }}:
+      ${{ if eq(parameters.cuda_version, '12.8') }}:
         value: ${{ variables.linux_trt_version_cuda12 }}
     steps:
     - checkout: self
diff --git a/tools/ci_build/github/azure-pipelines/stages/py-win-gpu-stage.yml b/tools/ci_build/github/azure-pipelines/stages/py-win-gpu-stage.yml
index e2683c04f21f2..c3957fc8341de 100644
--- a/tools/ci_build/github/azure-pipelines/stages/py-win-gpu-stage.yml
+++ b/tools/ci_build/github/azure-pipelines/stages/py-win-gpu-stage.yml
@@ -20,9 +20,9 @@ parameters:
   default: ''
 - name: CudaVersion
   type: string
-  default: '12.2'
+  default: '12.8'
   values:
-    - 12.2
+    - 12.8
 - name: cmake_build_type
   type: string
@@ -47,7 +47,7 @@ stages:
     workspace:
       clean: all
     pool:
-      name: onnxruntime-Win-CPU-2022
+      name: onnxruntime-Win-CPU-VS2022-Latest
       os: windows
     templateContext:
       sdl:
@@ -76,7 +76,7 @@ stages:
     - name: win_trt_folder
       ${{ if eq(parameters.CudaVersion, '11.8') }}:
         value: ${{ variables.win_trt_folder_cuda11 }}
-      ${{ if eq(parameters.CudaVersion, '12.2') }}:
+      ${{ if eq(parameters.CudaVersion, '12.8') }}:
         value: ${{ variables.win_trt_folder_cuda12 }}
     - name: trt_build_flag
       ${{ if eq(parameters.use_tensorrt, true) }}:
@@ -119,7 +119,7 @@ stages:
           --cmake_generator "$(VSGenerator)"
           --enable_pybind
           --enable_onnx_tests
-          --parallel 8 --use_vcpkg --use_vcpkg_ms_internal_asset_cache --use_binskim_compliant_compile_flags --update --build --msvc_toolset 14.40
+          --parallel 8 --use_vcpkg --use_vcpkg_ms_internal_asset_cache --use_binskim_compliant_compile_flags --update --build
           $(TelemetryOption) ${{ parameters.BUILD_PY_PARAMETERS }} ${{ parameters.EP_BUILD_FLAGS }} ${{ variables.trt_build_flag }}
         workingDirectory: '$(Build.BinariesDirectory)'
diff --git a/tools/ci_build/github/azure-pipelines/templates/jobs/download_win_gpu_library.yml b/tools/ci_build/github/azure-pipelines/templates/jobs/download_win_gpu_library.yml
index 681138a5ab3d1..be213337091e8 100644
--- a/tools/ci_build/github/azure-pipelines/templates/jobs/download_win_gpu_library.yml
+++ b/tools/ci_build/github/azure-pipelines/templates/jobs/download_win_gpu_library.yml
@@ -7,10 +7,10 @@ parameters:
   default: false
 - name: CudaVersion
   type: string
-  default: '12.2'
+  default: '12.8'
   values:
     - 11.8
-    - 12.2
+    - 12.8
 - name: TrtVersion
   type: string
   default: '10.9.0.34'
@@ -46,11 +46,11 @@ steps:
     - powershell: |
         Write-Host "##vso[task.setvariable variable=trtCudaVersion;]11.8"
       displayName: Set trtCudaVersion
-  - ${{ if and(eq(parameters.CudaVersion, '12.2'), eq(parameters.TrtVersion, '8.6.1.6')) }}:
+  - ${{ if and(eq(parameters.CudaVersion, '12.8'), eq(parameters.TrtVersion, '8.6.1.6')) }}:
     - powershell: |
         Write-Host "##vso[task.setvariable variable=trtCudaVersion;]12.0"
       displayName: Set trtCudaVersion
-  - ${{ if and(eq(parameters.CudaVersion, '12.2'), eq(parameters.TrtVersion, '10.9.0.34')) }}:
+  - ${{ if and(eq(parameters.CudaVersion, '12.8'), eq(parameters.TrtVersion, '10.9.0.34')) }}:
    - powershell: |
        Write-Host "##vso[task.setvariable variable=trtCudaVersion;]12.8"
      displayName: Set trtCudaVersion
diff --git a/tools/ci_build/github/azure-pipelines/templates/jobs/set-winenv.yml b/tools/ci_build/github/azure-pipelines/templates/jobs/set-winenv.yml
index 96436883fb8b8..d7c940cda30f4 100644
--- a/tools/ci_build/github/azure-pipelines/templates/jobs/set-winenv.yml
+++ b/tools/ci_build/github/azure-pipelines/templates/jobs/set-winenv.yml
@@ -9,7 +9,7 @@ parameters:
   default: false
 - name: PrimaryCUDAVersion
   type: string
-  default: '12.2'
+  default: '12.8'
 # - name: SecondaryCUDAVersion
 #   type: string
 #   default: '11.8'
diff --git a/tools/ci_build/github/azure-pipelines/templates/py-packaging-linux-test-cuda.yml b/tools/ci_build/github/azure-pipelines/templates/py-packaging-linux-test-cuda.yml
index 1415586521f30..263f73a9e29b0 100644
--- a/tools/ci_build/github/azure-pipelines/templates/py-packaging-linux-test-cuda.yml
+++ b/tools/ci_build/github/azure-pipelines/templates/py-packaging-linux-test-cuda.yml
@@ -18,9 +18,9 @@ parameters:
 - name: cuda_version
   type: string
-  default: '12.2'
+  default: '12.8'
   values:
-    - 12.2
+    - 12.8
 # TODO: Ideally it should fetch information from the build that triggers it
 - name: cmake_build_type
@@ -46,7 +46,7 @@ jobs:
     - name: trt_version
       ${{ if eq(parameters.cuda_version, '11.8') }}:
         value: ${{ variables.linux_trt_version_cuda11 }}
-      ${{ if eq(parameters.cuda_version, '12.2') }}:
+      ${{ if eq(parameters.cuda_version, '12.8') }}:
         value: ${{ variables.linux_trt_version_cuda12 }}
   workspace:
     clean: all
diff --git a/tools/ci_build/github/azure-pipelines/templates/win-ci.yml b/tools/ci_build/github/azure-pipelines/templates/win-ci.yml
index 0310735d94b2e..ca698123a04e7 100644
--- a/tools/ci_build/github/azure-pipelines/templates/win-ci.yml
+++ b/tools/ci_build/github/azure-pipelines/templates/win-ci.yml
@@ -78,7 +78,7 @@ parameters:
   default: '11.8'
   values:
     - 11.8
-    - 12.2
+    - 12.8
 - name: SpecificArtifact
   displayName: Use Specific Artifact
@@ -136,7 +136,7 @@ stages:
     ${{ if contains(parameters.ort_build_pool_name, 'GPU') }}:
       pool:
-        name: onnxruntime-Win-CPU-2022
+        name: onnxruntime-Win-CPU-VS2022-Latest
         os: windows
     ${{ else }}:
       pool:
diff --git a/tools/ci_build/github/azure-pipelines/win-gpu-doc-gen-ci-pipeline.yml b/tools/ci_build/github/azure-pipelines/win-gpu-doc-gen-ci-pipeline.yml
index c20f4a2c1bd19..8b320b0ceb4ac 100644
--- a/tools/ci_build/github/azure-pipelines/win-gpu-doc-gen-ci-pipeline.yml
+++ b/tools/ci_build/github/azure-pipelines/win-gpu-doc-gen-ci-pipeline.yml
@@ -32,10 +32,10 @@ parameters:
 - name: CudaVersion
   displayName: CUDA version
   type: string
-  default: '12.2'
+  default: '12.8'
   values:
     - 11.8
-    - 12.2
+    - 12.8
 stages:
 - stage: kernelDocumentation
diff --git a/tools/ci_build/github/azure-pipelines/win-gpu-tensorrt-cuda-minimal-ci-pipeline.yml b/tools/ci_build/github/azure-pipelines/win-gpu-tensorrt-cuda-minimal-ci-pipeline.yml
index c12bb3552920c..08953749f6527 100644
--- a/tools/ci_build/github/azure-pipelines/win-gpu-tensorrt-cuda-minimal-ci-pipeline.yml
+++ b/tools/ci_build/github/azure-pipelines/win-gpu-tensorrt-cuda-minimal-ci-pipeline.yml
@@ -31,16 +31,16 @@ parameters:
 - name: CudaVersion
   displayName: CUDA version
   type: string
-  default: '12.2'
+  default: '12.8'
   values:
-    - 12.2
+    - 12.8
 variables:
   - template: templates/common-variables.yml
   - name: win_trt_folder
     ${{ if eq(parameters.CudaVersion, '11.8') }}:
       value: ${{ variables.win_trt_folder_cuda11 }}
-    ${{ if eq(parameters.CudaVersion, '12.2') }}:
+    ${{ if eq(parameters.CudaVersion, '12.8') }}:
       value: ${{ variables.win_trt_folder_cuda12 }}
 jobs:
diff --git a/tools/ci_build/github/linux/docker/Dockerfile.manylinux2_28_cpu b/tools/ci_build/github/linux/docker/Dockerfile.manylinux2_28_cpu
index 2a65e7c26b20b..a277286866e41 100644
--- a/tools/ci_build/github/linux/docker/Dockerfile.manylinux2_28_cpu
+++ b/tools/ci_build/github/linux/docker/Dockerfile.manylinux2_28_cpu
@@ -1,4 +1,4 @@
-ARG BASEIMAGE=onnxruntimebuildcache.azurecr.io/internal/azureml/onnxruntime/build/cpu_x64_almalinux8_gcc14:20250724.1
+ARG BASEIMAGE=onnxruntimebuildcache.azurecr.io/internal/azureml/onnxruntime/build/cpu_x64_almalinux8_gcc14:20251008.2
 FROM $BASEIMAGE
 ENV JAVA_HOME=/usr/lib/jvm/msopenjdk-17
diff --git a/tools/ci_build/github/linux/docker/Dockerfile.manylinux2_28_rocm b/tools/ci_build/github/linux/docker/Dockerfile.manylinux2_28_rocm
index 3337af3be6074..5410bd64036ce 100644
--- a/tools/ci_build/github/linux/docker/Dockerfile.manylinux2_28_rocm
+++ b/tools/ci_build/github/linux/docker/Dockerfile.manylinux2_28_rocm
@@ -1,4 +1,4 @@
-ARG BASEIMAGE=onnxruntimebuildcache.azurecr.io/internal/azureml/onnxruntime/build/cpu_x64_almalinux8_gcc14:20250724.1
+ARG BASEIMAGE=onnxruntimebuildcache.azurecr.io/internal/azureml/onnxruntime/build/cpu_x64_almalinux8_gcc14:20251008.2
 FROM $BASEIMAGE
 ARG ROCM_VERSION=6.2.3
diff --git a/tools/ci_build/github/linux/docker/Dockerfile.manylinux2_28_webgpu b/tools/ci_build/github/linux/docker/Dockerfile.manylinux2_28_webgpu
index 0007a4e06f7c0..07ad8e933baf0 100644
--- a/tools/ci_build/github/linux/docker/Dockerfile.manylinux2_28_webgpu
+++ b/tools/ci_build/github/linux/docker/Dockerfile.manylinux2_28_webgpu
@@ -1,4 +1,4 @@
-ARG BASEIMAGE=onnxruntimebuildcache.azurecr.io/internal/azureml/onnxruntime/build/cpu_x64_almalinux8_gcc14:20250724.1
+ARG BASEIMAGE=onnxruntimebuildcache.azurecr.io/internal/azureml/onnxruntime/build/cpu_x64_almalinux8_gcc14:20251008.2
 FROM $BASEIMAGE
 ENV JAVA_HOME=/usr/lib/jvm/msopenjdk-17
diff --git a/tools/ci_build/github/linux/docker/Dockerfile.package_ubuntu_2004_gpu b/tools/ci_build/github/linux/docker/Dockerfile.package_ubuntu_2004_gpu
index 8a84b9b940306..5d98c25b535af 100644
--- a/tools/ci_build/github/linux/docker/Dockerfile.package_ubuntu_2004_gpu
+++ b/tools/ci_build/github/linux/docker/Dockerfile.package_ubuntu_2004_gpu
@@ -5,7 +5,7 @@
 # Dockerfile to run ONNXRuntime with TensorRT integration
 # Build base image with required system packages
-ARG BASEIMAGE=nvidia/cuda:12.2.2-cudnn8-devel-ubuntu20.04
+ARG BASEIMAGE=nvidia/cuda:12.8.1-cudnn-devel-ubuntu20.04
 ARG TRT_VERSION=10.9.0.34-1+cuda12.8
 ARG LD_LIBRARY_PATH_ARG=/usr/local/lib64:/usr/local/cuda/lib64
 FROM $BASEIMAGE AS base
diff --git a/tools/ci_build/github/linux/docker/inference/aarch64/default/cpu/Dockerfile b/tools/ci_build/github/linux/docker/inference/aarch64/default/cpu/Dockerfile
index 8b2083c2ccfc1..cef2d11780969 100644
--- a/tools/ci_build/github/linux/docker/inference/aarch64/default/cpu/Dockerfile
+++ b/tools/ci_build/github/linux/docker/inference/aarch64/default/cpu/Dockerfile
@@ -2,7 +2,7 @@
 # Licensed under the MIT License.
 # This file is used by Zip-Nuget Packaging NoContribOps Pipeline,Zip-Nuget-Java Packaging Pipeline
-ARG BASEIMAGE=onnxruntimebuildcache.azurecr.io/internal/azureml/onnxruntime/build/cpu_aarch64_almalinux8_gcc14_dotnet:20250724.1
+ARG BASEIMAGE=onnxruntimebuildcache.azurecr.io/internal/azureml/onnxruntime/build/cpu_aarch64_almalinux8_gcc14_dotnet:20251008.2
 FROM $BASEIMAGE
 ENV LANG=en_US.UTF-8
diff --git a/tools/ci_build/github/linux/docker/inference/aarch64/python/cpu/Dockerfile b/tools/ci_build/github/linux/docker/inference/aarch64/python/cpu/Dockerfile
index f5143d5ac9ab9..79d99d08dcc4e 100644
--- a/tools/ci_build/github/linux/docker/inference/aarch64/python/cpu/Dockerfile
+++ b/tools/ci_build/github/linux/docker/inference/aarch64/python/cpu/Dockerfile
@@ -1,4 +1,4 @@
-ARG BASEIMAGE=onnxruntimebuildcache.azurecr.io/internal/azureml/onnxruntime/build/cpu_aarch64_almalinux8_gcc14:20250724.1
+ARG BASEIMAGE=onnxruntimebuildcache.azurecr.io/internal/azureml/onnxruntime/build/cpu_aarch64_almalinux8_gcc14:20251008.2
 FROM $BASEIMAGE
 ADD scripts /tmp/scripts
diff --git a/tools/ci_build/github/linux/docker/inference/aarch64/python/cpu/scripts/requirements.txt b/tools/ci_build/github/linux/docker/inference/aarch64/python/cpu/scripts/requirements.txt
index bae6f4cb51816..1b1dadeaf8db2 100644
--- a/tools/ci_build/github/linux/docker/inference/aarch64/python/cpu/scripts/requirements.txt
+++ b/tools/ci_build/github/linux/docker/inference/aarch64/python/cpu/scripts/requirements.txt
@@ -3,7 +3,7 @@ mypy
 pytest
 setuptools>=68.2.2
 wheel
-onnx==1.19.0
+onnx==1.19.1
 protobuf==4.25.8
 sympy==1.14
 flatbuffers
diff --git a/tools/ci_build/github/linux/docker/inference/x86_64/default/cpu/Dockerfile b/tools/ci_build/github/linux/docker/inference/x86_64/default/cpu/Dockerfile
index cfc2ce7079148..72d98206f9205 100644
--- a/tools/ci_build/github/linux/docker/inference/x86_64/default/cpu/Dockerfile
+++ b/tools/ci_build/github/linux/docker/inference/x86_64/default/cpu/Dockerfile
@@ -2,7 +2,7 @@
 # Licensed under the MIT License.
 # This file is used by Zip-Nuget Packaging NoContribOps Pipeline,Zip-Nuget-Java Packaging Pipeline
-ARG BASEIMAGE=onnxruntimebuildcache.azurecr.io/internal/azureml/onnxruntime/build/cpu_x64_almalinux8_gcc14_dotnet:20250724.1
+ARG BASEIMAGE=onnxruntimebuildcache.azurecr.io/internal/azureml/onnxruntime/build/cpu_x64_almalinux8_gcc14_dotnet:20251008.2
 FROM $BASEIMAGE
 ENV LANG=en_US.UTF-8
diff --git a/tools/ci_build/github/linux/docker/inference/x86_64/default/cuda12/Dockerfile b/tools/ci_build/github/linux/docker/inference/x86_64/default/cuda12/Dockerfile
index 8401393a661b1..85f4a074e30bf 100644
--- a/tools/ci_build/github/linux/docker/inference/x86_64/default/cuda12/Dockerfile
+++ b/tools/ci_build/github/linux/docker/inference/x86_64/default/cuda12/Dockerfile
@@ -2,7 +2,7 @@
 # Copyright (c) Microsoft Corporation.
 # Licensed under the MIT License.
 # This file is used by Zip-Nuget Packaging NoContribOps Pipeline,Zip-Nuget-Java Packaging Pipeline
-ARG BASEIMAGE=onnxruntimebuildcache.azurecr.io/internal/azureml/onnxruntime/build/cuda12_x64_almalinux8_gcc12_dotnet:20250724.1
+ARG BASEIMAGE=onnxruntimebuildcache.azurecr.io/internal/azureml/onnxruntime/build/cuda12_x64_almalinux8_gcc12_dotnet:20251008.2
 FROM $BASEIMAGE
 ARG TRT_VERSION
diff --git a/tools/ci_build/github/linux/docker/inference/x86_64/python/cpu/Dockerfile b/tools/ci_build/github/linux/docker/inference/x86_64/python/cpu/Dockerfile
index b923febc1227f..81ba47f397f91 100644
--- a/tools/ci_build/github/linux/docker/inference/x86_64/python/cpu/Dockerfile
+++ b/tools/ci_build/github/linux/docker/inference/x86_64/python/cpu/Dockerfile
@@ -1,4 +1,4 @@
-ARG BASEIMAGE=onnxruntimebuildcache.azurecr.io/internal/azureml/onnxruntime/build/cpu_x64_almalinux8_gcc14:20250724.1
+ARG BASEIMAGE=onnxruntimebuildcache.azurecr.io/internal/azureml/onnxruntime/build/cpu_x64_almalinux8_gcc14:20251008.2
 FROM $BASEIMAGE
 ADD scripts /tmp/scripts
diff --git a/tools/ci_build/github/linux/docker/inference/x86_64/python/openvino/Dockerfile b/tools/ci_build/github/linux/docker/inference/x86_64/python/openvino/Dockerfile
index f3341f32a768d..5ad1023bfb5b2 100644
--- a/tools/ci_build/github/linux/docker/inference/x86_64/python/openvino/Dockerfile
+++ b/tools/ci_build/github/linux/docker/inference/x86_64/python/openvino/Dockerfile
@@ -1,5 +1,5 @@
 # Use the specified UBI8 base image with GCC 14
-ARG BASEIMAGE="onnxruntimebuildcache.azurecr.io/internal/azureml/onnxruntime/build/cpu_x64_almalinux8_gcc14:20250724.1"
+ARG BASEIMAGE="onnxruntimebuildcache.azurecr.io/internal/azureml/onnxruntime/build/cpu_x64_almalinux8_gcc14:20251008.2"
 FROM ${BASEIMAGE}
 ARG BUILD_UID=1000
diff --git a/tools/ci_build/github/linux/docker/scripts/lort/requirements.txt b/tools/ci_build/github/linux/docker/scripts/lort/requirements.txt
index 2871f5cab2ea2..dc394ff50f4f9 100644
--- a/tools/ci_build/github/linux/docker/scripts/lort/requirements.txt
+++ b/tools/ci_build/github/linux/docker/scripts/lort/requirements.txt
@@ -3,7 +3,7 @@ beartype==0.15.0
 flatbuffers
 cerberus
 h5py
-onnx==1.19.0
+onnx==1.19.1
 # Python dependencies required for pytorch development
 astunparse
 expecttest!=0.2.0
diff --git a/tools/ci_build/github/linux/docker/scripts/manylinux/requirements.txt b/tools/ci_build/github/linux/docker/scripts/manylinux/requirements.txt
index 381d42831e715..2d89aece56340 100644
--- a/tools/ci_build/github/linux/docker/scripts/manylinux/requirements.txt
+++ b/tools/ci_build/github/linux/docker/scripts/manylinux/requirements.txt
@@ -3,7 +3,7 @@ mypy
 pytest
 setuptools>=68.2.2
 wheel
-onnx==1.19.0
+onnx==1.19.1
 protobuf==4.25.1
 sympy==1.14
 flatbuffers
diff --git a/tools/ci_build/github/linux/docker/scripts/requirements.txt b/tools/ci_build/github/linux/docker/scripts/requirements.txt
index 4cc94f9148656..c19c0170291e6 100644
--- a/tools/ci_build/github/linux/docker/scripts/requirements.txt
+++ b/tools/ci_build/github/linux/docker/scripts/requirements.txt
@@ -4,11 +4,11 @@ mypy
 pytest
 setuptools==78.1.1
 wheel==0.45.1
-onnx==1.19.0
+onnx==1.19.1
 argparse
 sympy==1.14
 flatbuffers
 protobuf==4.25.1
 packaging
-onnxscript==0.3.2
-onnx-ir
+onnxscript==0.5.3
+onnx-ir==0.1.10
diff --git a/tools/ci_build/github/linux/python/requirements.txt b/tools/ci_build/github/linux/python/requirements.txt
index d48fb66194f2a..3ddce9cc0ec31 100644
--- a/tools/ci_build/github/linux/python/requirements.txt
+++ b/tools/ci_build/github/linux/python/requirements.txt
@@ -3,12 +3,12 @@ mypy
 pytest
 setuptools>=68.2.2
 wheel
-onnx==1.19.0
+onnx==1.19.1
 protobuf==4.25.1
 sympy==1.14
 flatbuffers
 psutil
-onnxscript==0.3.2
-onnx-ir
+onnxscript==0.5.3
+onnx-ir==0.1.10
 jinja2
 markupsafe
diff --git a/tools/ci_build/github/windows/python/requirements.txt b/tools/ci_build/github/windows/python/requirements.txt
index 6ab2ab2b7b61f..bb307a20d7f18 100644
--- a/tools/ci_build/github/windows/python/requirements.txt
+++ b/tools/ci_build/github/windows/python/requirements.txt
@@ -3,13 +3,13 @@ mypy
 pytest
 setuptools>=68.2.2
 wheel
-onnx==1.19.0
+onnx==1.19.1
 protobuf==4.25.1
 sympy==1.14
 flatbuffers
 psutil
-onnxscript==0.3.2
-onnx-ir
+onnxscript==0.5.3
+onnx-ir==0.1.10
 jinja2
 markupsafe
 semver
diff --git a/tools/ci_build/github/windows/setup_env_cuda.bat b/tools/ci_build/github/windows/setup_env_cuda.bat
index f93938e2a9009..f095f58f9920e 100644
--- a/tools/ci_build/github/windows/setup_env_cuda.bat
+++ b/tools/ci_build/github/windows/setup_env_cuda.bat
@@ -1,13 +1,13 @@
 REM Copyright (c) Microsoft Corporation. All rights reserved.
 REM Licensed under the MIT License.
-if exist PATH=%AGENT_TEMPDIRECTORY%\v12.2\ (
-set PATH=%AGENT_TEMPDIRECTORY%\v12.2\bin;%AGENT_TEMPDIRECTORY%\v12.2\extras\CUPTI\lib64;%PATH%
+if exist PATH=%AGENT_TEMPDIRECTORY%\v12.8\ (
+  set PATH=%AGENT_TEMPDIRECTORY%\v12.8\bin;%AGENT_TEMPDIRECTORY%\v12.8\extras\CUPTI\lib64;%PATH%
 ) else (
-  set PATH=C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v12.2\bin;C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v12.2\extras\CUPTI\lib64;%PATH%
+  set PATH=C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v12.8\bin;C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v12.8\extras\CUPTI\lib64;%PATH%
 )
-@REM The default version is still cuda v12.2, because set cuda v11.8 after it
+@REM The default version is still cuda v12.8, because cuda v11.8 is appended to PATH after it
 if exist PATH=%AGENT_TEMPDIRECTORY%\v11.8\ (
 set PATH=%PATH%;%AGENT_TEMPDIRECTORY%\v11.8\bin;%AGENT_TEMPDIRECTORY%\v11.8\extras\CUPTI\lib64
 ) else (
diff --git a/tools/ci_build/github/windows/setup_env_gpu.bat b/tools/ci_build/github/windows/setup_env_gpu.bat
index ecadab5d3f8a3..115a19b6f3a01 100644
--- a/tools/ci_build/github/windows/setup_env_gpu.bat
+++ b/tools/ci_build/github/windows/setup_env_gpu.bat
@@ -1,14 +1,14 @@
 REM Copyright (c) Microsoft Corporation. All rights reserved.
 REM Licensed under the MIT License.
-if exist PATH=%AGENT_TEMPDIRECTORY%\v12.2\ (
-  set PATH=%AGENT_TEMPDIRECTORY%\v12.2\bin;%AGENT_TEMPDIRECTORY%\v12.2\extras\CUPTI\lib64;%PATH%
+if exist PATH=%AGENT_TEMPDIRECTORY%\v12.8\ (
+  set PATH=%AGENT_TEMPDIRECTORY%\v12.8\bin;%AGENT_TEMPDIRECTORY%\v12.8\extras\CUPTI\lib64;%PATH%
 ) else (
-  set PATH=C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v12.2\bin;C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v12.2\extras\CUPTI\lib64;%PATH%
+  set PATH=C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v12.8\bin;C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v12.8\extras\CUPTI\lib64;%PATH%
 )
 set PATH=%AGENT_TEMPDIRECTORY%\TensorRT-10.9.0.34.Windows10.x86_64.cuda-12.8\lib;%PATH%
-@REM The default version is still cuda v12.2, because set cuda v11.8 after it
+@REM The default version is still cuda v12.8, because cuda v11.8 is appended to PATH after it
 set PATH=%PATH%;%AGENT_TEMPDIRECTORY%\TensorRT-10.9.0.34.Windows10.x86_64.cuda-11.8\lib
 if exist PATH=%AGENT_TEMPDIRECTORY%\v11.8\ (
 set PATH=%PATH%;%AGENT_TEMPDIRECTORY%\v11.8\bin;%AGENT_TEMPDIRECTORY%\v11.8\extras\CUPTI\lib64
diff --git a/tools/ci_build/github/windows/setup_env_trt.bat b/tools/ci_build/github/windows/setup_env_trt.bat
index 45e0d970fb541..6110249a9cde6 100644
--- a/tools/ci_build/github/windows/setup_env_trt.bat
+++ b/tools/ci_build/github/windows/setup_env_trt.bat
@@ -1,10 +1,10 @@
 REM Copyright (c) Microsoft Corporation. All rights reserved.
 REM Licensed under the MIT License.
-if exist PATH=%AGENT_TEMPDIRECTORY%\v12.2\ (
-  set PATH=%PATH%;%AGENT_TEMPDIRECTORY%\v12.2\bin;%AGENT_TEMPDIRECTORY%\v12.2\extras\CUPTI\lib64
+if exist PATH=%AGENT_TEMPDIRECTORY%\v12.8\ (
+  set PATH=%PATH%;%AGENT_TEMPDIRECTORY%\v12.8\bin;%AGENT_TEMPDIRECTORY%\v12.8\extras\CUPTI\lib64
 ) else (
-  set PATH=%PATH%;C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v12.2\bin;C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v12.2\extras\CUPTI\lib64
+  set PATH=%PATH%;C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v12.8\bin;C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v12.8\extras\CUPTI\lib64
 )
 set PATH=%AGENT_TEMPDIRECTORY%\TensorRT-10.9.0.34.Windows10.x86_64.cuda-12.8\lib;%PATH%
 set GRADLE_OPTS=-Dorg.gradle.daemon=false
diff --git a/tools/ci_build/requirements/transformers-test/requirements.txt b/tools/ci_build/requirements/transformers-test/requirements.txt
index bcd5a434c58e8..21894c2ba003d 100644
--- a/tools/ci_build/requirements/transformers-test/requirements.txt
+++ b/tools/ci_build/requirements/transformers-test/requirements.txt
@@ -3,12 +3,13 @@ packaging
 # protobuf and numpy is same as tools/ci_build/github/linux/docker/scripts/manylinux/requirements.txt
 protobuf==4.25.1
 numpy==2.2.6
-torch>=2.6.0
+torch==2.8.0
+torchvision==0.23.0
 coloredlogs==15.0
 transformers==4.52.1
 parameterized>=0.8.1
 sentencepiece
 psutil
 einops
-onnxscript==0.3.2
-onnx-ir
+onnxscript==0.5.3
+onnx-ir==0.1.10