diff --git a/.github/workflows/publish-csharp-apidocs.yml b/.github/workflows/publish-csharp-apidocs.yml index 42d1bdc295785..683c5594e82f2 100644 --- a/.github/workflows/publish-csharp-apidocs.yml +++ b/.github/workflows/publish-csharp-apidocs.yml @@ -20,7 +20,7 @@ permissions: jobs: build: - runs-on: ["self-hosted", "1ES.Pool=onnxruntime-github-vs2022-mms"] + runs-on: ["self-hosted", "1ES.Pool=onnxruntime-github-vs2022-latest"] env: DOCFXVERSION: 2.62.2 steps: diff --git a/.github/workflows/windows_cuda.yml b/.github/workflows/windows_cuda.yml index 437fc0e2c6334..3d24d4b6b75b6 100644 --- a/.github/workflows/windows_cuda.yml +++ b/.github/workflows/windows_cuda.yml @@ -19,7 +19,7 @@ concurrency: jobs: build: name: Windows GPU CUDA CI Pipeline - runs-on: ["self-hosted", "1ES.Pool=onnxruntime-github-vs2022-mms"] + runs-on: ["self-hosted", "1ES.Pool=onnxruntime-github-vs2022-latest"] steps: - uses: actions/checkout@v5 with: @@ -41,10 +41,10 @@ jobs: working-directory: ${{ github.workspace }} shell: cmd - - name: Download CUDA SDK v12.2 + - name: Download CUDA SDK v12.8 working-directory: ${{ runner.temp }} run: | - azcopy.exe cp --recursive "https://lotusscus.blob.core.windows.net/models/cuda_sdk/v12.2" . + azcopy.exe cp --recursive "https://lotusscus.blob.core.windows.net/models/cuda_sdk/v12.8" . dir shell: pwsh @@ -52,9 +52,9 @@ jobs: shell: powershell run: | Write-Host "Adding CUDA to PATH" - Write-Host "CUDA Path: $env:RUNNER_TEMP\v12.2\bin" - Add-Content -Path $env:GITHUB_PATH -Value "$env:RUNNER_TEMP\v12.2\bin" - Add-Content -Path $env:GITHUB_PATH -Value "$env:RUNNER_TEMP\v12.2\extras\CUPTI\lib64" + Write-Host "CUDA Path: $env:RUNNER_TEMP\v12.8\bin" + Add-Content -Path $env:GITHUB_PATH -Value "$env:RUNNER_TEMP\v12.8\bin" + Add-Content -Path $env:GITHUB_PATH -Value "$env:RUNNER_TEMP\v12.8\extras\CUPTI\lib64" - uses: actions/setup-node@v5 with: @@ -111,7 +111,7 @@ jobs: exit $lastExitCode } # Execute the build process - python.exe ${{ github.workspace }}\tools\ci_build\build.py --update --build --config RelWithDebInfo --build_dir build --skip_submodule_sync --build_csharp --parallel --use_binskim_compliant_compile_flags --cmake_generator "Visual Studio 17 2022" --build_shared_lib --build_wheel --build_java --use_cuda --cuda_home="$env:RUNNER_TEMP\v12.2" --enable_cuda_profiling --use_vcpkg --use_vcpkg_ms_internal_asset_cache --enable_transformers_tool_test --cmake_extra_defines CMAKE_CUDA_ARCHITECTURES=86 --cmake_extra_defines onnxruntime_ENABLE_CUDA_EP_INTERNAL_TESTS=ON + python.exe ${{ github.workspace }}\tools\ci_build\build.py --update --build --config RelWithDebInfo --build_dir build --skip_submodule_sync --build_csharp --parallel --use_binskim_compliant_compile_flags --cmake_generator "Visual Studio 17 2022" --build_shared_lib --build_wheel --build_java --use_cuda --cuda_home="$env:RUNNER_TEMP\v12.8" --enable_cuda_profiling --use_vcpkg --use_vcpkg_ms_internal_asset_cache --enable_transformers_tool_test --cmake_extra_defines CMAKE_CUDA_ARCHITECTURES=86 --cmake_extra_defines onnxruntime_ENABLE_CUDA_EP_INTERNAL_TESTS=ON if ($lastExitCode -ne 0) { exit $lastExitCode } @@ -188,10 +188,10 @@ jobs: working-directory: ${{ github.workspace }} shell: cmd - - name: Download CUDA SDK v12.2 + - name: Download CUDA SDK v12.8 working-directory: ${{ runner.temp }} run: | - azcopy.exe cp --recursive "https://lotusscus.blob.core.windows.net/models/cuda_sdk/v12.2" . + azcopy.exe cp --recursive "https://lotusscus.blob.core.windows.net/models/cuda_sdk/v12.8" . 
dir shell: pwsh @@ -199,9 +199,9 @@ jobs: shell: powershell run: | Write-Host "Adding CUDA to PATH" - Write-Host "CUDA Path: $env:RUNNER_TEMP\v12.2\bin" - Add-Content -Path $env:GITHUB_PATH -Value "$env:RUNNER_TEMP\v12.2\bin" - Add-Content -Path $env:GITHUB_PATH -Value "$env:RUNNER_TEMP\v12.2\extras\CUPTI\lib64" + Write-Host "CUDA Path: $env:RUNNER_TEMP\v12.8\bin" + Add-Content -Path $env:GITHUB_PATH -Value "$env:RUNNER_TEMP\v12.8\bin" + Add-Content -Path $env:GITHUB_PATH -Value "$env:RUNNER_TEMP\v12.8\extras\CUPTI\lib64" - name: Set OnnxRuntimeBuildDirectory shell: pwsh @@ -227,7 +227,7 @@ jobs: exit $lastExitCode } - python.exe ${{ github.workspace }}\tools\ci_build\build.py --test --config RelWithDebInfo --build_dir build --skip_submodule_sync --build_csharp --parallel --use_binskim_compliant_compile_flags --cmake_generator "Visual Studio 17 2022" --build_shared_lib --build_wheel --build_java --use_cuda --cuda_home="$env:RUNNER_TEMP\v12.2" --enable_cuda_profiling --use_vcpkg --use_vcpkg_ms_internal_asset_cache --enable_transformers_tool_test --cmake_extra_defines CMAKE_CUDA_ARCHITECTURES=86 --cmake_extra_defines onnxruntime_ENABLE_CUDA_EP_INTERNAL_TESTS=ON + python.exe ${{ github.workspace }}\tools\ci_build\build.py --test --config RelWithDebInfo --build_dir build --skip_submodule_sync --build_csharp --parallel --use_binskim_compliant_compile_flags --cmake_generator "Visual Studio 17 2022" --build_shared_lib --build_wheel --build_java --use_cuda --cuda_home="$env:RUNNER_TEMP\v12.8" --enable_cuda_profiling --use_vcpkg --use_vcpkg_ms_internal_asset_cache --enable_transformers_tool_test --cmake_extra_defines CMAKE_CUDA_ARCHITECTURES=86 --cmake_extra_defines onnxruntime_ENABLE_CUDA_EP_INTERNAL_TESTS=ON if ($lastExitCode -ne 0) { exit $lastExitCode } diff --git a/.github/workflows/windows_openvino.yml b/.github/workflows/windows_openvino.yml index 395ccfbe70244..b608c0879aa45 100644 --- a/.github/workflows/windows_openvino.yml +++ b/.github/workflows/windows_openvino.yml @@ -18,7 +18,7 @@ concurrency: jobs: BUILD_OPENVINO_EP: name: Windows OpenVINO CI Pipeline - runs-on: ["self-hosted", "1ES.Pool=onnxruntime-github-vs2022-mms"] + runs-on: ["self-hosted", "1ES.Pool=onnxruntime-github-vs2022-latest"] timeout-minutes: 240 env: AZCOPY_AUTO_LOGIN_TYPE: MSI diff --git a/.github/workflows/windows_qnn_x64.yml b/.github/workflows/windows_qnn_x64.yml index 9788792b94fa8..1906fcb18c841 100644 --- a/.github/workflows/windows_qnn_x64.yml +++ b/.github/workflows/windows_qnn_x64.yml @@ -18,7 +18,7 @@ concurrency: jobs: build_test_qnn_ep: name: Windows x64 QNN CI Pipeline (${{ matrix.QnnLibKind }}) - runs-on: ["self-hosted", "1ES.Pool=onnxruntime-github-vs2022-mms"] + runs-on: ["self-hosted", "1ES.Pool=onnxruntime-github-vs2022-latest"] timeout-minutes: 120 strategy: matrix: diff --git a/.github/workflows/windows_tensorrt.yml b/.github/workflows/windows_tensorrt.yml index 5f3dcb9607a47..2a1fe97d9b7b7 100644 --- a/.github/workflows/windows_tensorrt.yml +++ b/.github/workflows/windows_tensorrt.yml @@ -19,7 +19,7 @@ concurrency: jobs: build: name: Windows GPU TensorRT CI Pipeline - runs-on: ["self-hosted", "1ES.Pool=onnxruntime-github-vs2022-mms"] + runs-on: ["self-hosted", "1ES.Pool=onnxruntime-github-vs2022-latest"] steps: - uses: actions/checkout@v5 with: @@ -41,10 +41,10 @@ jobs: working-directory: ${{ github.workspace }} shell: cmd - - name: Download CUDA SDK v12.2 + - name: Download CUDA SDK v12.8 working-directory: ${{ runner.temp }} run: | - azcopy.exe cp --recursive 
"https://lotusscus.blob.core.windows.net/models/cuda_sdk/v12.2" . + azcopy.exe cp --recursive "https://lotusscus.blob.core.windows.net/models/cuda_sdk/v12.8" . dir shell: pwsh @@ -56,9 +56,9 @@ jobs: shell: powershell run: | Write-Host "Adding CUDA to PATH" - Write-Host "CUDA Path: $env:RUNNER_TEMP\v12.2\bin" - Add-Content -Path $env:GITHUB_PATH -Value "$env:RUNNER_TEMP\v12.2\bin" - Add-Content -Path $env:GITHUB_PATH -Value "$env:RUNNER_TEMP\v12.2\extras\CUPTI\lib64" + Write-Host "CUDA Path: $env:RUNNER_TEMP\v12.8\bin" + Add-Content -Path $env:GITHUB_PATH -Value "$env:RUNNER_TEMP\v12.8\bin" + Add-Content -Path $env:GITHUB_PATH -Value "$env:RUNNER_TEMP\v12.8\extras\CUPTI\lib64" Add-Content -Path $env:GITHUB_PATH -Value "$env:RUNNER_TEMP\TensorRT-10.9.0.34.Windows10.x86_64.cuda-12.8\lib" - uses: actions/setup-node@v5 @@ -116,7 +116,7 @@ jobs: exit $lastExitCode } # Execute the build process - python ${{ github.workspace }}\tools\ci_build\build.py --config RelWithDebInfo --parallel --use_binskim_compliant_compile_flags --build_dir build --skip_submodule_sync --build_shared_lib --build --update --cmake_generator "Visual Studio 17 2022" --build_wheel --enable_onnx_tests --use_tensorrt --tensorrt_home="${{ runner.temp }}\TensorRT-10.9.0.34.Windows10.x86_64.cuda-12.8" --cuda_home="${{ runner.temp }}\v12.2" --use_vcpkg --use_vcpkg_ms_internal_asset_cache --cmake_extra_defines CMAKE_CUDA_ARCHITECTURES=86 + python ${{ github.workspace }}\tools\ci_build\build.py --config RelWithDebInfo --parallel --use_binskim_compliant_compile_flags --build_dir build --skip_submodule_sync --build_shared_lib --build --update --cmake_generator "Visual Studio 17 2022" --build_wheel --enable_onnx_tests --use_tensorrt --tensorrt_home="${{ runner.temp }}\TensorRT-10.9.0.34.Windows10.x86_64.cuda-12.8" --cuda_home="${{ runner.temp }}\v12.8" --use_vcpkg --use_vcpkg_ms_internal_asset_cache --cmake_extra_defines CMAKE_CUDA_ARCHITECTURES=86 if ($lastExitCode -ne 0) { exit $lastExitCode } @@ -193,10 +193,10 @@ jobs: working-directory: ${{ github.workspace }} shell: cmd - - name: Download CUDA SDK v12.2 + - name: Download CUDA SDK v12.8 working-directory: ${{ runner.temp }} run: | - azcopy.exe cp --recursive "https://lotusscus.blob.core.windows.net/models/cuda_sdk/v12.2" . + azcopy.exe cp --recursive "https://lotusscus.blob.core.windows.net/models/cuda_sdk/v12.8" . 
dir shell: pwsh @@ -208,9 +208,9 @@ jobs: shell: powershell run: | Write-Host "Adding CUDA to PATH" - Write-Host "CUDA Path: $env:RUNNER_TEMP\v12.2\bin" - Add-Content -Path $env:GITHUB_PATH -Value "$env:RUNNER_TEMP\v12.2\bin" - Add-Content -Path $env:GITHUB_PATH -Value "$env:RUNNER_TEMP\v12.2\extras\CUPTI\lib64" + Write-Host "CUDA Path: $env:RUNNER_TEMP\v12.8\bin" + Add-Content -Path $env:GITHUB_PATH -Value "$env:RUNNER_TEMP\v12.8\bin" + Add-Content -Path $env:GITHUB_PATH -Value "$env:RUNNER_TEMP\v12.8\extras\CUPTI\lib64" Add-Content -Path $env:GITHUB_PATH -Value "$env:RUNNER_TEMP\TensorRT-10.9.0.34.Windows10.x86_64.cuda-12.8\lib" - name: Set OnnxRuntimeBuildDirectory @@ -237,7 +237,7 @@ jobs: exit $lastExitCode } - python ${{ github.workspace }}\tools\ci_build\build.py --config RelWithDebInfo --parallel --use_binskim_compliant_compile_flags --build_dir build --skip_submodule_sync --build_shared_lib --test --cmake_generator "Visual Studio 17 2022" --build_wheel --enable_onnx_tests --use_tensorrt --tensorrt_home="${{ runner.temp }}\TensorRT-10.9.0.34.Windows10.x86_64.cuda-12.8" --cuda_home="${{ runner.temp }}\v12.2" --use_vcpkg --use_vcpkg_ms_internal_asset_cache --cmake_extra_defines CMAKE_CUDA_ARCHITECTURES=86 + python ${{ github.workspace }}\tools\ci_build\build.py --config RelWithDebInfo --parallel --use_binskim_compliant_compile_flags --build_dir build --skip_submodule_sync --build_shared_lib --test --cmake_generator "Visual Studio 17 2022" --build_wheel --enable_onnx_tests --use_tensorrt --tensorrt_home="${{ runner.temp }}\TensorRT-10.9.0.34.Windows10.x86_64.cuda-12.8" --cuda_home="${{ runner.temp }}\v12.8" --use_vcpkg --use_vcpkg_ms_internal_asset_cache --cmake_extra_defines CMAKE_CUDA_ARCHITECTURES=86 if ($lastExitCode -ne 0) { exit $lastExitCode } diff --git a/.github/workflows/windows_x64_debug_build_x64_debug.yml b/.github/workflows/windows_x64_debug_build_x64_debug.yml index 6165375e7a54a..6a1b43e54ed89 100644 --- a/.github/workflows/windows_x64_debug_build_x64_debug.yml +++ b/.github/workflows/windows_x64_debug_build_x64_debug.yml @@ -13,7 +13,7 @@ concurrency: jobs: build_x64_debug: - runs-on: ["self-hosted", "1ES.Pool=onnxruntime-github-vs2022-mms"] + runs-on: ["self-hosted", "1ES.Pool=onnxruntime-github-vs2022-latest"] timeout-minutes: 300 steps: diff --git a/.github/workflows/windows_x64_release_build_x64_release.yml b/.github/workflows/windows_x64_release_build_x64_release.yml index f9d7b0d9e9e04..0bcd282e8dc50 100644 --- a/.github/workflows/windows_x64_release_build_x64_release.yml +++ b/.github/workflows/windows_x64_release_build_x64_release.yml @@ -13,7 +13,7 @@ concurrency: jobs: build_x64_release: - runs-on: ["self-hosted", "1ES.Pool=onnxruntime-github-vs2022-mms"] + runs-on: ["self-hosted", "1ES.Pool=onnxruntime-github-vs2022-latest"] timeout-minutes: 300 steps: diff --git a/.github/workflows/windows_x64_release_ep_generic_interface_build_x64_release_ep_generic_interface.yml b/.github/workflows/windows_x64_release_ep_generic_interface_build_x64_release_ep_generic_interface.yml index 54c13e1e04b0a..3934047266f59 100644 --- a/.github/workflows/windows_x64_release_ep_generic_interface_build_x64_release_ep_generic_interface.yml +++ b/.github/workflows/windows_x64_release_ep_generic_interface_build_x64_release_ep_generic_interface.yml @@ -13,7 +13,7 @@ concurrency: jobs: build_x64_release_ep_generic_interface: - runs-on: ["self-hosted", "1ES.Pool=onnxruntime-github-vs2022-mms"] + runs-on: ["self-hosted", "1ES.Pool=onnxruntime-github-vs2022-latest"] timeout-minutes: 300 
steps: diff --git a/.github/workflows/windows_x64_release_vitisai_build_x64_release.yml b/.github/workflows/windows_x64_release_vitisai_build_x64_release.yml index 06230962b39be..1c38d8e58970c 100644 --- a/.github/workflows/windows_x64_release_vitisai_build_x64_release.yml +++ b/.github/workflows/windows_x64_release_vitisai_build_x64_release.yml @@ -13,7 +13,7 @@ concurrency: jobs: build_x64_release_vitisai: - runs-on: ["self-hosted", "1ES.Pool=onnxruntime-github-vs2022-mms"] + runs-on: ["self-hosted", "1ES.Pool=onnxruntime-github-vs2022-latest"] timeout-minutes: 300 steps: diff --git a/.github/workflows/windows_x64_release_xnnpack.yml b/.github/workflows/windows_x64_release_xnnpack.yml index 21033ef4cbe3c..6eb9f00d3997d 100644 --- a/.github/workflows/windows_x64_release_xnnpack.yml +++ b/.github/workflows/windows_x64_release_xnnpack.yml @@ -13,7 +13,7 @@ concurrency: jobs: build_x64_release_xnnpack: - runs-on: ["self-hosted", "1ES.Pool=onnxruntime-github-vs2022-mms"] + runs-on: ["self-hosted", "1ES.Pool=onnxruntime-github-vs2022-latest"] timeout-minutes: 300 steps: diff --git a/.github/workflows/windows_x86.yml b/.github/workflows/windows_x86.yml index fa1e9362e2f34..597c1c7f4b6cf 100644 --- a/.github/workflows/windows_x86.yml +++ b/.github/workflows/windows_x86.yml @@ -13,7 +13,7 @@ concurrency: jobs: build_x86_release: - runs-on: ["self-hosted", "1ES.Pool=onnxruntime-github-vs2022-mms"] + runs-on: ["self-hosted", "1ES.Pool=onnxruntime-github-vs2022-latest"] timeout-minutes: 300 steps: diff --git a/cmake/CMakeLists.txt b/cmake/CMakeLists.txt index 116d369885a27..8186da507a442 100644 --- a/cmake/CMakeLists.txt +++ b/cmake/CMakeLists.txt @@ -287,9 +287,13 @@ if (onnxruntime_ENABLE_TRAINING_APIS) endif() -# Single output director for all binaries +# Single output directory for all binaries set(RUNTIME_OUTPUT_DIRECTORY ${CMAKE_BINARY_DIR}/bin CACHE PATH "Single output directory for all binaries.") +# Local mirror directory of cmake dependencies +set(REPO_ROOT ${PROJECT_SOURCE_DIR}/..) +set(onnxruntime_CMAKE_DEPS_MIRROR_DIR ${REPO_ROOT}/mirror CACHE PATH "Path to the local mirror of cmake dependencies") + include(FetchContent) @@ -425,7 +429,6 @@ if (onnxruntime_EXTENDED_MINIMAL_BUILD AND NOT onnxruntime_MINIMAL_BUILD) set(onnxruntime_MINIMAL_BUILD ON) endif() -set(REPO_ROOT ${PROJECT_SOURCE_DIR}/..) 
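The onnxruntime_CMAKE_DEPS_MIRROR_DIR cache variable added above makes the dependency-mirror location configurable instead of hard-coding ${REPO_ROOT}/mirror; the hunk in onnxruntime_external_deps.cmake further below consumes it to rewrite each dependency URL. A standalone C++ sketch of that lookup, for illustration only (the build itself does this in CMake via string(REGEX REPLACE ...) and an EXISTS check):

```cpp
#include <filesystem>
#include <string>

// Sketch of the mirror lookup: rewrite the leading "https://" of a dependency
// URL to the configurable mirror directory and use the local copy only if it
// actually exists; otherwise fall back to the remote URL.
std::string ResolveDependencyUrl(const std::string& url,
                                 const std::filesystem::path& mirror_dir) {
  const std::string prefix = "https://";
  if (url.compare(0, prefix.size(), prefix) == 0) {
    const std::filesystem::path local = mirror_dir / url.substr(prefix.size());
    if (std::filesystem::exists(local)) {
      return local.string();  // e.g. <mirror>/github.com/onnx/onnx/archive/refs/tags/v1.19.1.zip
    }
  }
  return url;  // fall back to fetching from the original URL
}
```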
set(ONNXRUNTIME_ROOT ${PROJECT_SOURCE_DIR}/../onnxruntime) set(ORTTRAINING_ROOT ${PROJECT_SOURCE_DIR}/../orttraining) set(ORTTRAINING_SOURCE_DIR ${ORTTRAINING_ROOT}/orttraining) diff --git a/cmake/deps.txt b/cmake/deps.txt index 7b243ff15cd80..bf76753c1b3c0 100644 --- a/cmake/deps.txt +++ b/cmake/deps.txt @@ -34,7 +34,7 @@ microsoft_gsl;https://github.com/microsoft/GSL/archive/refs/tags/v4.0.0.zip;cf36 microsoft_wil;https://github.com/microsoft/wil/archive/refs/tags/v1.0.230629.1.zip;e4a542a323c070376f7c2d1973d0f7ddbc1d2fa5 mimalloc;https://github.com/microsoft/mimalloc/archive/refs/tags/v2.1.1.zip;d5ee7d34223d0567892db5179849939c8769dc41 mp11;https://github.com/boostorg/mp11/archive/refs/tags/boost-1.82.0.zip;9bc9e01dffb64d9e0773b2e44d2f22c51aace063 -onnx;https://github.com/onnx/onnx/archive/refs/tags/v1.19.0.zip;4c798b73e131438c196e6dcb9f3393968a8936f1 +onnx;https://github.com/onnx/onnx/archive/refs/tags/v1.19.1.zip;c5215b5697dcdfd71799f001b8c4054a6bba6b09 # Use the latest commit of 10.9-GA onnx_tensorrt;https://github.com/onnx/onnx-tensorrt/archive/d5dce67db7c2e64b07e055571f5ec06f7f254de2.zip;01114d3b67650857281fa50faa2e412130a63b69 protobuf;https://github.com/protocolbuffers/protobuf/archive/refs/tags/v21.12.zip;7cf2733949036c7d52fda017badcab093fe73bfa diff --git a/cmake/external/helper_functions.cmake b/cmake/external/helper_functions.cmake index 55059b9500a8e..e8044411e4201 100644 --- a/cmake/external/helper_functions.cmake +++ b/cmake/external/helper_functions.cmake @@ -4,11 +4,11 @@ # 2. Set the cmake property COMPILE_WARNING_AS_ERROR to OFF for these external projects. function(onnxruntime_fetchcontent_declare contentName) + cmake_parse_arguments(PARSE_ARGV 1 ARG "" "URL;SOURCE_SUBDIR" "") + message(STATUS "Fetch ${contentName} from ${ARG_URL}") FetchContent_Declare(${ARGV}) string(TOLOWER ${contentName} contentNameLower) - list(FIND ARGN SOURCE_SUBDIR index_SOURCE_SUBDIR) - if(index_SOURCE_SUBDIR GREATER_EQUAL 0) - cmake_parse_arguments(PARSE_ARGV 1 ARG "" "SOURCE_SUBDIR" "") + if(NOT "${ARG_SOURCE_SUBDIR}" STREQUAL "") set(onnxruntime_${contentNameLower}_cmake_src_dir "${ARG_SOURCE_SUBDIR}" PARENT_SCOPE) endif() endfunction() diff --git a/cmake/external/onnxruntime_external_deps.cmake b/cmake/external/onnxruntime_external_deps.cmake index 8e1a880579b34..b6a741d8b0fe7 100644 --- a/cmake/external/onnxruntime_external_deps.cmake +++ b/cmake/external/onnxruntime_external_deps.cmake @@ -20,7 +20,7 @@ foreach(ONNXRUNTIME_DEP IN LISTS ONNXRUNTIME_DEPS_LIST) if(ONNXRUNTIME_DEP_URL MATCHES "^https://") # Search a local mirror folder - string(REGEX REPLACE "^https://" "${REPO_ROOT}/mirror/" LOCAL_URL "${ONNXRUNTIME_DEP_URL}") + string(REGEX REPLACE "^https://" "${onnxruntime_CMAKE_DEPS_MIRROR_DIR}/" LOCAL_URL "${ONNXRUNTIME_DEP_URL}") if(EXISTS "${LOCAL_URL}") cmake_path(ABSOLUTE_PATH LOCAL_URL) @@ -498,13 +498,7 @@ else() endif() if(Patch_FOUND) - set(ONNXRUNTIME_ONNX_PATCH_COMMAND - ${Patch_EXECUTABLE} --binary --ignore-whitespace -p1 < ${PROJECT_SOURCE_DIR}/patches/onnx/onnx.patch && - # Patch changes from https://github.com/onnx/onnx/pull/7253 to avoid unnecessary rebuilding. - # This change should be included in ONNX 1.19.1. 
- ${Patch_EXECUTABLE} --binary --ignore-whitespace -p1 < - ${PROJECT_SOURCE_DIR}/patches/onnx/avoid_regenerating_proto_files.patch - ) + set(ONNXRUNTIME_ONNX_PATCH_COMMAND ${Patch_EXECUTABLE} --binary --ignore-whitespace -p1 < ${PROJECT_SOURCE_DIR}/patches/onnx/onnx.patch) else() set(ONNXRUNTIME_ONNX_PATCH_COMMAND "") endif() diff --git a/cmake/onnxruntime_providers_cuda.cmake b/cmake/onnxruntime_providers_cuda.cmake index 68a3e9014b7b0..1d31eb1fbd207 100644 --- a/cmake/onnxruntime_providers_cuda.cmake +++ b/cmake/onnxruntime_providers_cuda.cmake @@ -182,8 +182,8 @@ # Since CUDA 12.8, compiling diagnostics become stricter if (CMAKE_CUDA_COMPILER_VERSION VERSION_GREATER_EQUAL 12.8) - target_compile_options(${target} PRIVATE "$<$:--relocatable-device-code=true>") - set_target_properties(${target} PROPERTIES CUDA_SEPARABLE_COMPILATION ON) + target_compile_options(${target} PRIVATE "$<$:--static-global-template-stub=false>") + if (MSVC) target_compile_options(${target} PRIVATE "$<$:SHELL:-Xcompiler /wd4505>") endif() diff --git a/cmake/patches/onnx/avoid_regenerating_proto_files.patch b/cmake/patches/onnx/avoid_regenerating_proto_files.patch deleted file mode 100644 index 804dfeb8f59c2..0000000000000 --- a/cmake/patches/onnx/avoid_regenerating_proto_files.patch +++ /dev/null @@ -1,46 +0,0 @@ -diff --git a/CMakeLists.txt b/CMakeLists.txt -index 479955793..cc3ef1400 100644 ---- a/CMakeLists.txt -+++ b/CMakeLists.txt -@@ -321,7 +321,7 @@ function(RELATIVE_PROTOBUF_GENERATE_CPP SRCS) - set(${SRCS}) - - set(GEN_PROTO_PY "${ONNX_ROOT}/onnx/gen_proto.py") -- set(GENERATED_FILE_TARGETS) -+ set(GENERATED_FILES) - foreach(INFILE ${ARGN}) - set(ABS_FILE "${ONNX_ROOT}/${INFILE}") - get_filename_component(FILE_DIR ${ABS_FILE} DIRECTORY) -@@ -371,12 +371,11 @@ function(RELATIVE_PROTOBUF_GENERATE_CPP SRCS) - list(APPEND GEN_PROTO_ARGS "${ONNX_PROTOC_EXECUTABLE}") - endif() - -- add_custom_target("${GENERATED_FILE_WE}_proto_file" -- COMMAND ${ONNX_PYTHON_INTERPRETER} "${GEN_PROTO_PY}" ${GEN_PROTO_ARGS} -- BYPRODUCTS "${GENERATED_PROTO}" -- DEPENDS ${INFILE} -- COMMENT "Running gen_proto.py on ${INFILE}" -- ) -+ # Use add_custom_command to avoid re-generate of PROTO files -+ add_custom_command(OUTPUT "${GENERATED_PROTO}" -+ COMMAND ${ONNX_PYTHON_INTERPRETER} "${GEN_PROTO_PY}" ${GEN_PROTO_ARGS} -+ DEPENDS ${INFILE} -+ COMMENT "Running gen_proto.py on ${INFILE}") - message("Generated: ${GENERATED_PROTO}") - - set(PROTOC_ARGS -@@ -393,11 +392,10 @@ function(RELATIVE_PROTOBUF_GENERATE_CPP SRCS) - list(APPEND PROTOC_ARGS ${CMAKE_CURRENT_BINARY_DIR}) - endif() - endif() -- list(APPEND GENERATED_FILE_TARGETS ${GENERATED_FILE_WE}_proto_file) -- add_custom_target(${GENERATED_FILE_WE}_src -+ list(APPEND GENERATED_FILES "${GENERATED_PROTO}") -+ add_custom_command(OUTPUT "${OUTPUT_PB_SRC}" - COMMAND "${ONNX_PROTOC_EXECUTABLE}" ${PROTOC_ARGS} -- BYPRODUCTS "${OUTPUT_PB_SRC}" -- DEPENDS ${GENERATED_FILE_TARGETS} -+ DEPENDS ${GENERATED_FILES} - COMMENT "Running C++ protocol buffer compiler on ${GENERATED_PROTO}") - endforeach() - diff --git a/cmake/patches/onnx/onnx.patch b/cmake/patches/onnx/onnx.patch index e8ae766062d08..047cb527bb4da 100644 --- a/cmake/patches/onnx/onnx.patch +++ b/cmake/patches/onnx/onnx.patch @@ -1,5 +1,5 @@ diff --git a/CMakeLists.txt b/CMakeLists.txt -index 47995579..6cc439f6 100644 +index cc3ef140..f70312ba 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -57,6 +57,7 @@ option(ONNX_USE_LITE_PROTO "Use lite protobuf instead of full." 
OFF) @@ -10,7 +10,7 @@ index 47995579..6cc439f6 100644 if(WIN32) option(ONNX_USE_MSVC_STATIC_RUNTIME "Build with MSVC static runtime" OFF) endif() -@@ -411,14 +412,28 @@ relative_protobuf_generate_cpp(ONNX_PROTO_SRCS +@@ -409,14 +410,28 @@ relative_protobuf_generate_cpp(ONNX_PROTO_SRCS add_library(onnx_proto ${ONNX_PROTO_SRCS}) @@ -47,7 +47,7 @@ index 47995579..6cc439f6 100644 # Hide all symbols we don't need set_target_properties(onnx_proto PROPERTIES CXX_VISIBILITY_PRESET hidden) -@@ -440,19 +455,6 @@ add_onnx_global_defines(onnx_proto) +@@ -438,19 +453,6 @@ add_onnx_global_defines(onnx_proto) target_include_directories(onnx_proto PUBLIC $ $) @@ -68,10 +68,10 @@ index 47995579..6cc439f6 100644 if(CMAKE_SYSTEM_NAME STREQUAL "AIX") # whole-archive linker option not available on AIX. diff --git a/onnx/defs/nn/old.cc b/onnx/defs/nn/old.cc -index 40635f97..44770774 100644 +index ad6dd0c1..50259f32 100644 --- a/onnx/defs/nn/old.cc +++ b/onnx/defs/nn/old.cc -@@ -4090,7 +4090,6 @@ ONNX_OPERATOR_SET_SCHEMA( +@@ -4091,7 +4091,6 @@ ONNX_OPERATOR_SET_SCHEMA( GroupNormalization, 18, OpSchema() @@ -80,7 +80,7 @@ index 40635f97..44770774 100644 .Attr("epsilon", "The epsilon value to use to avoid division by zero.", AttributeProto::FLOAT, 1e-5f) .Attr( diff --git a/onnx/defs/schema.h b/onnx/defs/schema.h -index ddd95454..34647987 100644 +index 7e9bc27f..4b87c5a5 100644 --- a/onnx/defs/schema.h +++ b/onnx/defs/schema.h @@ -999,7 +999,7 @@ class OpSchemaRegistry final : public ISchemaRegistry { diff --git a/cmake/vcpkg-ports/onnx/avoid_regenerating_proto_files.patch b/cmake/vcpkg-ports/onnx/avoid_regenerating_proto_files.patch deleted file mode 100644 index 804dfeb8f59c2..0000000000000 --- a/cmake/vcpkg-ports/onnx/avoid_regenerating_proto_files.patch +++ /dev/null @@ -1,46 +0,0 @@ -diff --git a/CMakeLists.txt b/CMakeLists.txt -index 479955793..cc3ef1400 100644 ---- a/CMakeLists.txt -+++ b/CMakeLists.txt -@@ -321,7 +321,7 @@ function(RELATIVE_PROTOBUF_GENERATE_CPP SRCS) - set(${SRCS}) - - set(GEN_PROTO_PY "${ONNX_ROOT}/onnx/gen_proto.py") -- set(GENERATED_FILE_TARGETS) -+ set(GENERATED_FILES) - foreach(INFILE ${ARGN}) - set(ABS_FILE "${ONNX_ROOT}/${INFILE}") - get_filename_component(FILE_DIR ${ABS_FILE} DIRECTORY) -@@ -371,12 +371,11 @@ function(RELATIVE_PROTOBUF_GENERATE_CPP SRCS) - list(APPEND GEN_PROTO_ARGS "${ONNX_PROTOC_EXECUTABLE}") - endif() - -- add_custom_target("${GENERATED_FILE_WE}_proto_file" -- COMMAND ${ONNX_PYTHON_INTERPRETER} "${GEN_PROTO_PY}" ${GEN_PROTO_ARGS} -- BYPRODUCTS "${GENERATED_PROTO}" -- DEPENDS ${INFILE} -- COMMENT "Running gen_proto.py on ${INFILE}" -- ) -+ # Use add_custom_command to avoid re-generate of PROTO files -+ add_custom_command(OUTPUT "${GENERATED_PROTO}" -+ COMMAND ${ONNX_PYTHON_INTERPRETER} "${GEN_PROTO_PY}" ${GEN_PROTO_ARGS} -+ DEPENDS ${INFILE} -+ COMMENT "Running gen_proto.py on ${INFILE}") - message("Generated: ${GENERATED_PROTO}") - - set(PROTOC_ARGS -@@ -393,11 +392,10 @@ function(RELATIVE_PROTOBUF_GENERATE_CPP SRCS) - list(APPEND PROTOC_ARGS ${CMAKE_CURRENT_BINARY_DIR}) - endif() - endif() -- list(APPEND GENERATED_FILE_TARGETS ${GENERATED_FILE_WE}_proto_file) -- add_custom_target(${GENERATED_FILE_WE}_src -+ list(APPEND GENERATED_FILES "${GENERATED_PROTO}") -+ add_custom_command(OUTPUT "${OUTPUT_PB_SRC}" - COMMAND "${ONNX_PROTOC_EXECUTABLE}" ${PROTOC_ARGS} -- BYPRODUCTS "${OUTPUT_PB_SRC}" -- DEPENDS ${GENERATED_FILE_TARGETS} -+ DEPENDS ${GENERATED_FILES} - COMMENT "Running C++ protocol buffer compiler on ${GENERATED_PROTO}") - endforeach() - diff --git 
a/cmake/vcpkg-ports/onnx/binskim.patch b/cmake/vcpkg-ports/onnx/binskim.patch index e8ae766062d08..047cb527bb4da 100644 --- a/cmake/vcpkg-ports/onnx/binskim.patch +++ b/cmake/vcpkg-ports/onnx/binskim.patch @@ -1,5 +1,5 @@ diff --git a/CMakeLists.txt b/CMakeLists.txt -index 47995579..6cc439f6 100644 +index cc3ef140..f70312ba 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -57,6 +57,7 @@ option(ONNX_USE_LITE_PROTO "Use lite protobuf instead of full." OFF) @@ -10,7 +10,7 @@ index 47995579..6cc439f6 100644 if(WIN32) option(ONNX_USE_MSVC_STATIC_RUNTIME "Build with MSVC static runtime" OFF) endif() -@@ -411,14 +412,28 @@ relative_protobuf_generate_cpp(ONNX_PROTO_SRCS +@@ -409,14 +410,28 @@ relative_protobuf_generate_cpp(ONNX_PROTO_SRCS add_library(onnx_proto ${ONNX_PROTO_SRCS}) @@ -47,7 +47,7 @@ index 47995579..6cc439f6 100644 # Hide all symbols we don't need set_target_properties(onnx_proto PROPERTIES CXX_VISIBILITY_PRESET hidden) -@@ -440,19 +455,6 @@ add_onnx_global_defines(onnx_proto) +@@ -438,19 +453,6 @@ add_onnx_global_defines(onnx_proto) target_include_directories(onnx_proto PUBLIC $ $) @@ -68,10 +68,10 @@ index 47995579..6cc439f6 100644 if(CMAKE_SYSTEM_NAME STREQUAL "AIX") # whole-archive linker option not available on AIX. diff --git a/onnx/defs/nn/old.cc b/onnx/defs/nn/old.cc -index 40635f97..44770774 100644 +index ad6dd0c1..50259f32 100644 --- a/onnx/defs/nn/old.cc +++ b/onnx/defs/nn/old.cc -@@ -4090,7 +4090,6 @@ ONNX_OPERATOR_SET_SCHEMA( +@@ -4091,7 +4091,6 @@ ONNX_OPERATOR_SET_SCHEMA( GroupNormalization, 18, OpSchema() @@ -80,7 +80,7 @@ index 40635f97..44770774 100644 .Attr("epsilon", "The epsilon value to use to avoid division by zero.", AttributeProto::FLOAT, 1e-5f) .Attr( diff --git a/onnx/defs/schema.h b/onnx/defs/schema.h -index ddd95454..34647987 100644 +index 7e9bc27f..4b87c5a5 100644 --- a/onnx/defs/schema.h +++ b/onnx/defs/schema.h @@ -999,7 +999,7 @@ class OpSchemaRegistry final : public ISchemaRegistry { diff --git a/cmake/vcpkg-ports/onnx/portfile.cmake b/cmake/vcpkg-ports/onnx/portfile.cmake index 27f5ea5fadd79..882850963a0c0 100644 --- a/cmake/vcpkg-ports/onnx/portfile.cmake +++ b/cmake/vcpkg-ports/onnx/portfile.cmake @@ -4,12 +4,9 @@ vcpkg_from_github( OUT_SOURCE_PATH SOURCE_PATH REPO onnx/onnx REF "v${VERSION}" - SHA512 e6f7b5782a43a91783607549e4d0f0a9cbd46dfb67a602f81aaffc7bcdd8f450fe9c225f0bc314704f2923e396f0df5b03ea91af4a7887203c0b8372bc2749d0 + SHA512 cf6ff4c0bb6cc16ce5f4d6267480d35f3c7a5fde94d10e1358928ff6e4ec6d756a7c5d34a500e60bbd8eb1912c8af21aa763719321b330f56a0eb6b9b810ef60 PATCHES fix-cmakelists.patch - # Patch changes from https://github.com/onnx/onnx/pull/7253 to avoid unnecessary rebuilding. - # This change should be included in ONNX 1.19.1. - avoid_regenerating_proto_files.patch fix-dependency-protobuf.patch binskim.patch ) diff --git a/cmake/vcpkg-ports/onnx/vcpkg.json b/cmake/vcpkg-ports/onnx/vcpkg.json index 350db2e35061a..ad0d1aaf15f51 100644 --- a/cmake/vcpkg-ports/onnx/vcpkg.json +++ b/cmake/vcpkg-ports/onnx/vcpkg.json @@ -1,6 +1,6 @@ { "name": "onnx", - "version-semver": "1.19.0", + "version-semver": "1.19.1", "port-version": 1, "description": "Open standard for machine learning interoperability", "homepage": "https://onnx.ai", diff --git a/docs/How_To_Update_ONNX_Dev_Notes.md b/docs/How_To_Update_ONNX_Dev_Notes.md index 8da19ddc51cb7..8c1280431c384 100644 --- a/docs/How_To_Update_ONNX_Dev_Notes.md +++ b/docs/How_To_Update_ONNX_Dev_Notes.md @@ -35,7 +35,7 @@ git add onnx 1. 
Modify [cmake/vcpkg-ports/onnx/binskim.patch](/cmake/vcpkg-ports/onnx/binskim.patch) to be the same as [cmake/patches/onnx/onnx.patch](/cmake/patches/onnx/onnx.patch). 2. The other patches are required/created by vcpkg repository to build ONNX. We just need to re-run diff to makes sure the patches can be applied in the updated ONNX version. 3. Update [cmake/vcpkg-ports/onnx/portfile.cmake](/cmake/vcpkg-ports/onnx/portfile.cmake) with the correct commit id and SHA512. (alternatively, build it with the wrong SHA and ORT should tell you the expected one.) -4. Upload your package: [Follow the instructions](https://microsoft.sharepoint.com/teams/ONNX2/_layouts/15/Doc.aspx?sourcedoc={170774be-e1c6-4f8b-a3ae-984f211fe410}&action=edit&wd=target%28Development.)one%7C63d3ab47-51d1-4a62-9965-66882234bd44%2FAdd%20or%20Update%20a%20C%2B%2B%20dependency%7Cb6ae6a97-94fc-4436-8fc6-08c21ae895da%2F%29&wdorigin=NavigationUrl +4. Upload your package: [Follow the instructions](https://microsoft.sharepoint.com/:o:/r/teams/ONNX2/_layouts/15/Doc.aspx?sourcedoc=%7B170774BE-E1C6-4F8B-A3AE-984F211FE410%7D&wd=target(Development.one%7C63D3AB47-51D1-4A62-9965-66882234BD44%2FUpdate%20a%20VCPKG%20package%7CB6AE6A97-94FC-4436-8FC6-08C21AE895DA%2F)&wdpartid=%7BB5CF19CC-40FE-0EC7-32B6-8119B427B32A%7D%7B1%7D&wdsectionfileid=%7B9DD25660-A195-48EA-B9E0-DF8B902AFDD7%7D&ovuser=72f988bf-86f1-41af-91ab-2d7cd011db47%2Ctitaiwang%40microsoft.com&clickparams=eyJBcHBOYW1lIjoiVGVhbXMtRGVza3RvcCIsIkFwcFZlcnNpb24iOiI0OS8yNTA5MTExNjAxNiIsIkhhc0ZlZGVyYXRlZFVzZXIiOmZhbHNlfQ%3D%3D&CID=fb9dcaa1-c0b5-1000-5597-c19e3adf468c&cidOR=SPO)one%7C63d3ab47-51d1-4a62-9965-66882234bd44%2FAdd%20or%20Update%20a%20C%2B%2B%20dependency%7Cb6ae6a97-94fc-4436-8fc6-08c21ae895da%2F%29&wdorigin=NavigationUrl Alternatively, directly run Terrapin to upload ONNX package (need SHA512): diff --git a/js/web/docs/webnn-operators.md b/js/web/docs/webnn-operators.md index 295aacc6fffa3..ea88f291e5597 100644 --- a/js/web/docs/webnn-operators.md +++ b/js/web/docs/webnn-operators.md @@ -46,7 +46,7 @@ platforms. 
Check the [WebNN status](https://webmachinelearning.github.io/webnn-s | GatherElements | ai.onnx(11-12, 13+) | gatherElements | | | GatherND | ai.onnx(11, 12, 13+) | gatherND | Only supports 'batch_dims' == 0 | | Gelu | ai.onnx(20+) | gelu | | -| Gemm | ai.onnx(7-8, 9-10, 11-12, 13+) | gemm | Only supports 1-D 'C' input | +| Gemm | ai.onnx(7-8, 9-10, 11-12, 13+) | gemm | | | GlobalAveragePool | ai.onnx(7+) | averagePool2d | Only supports 4-D input | | GlobalMaxPool | ai.onnx(7+) | maxPool2d | Only supports 4-D input | | GlobalLpPool| ai.onnx(7+) | l2Pool2d | Only supports 4-D input, 'p' value is 2 | diff --git a/js/web/test/e2e/exports/testcases/vite-default/package-lock.json b/js/web/test/e2e/exports/testcases/vite-default/package-lock.json index 48f0a8f3e9d5c..e880f6bca2ac4 100644 --- a/js/web/test/e2e/exports/testcases/vite-default/package-lock.json +++ b/js/web/test/e2e/exports/testcases/vite-default/package-lock.json @@ -12,7 +12,7 @@ }, "devDependencies": { "@vitejs/plugin-vue": "^5.2.1", - "vite": "^6.3.5" + "vite": "^6.3.6" } }, "node_modules/@babel/helper-string-parser": { @@ -1114,9 +1114,9 @@ } }, "node_modules/vite": { - "version": "6.3.5", - "resolved": "https://registry.npmjs.org/vite/-/vite-6.3.5.tgz", - "integrity": "sha512-cZn6NDFE7wdTpINgs++ZJ4N49W2vRp8LCKrn3Ob1kYNtOo21vfDoaV5GzBfLU4MovSAB8uNRm4jgzVQZ+mBzPQ==", + "version": "6.3.6", + "resolved": "https://registry.npmjs.org/vite/-/vite-6.3.6.tgz", + "integrity": "sha512-0msEVHJEScQbhkbVTb/4iHZdJ6SXp/AvxL2sjwYQFfBqleHtnCqv1J3sa9zbWz/6kW1m9Tfzn92vW+kZ1WV6QA==", "dev": true, "license": "MIT", "dependencies": { diff --git a/js/web/test/e2e/exports/testcases/vite-default/package.json b/js/web/test/e2e/exports/testcases/vite-default/package.json index f7d5751354905..84013e2aecb88 100644 --- a/js/web/test/e2e/exports/testcases/vite-default/package.json +++ b/js/web/test/e2e/exports/testcases/vite-default/package.json @@ -13,6 +13,6 @@ }, "devDependencies": { "@vitejs/plugin-vue": "^5.2.1", - "vite": "^6.3.5" + "vite": "^6.3.6" } } diff --git a/js/web/test/suite-test-list.jsonc b/js/web/test/suite-test-list.jsonc index 3f1face2a043c..80991a3ebbb5f 100644 --- a/js/web/test/suite-test-list.jsonc +++ b/js/web/test/suite-test-list.jsonc @@ -2147,66 +2147,66 @@ "test_reduce_log_sum_default", "test_reduce_log_sum_desc_axes", // tests "test_reduce_log_sum_exp_*" on opset17/opset18 are excluded because they use float64. 
- // "opset{7,8,9}/test_reduce_log_sum_exp_default_axes_keepdims_example", - // "opset{7,8,9}/test_reduce_log_sum_exp_default_axes_keepdims_random", - // "opset{7,8,9}/test_reduce_log_sum_exp_do_not_keepdims_example", - // "opset{7,8,9}/test_reduce_log_sum_exp_do_not_keepdims_random", - // "opset{7,8,9}/test_reduce_log_sum_exp_keepdims_example", - // "opset{7,8,9}/test_reduce_log_sum_exp_keepdims_random", - // "opset11/test_reduce_log_sum_exp_negative_axes_keepdims_example", - // "opset11/test_reduce_log_sum_exp_negative_axes_keepdims_random", + "opset{7,8,9}/test_reduce_log_sum_exp_default_axes_keepdims_example", + "opset{7,8,9}/test_reduce_log_sum_exp_default_axes_keepdims_random", + "opset{7,8,9}/test_reduce_log_sum_exp_do_not_keepdims_example", + "opset{7,8,9}/test_reduce_log_sum_exp_do_not_keepdims_random", + "opset{7,8,9}/test_reduce_log_sum_exp_keepdims_example", + "opset{7,8,9}/test_reduce_log_sum_exp_keepdims_random", + "opset11/test_reduce_log_sum_exp_negative_axes_keepdims_example", + "opset11/test_reduce_log_sum_exp_negative_axes_keepdims_random", "test_reduce_log_sum_negative_axes", "test_reduce_log_sum", "test_reduce_max_default_axes_keepdim_example", - // "test_reduce_max_default_axes_keepdims_random", - // "test_reduce_max_do_not_keepdims_example", - // "test_reduce_max_do_not_keepdims_random", - // "test_reduce_max_keepdims_example", - // "test_reduce_max_keepdims_random", - // "test_reduce_max_negative_axes_keepdims_example", - // "test_reduce_max_negative_axes_keepdims_random", - // "test_reduce_mean_default_axes_keepdims_example", - // "test_reduce_mean_default_axes_keepdims_random", - // "test_reduce_mean_do_not_keepdims_example", - // "test_reduce_mean_do_not_keepdims_random", - // "test_reduce_mean_keepdims_example", - // "test_reduce_mean_keepdims_random", - // "test_reduce_mean_negative_axes_keepdims_example", - // "test_reduce_mean_negative_axes_keepdims_random", - // "test_reduce_min_default_axes_keepdims_example", - // "test_reduce_min_default_axes_keepdims_random", - // "test_reduce_min_do_not_keepdims_example", - // "test_reduce_min_do_not_keepdims_random", - // "test_reduce_min_keepdims_example", - // "test_reduce_min_keepdims_random", - // "test_reduce_min_negative_axes_keepdims_example", - // "test_reduce_min_negative_axes_keepdims_random", - // "test_reduce_prod_default_axes_keepdims_example", - // "test_reduce_prod_default_axes_keepdims_random", - // "test_reduce_prod_do_not_keepdims_example", - // "test_reduce_prod_do_not_keepdims_random", - // "test_reduce_prod_keepdims_example", - // "test_reduce_prod_keepdims_random", - // "test_reduce_prod_negative_axes_keepdims_example", - // "test_reduce_prod_negative_axes_keepdims_random", - // "test_reduce_sum_default_axes_keepdims_example", - // "test_reduce_sum_default_axes_keepdims_random", - // "test_reduce_sum_do_not_keepdims_example", - // "test_reduce_sum_do_not_keepdims_random", + "test_reduce_max_default_axes_keepdims_random", + "test_reduce_max_do_not_keepdims_example", + "test_reduce_max_do_not_keepdims_random", + "test_reduce_max_keepdims_example", + "test_reduce_max_keepdims_random", + "test_reduce_max_negative_axes_keepdims_example", + "test_reduce_max_negative_axes_keepdims_random", + "test_reduce_mean_default_axes_keepdims_example", + "test_reduce_mean_default_axes_keepdims_random", + "test_reduce_mean_do_not_keepdims_example", + "test_reduce_mean_do_not_keepdims_random", + "test_reduce_mean_keepdims_example", + "test_reduce_mean_keepdims_random", + 
"test_reduce_mean_negative_axes_keepdims_example", + "test_reduce_mean_negative_axes_keepdims_random", + "test_reduce_min_default_axes_keepdims_example", + "test_reduce_min_default_axes_keepdims_random", + "test_reduce_min_do_not_keepdims_example", + "test_reduce_min_do_not_keepdims_random", + "test_reduce_min_keepdims_example", + "test_reduce_min_keepdims_random", + "test_reduce_min_negative_axes_keepdims_example", + "test_reduce_min_negative_axes_keepdims_random", + "test_reduce_prod_default_axes_keepdims_example", + "test_reduce_prod_default_axes_keepdims_random", + "test_reduce_prod_do_not_keepdims_example", + "test_reduce_prod_do_not_keepdims_random", + "test_reduce_prod_keepdims_example", + "test_reduce_prod_keepdims_random", + "test_reduce_prod_negative_axes_keepdims_example", + "test_reduce_prod_negative_axes_keepdims_random", + "test_reduce_sum_default_axes_keepdims_example", + "test_reduce_sum_default_axes_keepdims_random", + "test_reduce_sum_do_not_keepdims_example", + "test_reduce_sum_do_not_keepdims_random", "test_reduce_sum_empty_axes_input_noop_example", "test_reduce_sum_empty_axes_input_noop_random", - // "test_reduce_sum_keepdims_example", - // "test_reduce_sum_keepdims_random", - // "test_reduce_sum_negative_axes_keepdims_example", - // "test_reduce_sum_negative_axes_keepdims_random", - // "test_reduce_sum_square_default_axes_keepdims_example", - // "test_reduce_sum_square_default_axes_keepdims_random", - // "test_reduce_sum_square_do_not_keepdims_example", - // "test_reduce_sum_square_do_not_keepdims_random", - // "test_reduce_sum_square_keepdims_example", - // "test_reduce_sum_square_keepdims_random", - // "test_reduce_sum_square_negative_axes_keepdims_example", - // "test_reduce_sum_square_negative_axes_keepdims_random", + "test_reduce_sum_keepdims_example", + "test_reduce_sum_keepdims_random", + "test_reduce_sum_negative_axes_keepdims_example", + "test_reduce_sum_negative_axes_keepdims_random", + "test_reduce_sum_square_default_axes_keepdims_example", + "test_reduce_sum_square_default_axes_keepdims_random", + "test_reduce_sum_square_do_not_keepdims_example", + "test_reduce_sum_square_do_not_keepdims_random", + "test_reduce_sum_square_keepdims_example", + "test_reduce_sum_square_keepdims_random", + "test_reduce_sum_square_negative_axes_keepdims_example", + "test_reduce_sum_square_negative_axes_keepdims_random", // "test_reflect_pad", "test_relu", "test_reshape_allowzero_reordered", diff --git a/onnxruntime/contrib_ops/cpu/sparse/sparse_attention.cc b/onnxruntime/contrib_ops/cpu/sparse/sparse_attention.cc index 469084e7b4491..c51fc1cf54815 100644 --- a/onnxruntime/contrib_ops/cpu/sparse/sparse_attention.cc +++ b/onnxruntime/contrib_ops/cpu/sparse/sparse_attention.cc @@ -130,6 +130,11 @@ Status SparseAttention::Compute(OpKernelContext* context) const { allocator, batch_size, kv_num_heads_, sequence_length, head_size, value, V)); } + OrtValue RotaryQKV; + OrtValue RotaryQ; + OrtValue RotaryK; + T* q_rotary = Q.GetMutable()->MutableData(); + T* k_rotary = packed_qkv ? 
nullptr : K.GetMutable()->MutableData(); if (do_rotary_) { rotary_embedding_helper::RotaryParameters rotary_params = {}; rotary_params.batch_size = batch_size; @@ -167,30 +172,22 @@ Status SparseAttention::Compute(OpKernelContext* context) const { const T* q_input; const T* k_input; - T* q_rotary; - T* k_rotary; if (packed_qkv) { - OrtValue RotaryQKV; TensorShape qkv_shape({batch_size, num_heads_ + 2 * kv_num_heads_, sequence_length, head_size}); Tensor::InitOrtValue(element_type, qkv_shape, allocator, RotaryQKV); q_input = Q.Get().Data(); k_input = q_input + num_heads_ * sequence_length * head_size; q_rotary = RotaryQKV.GetMutable()->MutableData(); k_rotary = q_rotary + num_heads_ * sequence_length * head_size; - Q = RotaryQKV; } else { - OrtValue RotaryQ; TensorShape q_shape({batch_size, num_heads_, sequence_length, head_size}); Tensor::InitOrtValue(element_type, q_shape, allocator, RotaryQ); - OrtValue RotaryK; TensorShape k_shape({batch_size, kv_num_heads_, sequence_length, head_size}); Tensor::InitOrtValue(element_type, k_shape, allocator, RotaryK); q_input = Q.Get().Data(); k_input = K.Get().Data(); q_rotary = RotaryQ.GetMutable()->MutableData(); k_rotary = RotaryK.GetMutable()->MutableData(); - Q = RotaryQ; - K = RotaryK; } ORT_RETURN_IF_ERROR(RunRotaryEmbedding(tp, rotary_params, q_input, @@ -221,9 +218,8 @@ Status SparseAttention::Compute(OpKernelContext* context) const { ORT_RETURN_IF_ERROR(context->GetTempSpaceAllocator(&allocator)); // Compute the attention score and apply the score to V - return ApplyAttention(Q.Get().Data(), packed_qkv ? nullptr : K.Get().Data(), - packed_qkv ? nullptr : V.Get().Data(), past_key, past_value, - output, present_key, present_value, + return ApplyAttention(q_rotary, packed_qkv ? nullptr : k_rotary, packed_qkv ? 
nullptr : V.Get().Data(), + past_key, past_value, output, present_key, present_value, total_key_lengths, block_row_indices, block_col_indices, parameters, allocator, context); } } // namespace contrib diff --git a/onnxruntime/contrib_ops/webgpu/bert/flash_attention.cc b/onnxruntime/contrib_ops/webgpu/bert/flash_attention.cc index b5c1f73d1678d..a9bd4afc5cd09 100644 --- a/onnxruntime/contrib_ops/webgpu/bert/flash_attention.cc +++ b/onnxruntime/contrib_ops/webgpu/bert/flash_attention.cc @@ -31,6 +31,11 @@ Status CopyKVCacheProgram::GenerateShaderCode(ShaderHelper& shader) const { const auto& present_key = shader.AddOutput("present_key", ShaderUsage::UseUniform | ShaderUsage::UseIndicesTypeAlias); const auto& present_value = shader.AddOutput("present_value", ShaderUsage::UseUniform); const auto& copy_kv_shape = shader.AddIndices("copy_kv_shape"); + // If prepare_indirect_dispatch is enabled, add seqlen_k input and indirect_buffer output + if (prepare_indirect_dispatch_) { + shader.AddInput("seqlen_k", ShaderUsage::None); + shader.AddOutput("indirect_buffer", ShaderUsage::None); + } shader.MainFunctionBody() << shader.GuardAgainstOutOfBoundsWorkgroupSizes("uniforms.copy_size") << " let output_indices = " << copy_kv_shape.OffsetToIndices("global_idx") << ";\n" @@ -38,8 +43,26 @@ Status CopyKVCacheProgram::GenerateShaderCode(ShaderHelper& shader) const { " let sequence_id = output_indices[2];\n" " let num_head_id = output_indices[1];\n" " let batch = output_indices[0];\n"; + if (prepare_indirect_dispatch_) { + shader.MainFunctionBody() << " let total_seq_length = u32(seqlen_k[0u]) + 1u;\n"; + } else { + shader.MainFunctionBody() << " let total_seq_length = uniforms.total_sequence_length;\n"; + } + + // Add indirect dispatch logic for thread 0 + if (prepare_indirect_dispatch_) { + // TODO: Add NormalizeDispatchGroupSize logic here to avoid exceeding max dispatch size. + shader.MainFunctionBody() << " // Prepare indirect dispatch buffer for thread 0\n" + << " if (global_idx == 0u) {\n" + << " let num_total_seq_length_tile = (total_seq_length + uniforms.tile_size - 1u) / uniforms.tile_size;\n" + << " indirect_buffer[0] = num_total_seq_length_tile;\n" + << " indirect_buffer[1] = uniforms.num_heads;\n" + << " indirect_buffer[2] = 1u;\n" + << " }\n\n"; + } + if (has_past_) { - shader.MainFunctionBody() << "let past_sequence_length = uniforms.past_sequence_length;\n"; + shader.MainFunctionBody() << "let past_sequence_length = total_seq_length - uniforms.kv_sequence_length;\n"; if (past_present_share_buffer_) { shader.MainFunctionBody() << " let present_offset = " << present_key.IndicesToOffset("present_key_indices_t(batch, num_head_id, past_sequence_length + sequence_id, head_size_id)") << ";\n" << " let offset = " << key.IndicesToOffset(kv_BNSH_ ? "key_indices_t(batch, num_head_id, sequence_id, head_size_id)" : "key_indices_t(batch, sequence_id, num_head_id, head_size_id)") << ";\n" @@ -70,10 +93,12 @@ Status CopyKVCacheProgram::GenerateShaderCode(ShaderHelper& shader) const { Status CopyKVCache(onnxruntime::webgpu::ComputeContext& context, const WebgpuAttentionParameters& parameters, const Tensor* K, const Tensor* past_key, Tensor* present_key, - const Tensor* V, const Tensor* past_value, Tensor* present_value) { + const Tensor* V, const Tensor* past_value, Tensor* present_value, + uint32_t tile_size, const Tensor* seqlen_k, Tensor* indirect_buffer) { // CopyKVCache takes past key/value and current key/value and copies them to present key and value. 
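A minimal, self-contained sketch (illustrative names, not the ORT API) of the scoping pattern behind the SparseAttention change above: the objects that own the rotated Q/K buffers are declared in the enclosing scope, so the raw q_rotary/k_rotary pointers handed to the later attention call stay valid, and the attention call consumes the rotated data directly instead of re-reading the original Q/K tensors.

```cpp
#include <cstddef>
#include <vector>

void RotaryThenAttention(bool packed_qkv, std::size_t n) {
  std::vector<float> rotary_qkv;            // owners live for the whole function,
  std::vector<float> rotary_q, rotary_k;    // not just inside the branch below
  float* q_rotary = nullptr;
  float* k_rotary = nullptr;
  if (packed_qkv) {
    rotary_qkv.resize(3 * n);
    q_rotary = rotary_qkv.data();
    k_rotary = q_rotary + n;
  } else {
    rotary_q.resize(n);
    rotary_k.resize(n);
    q_rotary = rotary_q.data();
    k_rotary = rotary_k.data();
  }
  // ... rotary embedding writes into q_rotary / k_rotary ...
  // The attention step then reads the rotated buffers directly, e.g.
  // ApplyAttention(q_rotary, packed_qkv ? nullptr : k_rotary, ...);
  (void)q_rotary;
  (void)k_rotary;
}
```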
// This makes it so that FlashAttention only needs to look at present key and value, and saves // number of input buffers in the shader, which we run out of (<=8) without this optimization. + // If indirect_buffer is provided, also prepare indirect dispatch buffer for flash attention. const int components = parameters.head_size_ % 4 == 0 ? 4 : (parameters.head_size_ % 2 == 0 ? 2 : 1); bool has_past = (parameters.total_sequence_length_ - parameters.kv_sequence_length_) > 0; // parameters.total_sequence_length_ is past_sequence_length + kv_sequence_length. @@ -83,7 +108,12 @@ Status CopyKVCache(onnxruntime::webgpu::ComputeContext& context, const WebgpuAtt int copy_sequence_length = has_past && parameters.past_present_share_buffer_ ? parameters.kv_sequence_length_ : parameters.total_sequence_length_; TensorShape copy_kv_shape{parameters.batch_size_, num_heads, copy_sequence_length, parameters.head_size_ / components}; int64_t copy_size = copy_kv_shape.Size(); - CopyKVCacheProgram program{"CopyKVCache", has_past, parameters.qkv_format_ == Q_K_V_BSNH_BNSH_BNSH, parameters.past_present_share_buffer_}; + + // Determine if we need to prepare indirect dispatch + bool prepare_indirect_dispatch = (indirect_buffer != nullptr); + + CopyKVCacheProgram program{"CopyKVCache", has_past, parameters.qkv_format_ == Q_K_V_BSNH_BNSH_BNSH, parameters.past_present_share_buffer_, + prepare_indirect_dispatch}; if (parameters.qkv_format_ == Q_K_V_BSNH_BNSH_BNSH) { program.AddInputs({{K, ProgramTensorMetadataDependency::TypeAndRank, components}, {V, ProgramTensorMetadataDependency::TypeAndRank, components}}); @@ -94,20 +124,31 @@ Status CopyKVCache(onnxruntime::webgpu::ComputeContext& context, const WebgpuAtt program.AddInputs({{K, ProgramTensorMetadataDependency::TypeAndRank, reshaped_KV_shape, components}, {V, ProgramTensorMetadataDependency::TypeAndRank, reshaped_KV_shape, components}}); } + + if (prepare_indirect_dispatch) { + program.AddInput({seqlen_k, ProgramTensorMetadataDependency::None}); + } + if (has_past && !parameters.past_present_share_buffer_) { program.AddInputs({{past_key, ProgramTensorMetadataDependency::TypeAndRank, components}, {past_value, ProgramTensorMetadataDependency::TypeAndRank, components}}); } program.AddOutputs({{present_key, ProgramTensorMetadataDependency::Rank, components}, - {present_value, ProgramTensorMetadataDependency::Rank, components}}) - .AddIndices(std::move(copy_kv_shape)); + {present_value, ProgramTensorMetadataDependency::Rank, components}}); + + if (prepare_indirect_dispatch) { + program.AddOutput({indirect_buffer, ProgramTensorMetadataDependency::None}); + } + + program.AddIndices(std::move(copy_kv_shape)); program.SetDispatchGroupSize(static_cast((copy_size + 63) / 64)) .SetWorkgroupSize(64) - .CacheHint(has_past, parameters.qkv_format_, parameters.past_present_share_buffer_) + .CacheHint(has_past, parameters.qkv_format_, parameters.past_present_share_buffer_, prepare_indirect_dispatch) .AddUniformVariables({{static_cast(copy_size)}, - // Note that when parameters.past_present_share_buffer_ is true, parameters.past_sequence_length_ will become to - // max_sequence_length. To get a valid past_sequence_length, we use total_sequence_length - kv_sequence_length. 
- {static_cast(parameters.total_sequence_length_ - parameters.kv_sequence_length_)}}); + {static_cast(parameters.total_sequence_length_)}, + {static_cast(parameters.kv_sequence_length_)}, + {tile_size}, + {static_cast(parameters.num_heads_)}}); return context.RunProgram(program); } @@ -147,6 +188,9 @@ Status FlashAttentionProgram::GenerateShaderCode(ShaderHelper& shader) const { Status FlashAttentionDecodeQKTProgram::GenerateShaderCode(ShaderHelper& shader) const { shader.AddInput("q", ShaderUsage::UseUniform | ShaderUsage::UseValueTypeAlias | ShaderUsage::UseElementTypeAlias); shader.AddInput("present_key", ShaderUsage::UseUniform | ShaderUsage::UseValueTypeAlias); + if (use_indirect_dispatch_) { + shader.AddInput("seqlens_k", ShaderUsage::None); + } if (has_attention_bias_) { shader.AddInput("attention_bias", ShaderUsage::UseUniform); } @@ -159,23 +203,25 @@ Status FlashAttentionDecodeQKTProgram::GenerateShaderCode(ShaderHelper& shader) WGSL_TEMPLATE_PARAMETER(has_attention_bias, has_attention_bias_), WGSL_TEMPLATE_PARAMETER(sub_tile_count, sub_tile_count), WGSL_TEMPLATE_PARAMETER(tile_size, tile_size_), - WGSL_TEMPLATE_PARAMETER(tile_size_k_vec, tile_size_k_vec)); + WGSL_TEMPLATE_PARAMETER(tile_size_k_vec, tile_size_k_vec), + WGSL_TEMPLATE_PARAMETER(use_indirect_dispatch, use_indirect_dispatch_)); } Status ComputeFlashAttentionDecodeQKT(onnxruntime::webgpu::ComputeContext& context, const Tensor* Q, - const Tensor* attention_bias, Tensor* output, Tensor* present_key, Tensor* metadata, - const WebgpuAttentionParameters& parameters, uint32_t num_total_seq_length_tile, - uint32_t num_present_sequence_length_tile, uint32_t tile_size, - uint32_t present_sequence_length) { + const Tensor* attention_bias, Tensor* output, Tensor* present_key, Tensor* metadata, const Tensor* seqlen_k, + const WebgpuAttentionParameters& parameters, const Tensor* indirect_buffer, uint32_t num_total_seq_length_tile, uint32_t num_present_sequence_length_tile, uint32_t tile_size, bool use_indirect_dispatch, uint32_t present_sequence_length) { const float alpha = parameters.scale_ == 0.0f ? 
1.f / sqrt(static_cast(parameters.head_size_)) : parameters.scale_; const bool has_attention_bias = attention_bias != nullptr; const int components = 4; - FlashAttentionDecodeQKTProgram program{"FlashAttentionDecodeQKT", has_attention_bias, tile_size}; + FlashAttentionDecodeQKTProgram program{"FlashAttentionDecodeQKT", has_attention_bias, tile_size, use_indirect_dispatch}; program.AddInputs({{Q, ProgramTensorMetadataDependency::TypeAndRank, components}, {present_key, ProgramTensorMetadataDependency::TypeAndRank, components}}); + if (use_indirect_dispatch) { + program.AddInput({seqlen_k, ProgramTensorMetadataDependency::None}); + } if (has_attention_bias) { program.AddInput({attention_bias, ProgramTensorMetadataDependency::TypeAndRank}); } @@ -183,15 +229,18 @@ Status ComputeFlashAttentionDecodeQKT(onnxruntime::webgpu::ComputeContext& conte {metadata, ProgramTensorMetadataDependency::Rank, 2}}); const uint32_t vectorized_head_size = parameters.head_size_ / components; - program.SetDispatchGroupSize(parameters.num_heads_ * num_total_seq_length_tile) - .SetWorkgroupSize(64) - .CacheHint(tile_size, has_attention_bias) + if (use_indirect_dispatch) { + program.SetIndirectDispatchTensor(indirect_buffer); + } else { + program.SetDispatchGroupSize(parameters.num_heads_ * num_total_seq_length_tile); + } + program.SetWorkgroupSize(64) + .CacheHint(tile_size, has_attention_bias, use_indirect_dispatch) .AddUniformVariables({{static_cast(vectorized_head_size)}, {static_cast(parameters.total_sequence_length_)}, {static_cast(alpha)}, present_sequence_length, {static_cast(parameters.n_reps)}, - {num_total_seq_length_tile}, {num_present_sequence_length_tile}, {static_cast(parameters.num_heads_)}}); @@ -202,6 +251,9 @@ Status FlashAttentionDecodeSplitVxProgram::GenerateShaderCode(ShaderHelper& shad shader.AddInput("metadata", ShaderUsage::UseUniform); shader.AddInput("qk", ShaderUsage::UseUniform | ShaderUsage::UseValueTypeAlias); shader.AddInput("present_value", ShaderUsage::UseUniform | ShaderUsage::UseValueTypeAlias | ShaderUsage::UseElementTypeAlias); + if (use_indirect_dispatch_) { + shader.AddInput("seqlens_k", ShaderUsage::None); + } shader.AddOutput("out_split_vx", ShaderUsage::UseUniform); const uint32_t tile_size_k_vec = 8u; @@ -210,7 +262,8 @@ Status FlashAttentionDecodeSplitVxProgram::GenerateShaderCode(ShaderHelper& shad WGSL_TEMPLATE_PARAMETER(head_size_vec, head_size_vec_), WGSL_TEMPLATE_PARAMETER(sub_tile_count, WorkgroupSizeX() / tile_size_k_vec), WGSL_TEMPLATE_PARAMETER(tile_size, tile_size_), - WGSL_TEMPLATE_PARAMETER(tile_size_k_vec, tile_size_k_vec)); + WGSL_TEMPLATE_PARAMETER(tile_size_k_vec, tile_size_k_vec), + WGSL_TEMPLATE_PARAMETER(use_indirect_dispatch, use_indirect_dispatch_)); } Status ComputeFlashAttentionDecodeSplitVxScore(onnxruntime::webgpu::ComputeContext& context, @@ -218,26 +271,33 @@ Status ComputeFlashAttentionDecodeSplitVxScore(onnxruntime::webgpu::ComputeConte const Tensor* qk, Tensor* out_split_vx, Tensor* present_value, + const Tensor* seqlen_k, const WebgpuAttentionParameters& parameters, + const Tensor* indirect_buffer, uint32_t num_total_seq_length_tile, uint32_t num_present_sequence_length_tile, uint32_t tile_size, + bool use_indirect_dispatch, uint32_t present_sequence_length) { const int components = 4; int head_size_vec = parameters.v_head_size_ / components; - FlashAttentionDecodeSplitVxProgram program{"FlashAttentionDecodeSplitVx", tile_size, head_size_vec}; + FlashAttentionDecodeSplitVxProgram program{"FlashAttentionDecodeSplitVx", tile_size, head_size_vec, 
use_indirect_dispatch}; program.AddInputs({{metadata, ProgramTensorMetadataDependency::TypeAndRank, 2}, {qk, ProgramTensorMetadataDependency::TypeAndRank}, {present_value, ProgramTensorMetadataDependency::TypeAndRank, components}}); program.AddOutputs({{out_split_vx, ProgramTensorMetadataDependency::TypeAndRank, components}}); // [B, N, split_k, head_size] - program.SetDispatchGroupSize(parameters.num_heads_ * num_total_seq_length_tile) - .CacheHint(tile_size, head_size_vec) + if (use_indirect_dispatch) { + program.AddInput({seqlen_k, ProgramTensorMetadataDependency::None}) + .SetIndirectDispatchTensor(indirect_buffer); + } else { + program.SetDispatchGroupSize(parameters.num_heads_ * num_total_seq_length_tile); + } + program.CacheHint(tile_size, head_size_vec, use_indirect_dispatch) .SetWorkgroupSize(64) .AddUniformVariables({{static_cast(parameters.total_sequence_length_)}, {static_cast(head_size_vec)}, present_sequence_length, {static_cast(parameters.n_reps)}, - num_total_seq_length_tile, num_present_sequence_length_tile, {static_cast(parameters.num_heads_)}}); @@ -246,27 +306,38 @@ Status ComputeFlashAttentionDecodeSplitVxScore(onnxruntime::webgpu::ComputeConte Status FlashAttentionDecodeVxReduceProgram::GenerateShaderCode(ShaderHelper& shader) const { shader.AddInput("input", ShaderUsage::UseUniform); + if (use_indirect_dispatch_) { + shader.AddInput("seqlens_k", ShaderUsage::None); + } shader.AddOutput("output", ShaderUsage::UseUniform | ShaderUsage::UseValueTypeAlias); return WGSL_TEMPLATE_APPLY(shader, "bert/flash_attention_decode_vx_reduce.wgsl.template", - WGSL_TEMPLATE_PARAMETER(tile_size, tile_size_)); + WGSL_TEMPLATE_PARAMETER(seq_tile_size, seq_tile_size_), + WGSL_TEMPLATE_PARAMETER(tile_size, tile_size_), + WGSL_TEMPLATE_PARAMETER(use_indirect_dispatch, use_indirect_dispatch_)); } Status ComputeFlashAttentionDecodeVxReduce(onnxruntime::webgpu::ComputeContext& context, const Tensor* out_split_vx, Tensor* output, + const Tensor* seqlen_k, const WebgpuAttentionParameters& parameters, uint32_t num_total_seq_length_tile, - uint32_t num_present_sequence_length_tile) { + uint32_t num_present_sequence_length_tile, + uint32_t seq_tile_size, + bool use_indirect_dispatch) { const int components = 4; constexpr int tile_size = 8; int tile_head_size = tile_size * components; - FlashAttentionDecodeVxReduceProgram program{"FlashAttentionDecodeVxReduce", tile_size}; + FlashAttentionDecodeVxReduceProgram program{"FlashAttentionDecodeVxReduce", tile_size, seq_tile_size, use_indirect_dispatch}; program.AddInputs({{out_split_vx, ProgramTensorMetadataDependency::TypeAndRank, components}}); + if (use_indirect_dispatch) { + program.AddInput({seqlen_k, ProgramTensorMetadataDependency::None}); + } program.AddOutputs({{output, ProgramTensorMetadataDependency::TypeAndRank, components}}); const uint32_t num_head_size_tile = static_cast((parameters.v_head_size_ + tile_head_size - 1) / tile_head_size); program.SetDispatchGroupSize(parameters.num_heads_ * num_head_size_tile) - .CacheHint(tile_size) + .CacheHint(tile_size, seq_tile_size, use_indirect_dispatch) .SetWorkgroupSize(tile_size * tile_size) .AddUniformVariables({{static_cast(parameters.v_head_size_ / components)}, num_total_seq_length_tile, @@ -279,14 +350,15 @@ Status ComputeFlashAttentionDecodeVxReduce(onnxruntime::webgpu::ComputeContext& Status ApplyFlashAttention(const Tensor* Q, const Tensor* K, const Tensor* V, const Tensor* attention_bias, Tensor* output, const Tensor* past_key, Tensor* present_key, const Tensor* past_value, Tensor* 
present_value, - const WebgpuAttentionParameters& parameters, onnxruntime::webgpu::ComputeContext& context) { - ORT_RETURN_IF_ERROR(CopyKVCache(context, parameters, K, past_key, present_key, V, past_value, present_value)); - + const WebgpuAttentionParameters& parameters, onnxruntime::webgpu::ComputeContext& context, const Tensor* seqlen_k) { // Extract present_sequence_length directly from present_key tensor shape: // (batch_size, num_heads, total_sequence_length/max_sequence_length, head_size) const uint32_t present_sequence_length = static_cast(present_key->Shape()[2]); + if (parameters.sequence_length_ > 1) { const uint32_t tile_size = 64; + // For encode path, use the original CopyKVCache without indirect dispatch preparation + ORT_RETURN_IF_ERROR(CopyKVCache(context, parameters, K, past_key, present_key, V, past_value, present_value, tile_size, seqlen_k, nullptr)); bool has_attention_bias = attention_bias != nullptr; bool is_qualcomm = context.AdapterInfo().vendor == std::string_view{"qualcomm"}; bool is_nvidia = context.AdapterInfo().vendor == std::string_view{"nvidia"}; @@ -323,7 +395,7 @@ Status ApplyFlashAttention(const Tensor* Q, const Tensor* K, const Tensor* V, co return context.RunProgram(program); } - // Use present_sequence_length instead of total_sequence_length to make sure the |qk| buffer is static when static qv cache is enabled. + // For decode path (sequence_length == 1) const TensorShapeVector qk_dims({parameters.batch_size_, parameters.num_heads_, parameters.sequence_length_, present_sequence_length}); const TensorShape qk_shape(qk_dims); @@ -331,21 +403,48 @@ Status ApplyFlashAttention(const Tensor* Q, const Tensor* K, const Tensor* V, co constexpr uint32_t tile_size = 64; const uint32_t num_total_seq_length_tile = (parameters.total_sequence_length_ + tile_size - 1) / tile_size; const uint32_t num_present_sequence_length_tile = (present_sequence_length + tile_size - 1) / tile_size; + + // Determine if we should use indirect dispatch + const bool use_indirect_dispatch = parameters.past_present_share_buffer_ && + seqlen_k != nullptr && + context.IsGraphCaptureEnabled(); + + // Create indirect dispatch buffer if using indirect dispatch + Tensor* indirect_buffer_ptr = nullptr; + Tensor indirect_buffer; + if (use_indirect_dispatch) { + const TensorShape indirect_buffer_shape{3}; // 3 uint32 values for dispatch dimensions + indirect_buffer = context.CreateGPUTensor(DataTypeImpl::GetType(), indirect_buffer_shape); + indirect_buffer_ptr = &indirect_buffer; + // Use the fused CopyKVCache that also prepares the indirect dispatch buffer + ORT_RETURN_IF_ERROR(CopyKVCache(context, parameters, K, past_key, present_key, V, past_value, present_value, tile_size, seqlen_k, indirect_buffer_ptr)); + } else { + // Use the original CopyKVCache without indirect dispatch preparation + ORT_RETURN_IF_ERROR(CopyKVCache(context, parameters, K, past_key, present_key, V, past_value, present_value, tile_size, seqlen_k, nullptr)); + } + // The metadata is used to store the max and sum of each tile. 
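The decode path sizes its grid with a ceiling division over a fixed tile of 64 sequence rows, and when indirect dispatch is used the same quantity is derived on the GPU from seqlens_k and written into the 3 x uint32 indirect buffer created above. A minimal standalone sketch of that tile math and dispatch-argument layout, using hypothetical names (kTileSize, MakeDispatchArgs) rather than ONNX Runtime's:

    #include <array>
    #include <cstdint>

    constexpr uint32_t kTileSize = 64;  // matches the tile_size used by the decode programs

    // Ceiling division used for num_total_seq_length_tile / num_present_sequence_length_tile.
    constexpr uint32_t CeilDiv(uint32_t a, uint32_t b) { return (a + b - 1) / b; }

    // What a CopyKVCache-style pass conceptually writes so that an indirect dispatch
    // launches num_heads * num_total_seq_length_tile workgroups along x (y = z = 1).
    std::array<uint32_t, 3> MakeDispatchArgs(uint32_t total_sequence_length, uint32_t num_heads) {
      const uint32_t num_total_seq_length_tile = CeilDiv(total_sequence_length, kTileSize);
      return {num_heads * num_total_seq_length_tile, 1u, 1u};
    }

Computing the workgroup count on the GPU is what lets a captured graph stay valid as the KV cache grows, which appears to be why indirect dispatch is gated on past_present_share_buffer_, seqlen_k, and IsGraphCaptureEnabled().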
const TensorShapeVector metadata_dims({parameters.batch_size_, parameters.num_heads_, num_present_sequence_length_tile, 2}); const TensorShape metadata_shape(metadata_dims); Tensor metadata = context.CreateGPUTensor(DataTypeImpl::GetType(), metadata_shape); - ORT_RETURN_IF_ERROR(ComputeFlashAttentionDecodeQKT(context, Q, attention_bias, &qk, present_key, &metadata, - parameters, num_total_seq_length_tile, num_present_sequence_length_tile, tile_size, + ORT_RETURN_IF_ERROR(ComputeFlashAttentionDecodeQKT(context, Q, attention_bias, &qk, present_key, &metadata, seqlen_k, + parameters, indirect_buffer_ptr, num_total_seq_length_tile, + num_present_sequence_length_tile, tile_size, use_indirect_dispatch, present_sequence_length)); - const TensorShapeVector out_split_vx_dims({parameters.batch_size_, parameters.num_heads_, num_present_sequence_length_tile, parameters.head_size_}); + const TensorShapeVector out_split_vx_dims({parameters.batch_size_, parameters.num_heads_, + num_present_sequence_length_tile, parameters.head_size_}); const TensorShape out_split_vx_shape(out_split_vx_dims); Tensor out_split_vx = context.CreateGPUTensor(Q->DataType(), out_split_vx_shape); - ORT_RETURN_IF_ERROR(ComputeFlashAttentionDecodeSplitVxScore(context, &metadata, &qk, &out_split_vx, present_value, parameters, - num_total_seq_length_tile, num_present_sequence_length_tile, tile_size, present_sequence_length)); - ORT_RETURN_IF_ERROR(ComputeFlashAttentionDecodeVxReduce(context, &out_split_vx, output, parameters, num_total_seq_length_tile, num_present_sequence_length_tile)); + ORT_RETURN_IF_ERROR(ComputeFlashAttentionDecodeSplitVxScore(context, &metadata, &qk, &out_split_vx, present_value, + seqlen_k, parameters, indirect_buffer_ptr, + num_total_seq_length_tile, + num_present_sequence_length_tile, tile_size, + use_indirect_dispatch, present_sequence_length)); + ORT_RETURN_IF_ERROR(ComputeFlashAttentionDecodeVxReduce(context, &out_split_vx, output, seqlen_k, parameters, + num_total_seq_length_tile, + num_present_sequence_length_tile, tile_size, use_indirect_dispatch)); return Status::OK(); } diff --git a/onnxruntime/contrib_ops/webgpu/bert/flash_attention.h b/onnxruntime/contrib_ops/webgpu/bert/flash_attention.h index c75494df253c1..7d71dc0f4d42d 100644 --- a/onnxruntime/contrib_ops/webgpu/bert/flash_attention.h +++ b/onnxruntime/contrib_ops/webgpu/bert/flash_attention.h @@ -17,19 +17,24 @@ using namespace onnxruntime::webgpu; class CopyKVCacheProgram final : public Program { public: - CopyKVCacheProgram(const std::string& kernel_name, bool has_past, bool kv_BNSH, bool past_present_share_buffer) - : Program{kernel_name}, has_past_(has_past), kv_BNSH_(kv_BNSH), past_present_share_buffer_(past_present_share_buffer) { + CopyKVCacheProgram(const std::string& kernel_name, bool has_past, bool kv_BNSH, bool past_present_share_buffer, + bool prepare_indirect_dispatch = false) + : Program{kernel_name}, has_past_(has_past), kv_BNSH_(kv_BNSH), past_present_share_buffer_(past_present_share_buffer), prepare_indirect_dispatch_(prepare_indirect_dispatch) { } Status GenerateShaderCode(ShaderHelper& sh) const override; WEBGPU_PROGRAM_DEFINE_UNIFORM_VARIABLES({"copy_size", ProgramUniformVariableDataType::Uint32}, - {"past_sequence_length", ProgramUniformVariableDataType::Uint32}); + {"total_sequence_length", ProgramUniformVariableDataType::Uint32}, + {"kv_sequence_length", ProgramUniformVariableDataType::Uint32}, + {"tile_size", ProgramUniformVariableDataType::Uint32}, + {"num_heads", ProgramUniformVariableDataType::Uint32}); private: 
bool has_past_; bool kv_BNSH_; bool past_present_share_buffer_; + bool prepare_indirect_dispatch_; }; class FlashAttentionProgram final : public Program { @@ -75,8 +80,8 @@ class FlashAttentionProgram final : public Program { class FlashAttentionDecodeQKTProgram final : public Program { public: FlashAttentionDecodeQKTProgram(const std::string& kernel_name, - bool has_attention_bias, uint32_t tile_size) - : Program{kernel_name}, has_attention_bias_(has_attention_bias), tile_size_(tile_size) { + bool has_attention_bias, uint32_t tile_size, bool use_indirect_dispatch) + : Program{kernel_name}, has_attention_bias_(has_attention_bias), tile_size_(tile_size), use_indirect_dispatch_(use_indirect_dispatch) { } Status GenerateShaderCode(ShaderHelper& sh) const override; @@ -86,19 +91,19 @@ class FlashAttentionDecodeQKTProgram final : public Program { public: - FlashAttentionDecodeSplitVxProgram(const std::string& kernel_name, uint32_t tile_size, int head_size_vec) - : Program{kernel_name}, tile_size_(tile_size), head_size_vec_(head_size_vec) { + FlashAttentionDecodeSplitVxProgram(const std::string& kernel_name, uint32_t tile_size, int head_size_vec, bool use_indirect_dispatch) + : Program{kernel_name}, tile_size_(tile_size), head_size_vec_(head_size_vec), use_indirect_dispatch_(use_indirect_dispatch) { } Status GenerateShaderCode(ShaderHelper& sh) const override; @@ -107,19 +112,19 @@ class FlashAttentionDecodeSplitVxProgram final : public Program { public: - FlashAttentionDecodeVxReduceProgram(const std::string& kernel_name, uint32_t tile_size) - : Program{kernel_name}, tile_size_(tile_size) { + FlashAttentionDecodeVxReduceProgram(const std::string& kernel_name, uint32_t tile_size, uint32_t seq_tile_size, bool use_indirect_dispatch) + : Program{kernel_name}, tile_size_(tile_size), seq_tile_size_(seq_tile_size), use_indirect_dispatch_(use_indirect_dispatch) { } Status GenerateShaderCode(ShaderHelper& sh) const override; @@ -132,11 +137,13 @@ class FlashAttentionDecodeVxReduceProgram final : public Program tile_qk: array; $MAIN { let local_row = u32(local_idx / tile_size_k_vec); let local_col = local_idx % tile_size_k_vec; - let total_seq_offset = (workgroup_idx % uniforms.num_total_seq_length_tile) * tile_size; - let head_idx = u32(workgroup_idx / uniforms.num_total_seq_length_tile); +#if use_indirect_dispatch + let total_sequence_length = u32(seqlens_k[0]) + 1u; +#else + let total_sequence_length = uniforms.total_sequence_length; +#endif + let num_total_seq_length_tile = (total_sequence_length + tile_size - 1) / tile_size; + let total_seq_offset = (workgroup_idx % num_total_seq_length_tile) * tile_size; + let head_idx = u32(workgroup_idx / num_total_seq_length_tile); let q_offset = head_idx * uniforms.head_size_vec; - var total_sequence_length = uniforms.total_sequence_length; let present_offset = u32(head_idx / uniforms.n_reps) * uniforms.present_sequence_length * uniforms.head_size_vec; for (var k: u32 = 0u; k < uniforms.head_size_vec; k += tile_size_k_vec) { if (local_idx < tile_size_k_vec && k + local_idx < uniforms.head_size_vec) { @@ -95,7 +101,7 @@ $MAIN { for (var i = 0u; i < tile_size && (total_seq_offset + i) < total_sequence_length; i++) { l_sum += exp(f32(tile_qk[i]) - l_max); } - let meta_offset = head_idx * uniforms.num_present_sequence_length_tile + workgroup_idx % uniforms.num_total_seq_length_tile; + let meta_offset = head_idx * uniforms.num_present_sequence_length_tile + workgroup_idx % num_total_seq_length_tile; metadata[meta_offset] = metadata_value_t(l_max, l_sum); } } diff 
--git a/onnxruntime/contrib_ops/webgpu/bert/flash_attention_decode_split_vx.wgsl.template b/onnxruntime/contrib_ops/webgpu/bert/flash_attention_decode_split_vx.wgsl.template index c7593af311ce2..37cf7e8f11b1f 100644 --- a/onnxruntime/contrib_ops/webgpu/bert/flash_attention_decode_split_vx.wgsl.template +++ b/onnxruntime/contrib_ops/webgpu/bert/flash_attention_decode_split_vx.wgsl.template @@ -5,6 +5,7 @@ #param head_size_vec #param tile_size_k_vec #param sub_tile_count +#param use_indirect_dispatch // Note that this shader adopts similar algorithm with dp4a generation shader. // @@ -40,9 +41,14 @@ var qkv_values: array, $MAIN { let local_row = u32(local_idx / tile_size_k_vec); let local_col = local_idx % tile_size_k_vec; - let total_seq_offset = (workgroup_idx % uniforms.num_total_seq_length_tile) * tile_size; - let head_idx = u32(workgroup_idx / uniforms.num_total_seq_length_tile); - var total_sequence_length = uniforms.total_sequence_length; + #if use_indirect_dispatch + let total_sequence_length = u32(seqlens_k[0]) + 1u; + #else + let total_sequence_length = uniforms.total_sequence_length; + #endif + let num_total_seq_length_tile = (total_sequence_length + tile_size - 1) / tile_size; + let total_seq_offset = (workgroup_idx % num_total_seq_length_tile) * tile_size; + let head_idx = u32(workgroup_idx / num_total_seq_length_tile); let present_offset = u32(head_idx / uniforms.n_reps) * head_size_vec * uniforms.present_sequence_length; // Calculate the global max and sum in qk. @@ -50,12 +56,12 @@ $MAIN { { var g_max = f32(-3.402823e+38f); var g_sum = f32(0); - for (var i = 0u; i < uniforms.num_total_seq_length_tile; i++) + for (var i = 0u; i < num_total_seq_length_tile; i++) { let meta_offset = head_idx * uniforms.num_present_sequence_length_tile + i; g_max = max(g_max, metadata[meta_offset].x); } - for (var i = 0u; i < uniforms.num_total_seq_length_tile; i++) + for (var i = 0u; i < num_total_seq_length_tile; i++) { let meta_offset = head_idx * uniforms.num_present_sequence_length_tile + i; let m_value = metadata[meta_offset]; @@ -95,7 +101,7 @@ $MAIN { } for (var i = local_idx; i < head_size_vec; i += workgroup_size_x) { - let out_offset = head_idx * uniforms.num_present_sequence_length_tile * head_size_vec + (workgroup_idx % uniforms.num_total_seq_length_tile) * head_size_vec + i; + let out_offset = head_idx * uniforms.num_present_sequence_length_tile * head_size_vec + (workgroup_idx % num_total_seq_length_tile) * head_size_vec + i; out_split_vx[out_offset] = tile_output[i]; } } diff --git a/onnxruntime/contrib_ops/webgpu/bert/flash_attention_decode_vx_reduce.wgsl.template b/onnxruntime/contrib_ops/webgpu/bert/flash_attention_decode_vx_reduce.wgsl.template index a4381baa638ce..22f18655307de 100644 --- a/onnxruntime/contrib_ops/webgpu/bert/flash_attention_decode_vx_reduce.wgsl.template +++ b/onnxruntime/contrib_ops/webgpu/bert/flash_attention_decode_vx_reduce.wgsl.template @@ -1,7 +1,9 @@ // Copyright (c) Microsoft Corporation. All rights reserved. // Licensed under the MIT License. +#param seq_tile_size #param tile_size +#param use_indirect_dispatch // Inputs are splits of the GQA output, split into num_total_seq_length_tiles // rows. 
This shader needs to add these splits across the row dimension to @@ -23,10 +25,16 @@ $MAIN { var value = output_value_t(0); let local_row = u32(local_idx / tile_size); let local_col = local_idx % tile_size; + #if use_indirect_dispatch + let total_sequence_length = u32(seqlens_k[0]) + 1u; + let num_total_seq_length_tile = (total_sequence_length + seq_tile_size - 1) / seq_tile_size; + #else + let num_total_seq_length_tile = uniforms.num_total_seq_length_tile; + #endif if (head_size_offset + local_col < uniforms.head_size_vec) { - for (var r = 0u; r < uniforms.num_total_seq_length_tile; r += tile_size) { - if (r + local_row < uniforms.num_total_seq_length_tile) { + for (var r = 0u; r < num_total_seq_length_tile; r += tile_size) { + if (r + local_row < num_total_seq_length_tile) { value += input[in_offset + (r + local_row) * uniforms.head_size_vec + head_size_offset + local_col]; } } diff --git a/onnxruntime/contrib_ops/webgpu/bert/group_query_attention.cc b/onnxruntime/contrib_ops/webgpu/bert/group_query_attention.cc index 8b7b257dd2852..cb845061404f3 100644 --- a/onnxruntime/contrib_ops/webgpu/bert/group_query_attention.cc +++ b/onnxruntime/contrib_ops/webgpu/bert/group_query_attention.cc @@ -206,7 +206,7 @@ Status GroupQueryAttention::ComputeInternal(onnxruntime::webgpu::ComputeContext& !use_sliding_window && CanApplyFlashAttention(attention_bias, present_key, present_value, parameters, context)) { return ApplyFlashAttention(query, key, value, attention_bias, output, past_key, present_key, past_value, - present_value, parameters, context); + present_value, parameters, context, seqlen_k); } Tensor qSplit; diff --git a/onnxruntime/core/graph/graph.cc b/onnxruntime/core/graph/graph.cc index 3f6443aa73d4c..8b599dc86d997 100644 --- a/onnxruntime/core/graph/graph.cc +++ b/onnxruntime/core/graph/graph.cc @@ -2678,6 +2678,27 @@ class InferenceContextImpl : public ONNX_NAMESPACE::InferenceContext { // only return data if it's for a constant initializer. checks for outer scope initializers // if this is a subgraph and the name isn't found locally. 
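ONNX shape inference reads tensor data straight out of a TensorProto, so an initializer whose bytes live only in an OrtValue (in-memory external data) has to be copied back into a self-contained proto before inference can consume it, which is what the materialization path below does. A small illustrative sketch of packing a buffer into an inline TensorProto; this is a stand-in, not ORT's utils::TensorToTensorProto:

    #include <cstdint>
    #include <string>
    #include <vector>
    #include "onnx/onnx_pb.h"

    // Build a TensorProto whose data is held inline in raw_data, leaving no
    // external-data reference for shape inference to trip over.
    ONNX_NAMESPACE::TensorProto MaterializeInt64Tensor(const std::string& name,
                                                       const std::vector<int64_t>& values) {
      ONNX_NAMESPACE::TensorProto proto;
      proto.set_name(name);
      proto.set_data_type(ONNX_NAMESPACE::TensorProto_DataType_INT64);
      proto.add_dims(static_cast<int64_t>(values.size()));
      proto.set_raw_data(values.data(), values.size() * sizeof(int64_t));
      return proto;
    }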
const TensorProto* initializer = graph_.GetConstantInitializer(def->Name(), true); + if (initializer != nullptr) { + // Check if this is in-memory external data (data stored in OrtValue) + // ONNX shape inference cannot handle external data, so we need to materialize it + if (utils::HasExternalDataInMemory(*initializer)) { + // Try to get the OrtValue for this initializer + OrtValue ort_value; + if (graph_.GetOrtValueInitializer(def->Name(), ort_value, true)) { + // Create a temporary TensorProto with the actual data from the OrtValue + // This allows ONNX shape inference to access the data + const Tensor& tensor = ort_value.Get(); + auto temp_tensor_proto = utils::TensorToTensorProto(tensor, initializer->name(), /*use_tensor_buffer=*/false); + // Store the temporary proto so it outlives this call, keeping the pointers stable + temp_tensor_protos_.push_back(std::make_unique(std::move(temp_tensor_proto))); + return temp_tensor_protos_.back().get(); + } else { + // If we can't get the OrtValue, it is a bug + ORT_THROW("Initializer ", def->Name(), + " has in-memory external data but cannot get OrtValue during shape inference"); + } + } + } return initializer; } @@ -2717,6 +2738,11 @@ class InferenceContextImpl : public ONNX_NAMESPACE::InferenceContext { std::vector> graph_inferencers_; const Graph& graph_; const Graph::ResolveOptions& options_; + // Temporary TensorProtos created for in-memory external data during shape inference + // These need to outlive the shape inference call, so we store them here + // Inference is per node and the instance of this context is on the stack, + // so this is safe. + mutable InlinedVector> temp_tensor_protos_; }; Status Graph::InferAndVerifySubgraphTypes(const Node& node, Graph& subgraph, diff --git a/onnxruntime/core/mlas/lib/kleidiai/mlasi_kleidiai.h b/onnxruntime/core/mlas/lib/kleidiai/mlasi_kleidiai.h index 5136061c4769d..2e9c4574fd057 100644 --- a/onnxruntime/core/mlas/lib/kleidiai/mlasi_kleidiai.h +++ b/onnxruntime/core/mlas/lib/kleidiai/mlasi_kleidiai.h @@ -115,3 +115,37 @@ MlasConv( MLAS_THREADPOOL* ThreadPool ); } + +/*++ + +Routine Description: + + This routine determines if a wraparound will occur when multiplying two size_t variables. + Uses __builtin_mul_overflow if available on the current system and, if not, falls back + to a default implementation to check this wraparound. + +Arguments: + + a - Supplies the first number to be multiplied. + + b - Supplies the second number to be multiplied. + + out - Supplies a pointer to a size_t which receives the product on success. + +Return Value: + + Returns false if the operation was successful. + Returns true if wraparound of size_t was detected. + +--*/ +inline bool mul_overflow_size_t_builtin(size_t a, size_t b, size_t* out) { +#if defined(__has_builtin) +# if __has_builtin(__builtin_mul_overflow) + return __builtin_mul_overflow(a, b, out); +# endif +#endif + // Fall back to a manual check if the builtin is not available + if (b != 0 && a > SIZE_MAX / b) return true; + if (out) *out = a * b; + return false; +} diff --git a/onnxruntime/core/mlas/lib/kleidiai/sgemm_kleidiai.cpp b/onnxruntime/core/mlas/lib/kleidiai/sgemm_kleidiai.cpp index ea38f16205a7c..435ff1fb10017 100644 --- a/onnxruntime/core/mlas/lib/kleidiai/sgemm_kleidiai.cpp +++ b/onnxruntime/core/mlas/lib/kleidiai/sgemm_kleidiai.cpp @@ -14,6 +14,16 @@ #include "kai/ukernels/matmul/pack/kai_rhs_pack_nxk_f32p2vlx1biasf32_f32_f32_sme.h" #include "mlasi_kleidiai.h" + +// Thread-local reusable buffers to reduce allocation overhead across tiles.
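A short usage sketch for the overflow-checked multiply defined above. The helper is header-only, so the example below just exercises the same manual fallback logic in a standalone program (illustrative only):

    #include <cstddef>
    #include <cstdint>
    #include <cstdio>

    // Same detection logic as the manual fallback in mul_overflow_size_t_builtin.
    static bool checked_mul(std::size_t a, std::size_t b, std::size_t* out) {
      if (b != 0 && a > SIZE_MAX / b) return true;  // wraparound would occur
      if (out) *out = a * b;
      return false;
    }

    int main() {
      std::size_t bytes = 0;
      if (checked_mul(SIZE_MAX / 2 + 1, 2, &bytes)) {
        std::puts("overflow detected: fall back to the unpacked MLAS path");
      } else {
        std::printf("allocation size: %zu bytes\n", bytes);
      }
      return 0;
    }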
+struct KaiTlsBuffers { + std::vector output_tile; + std::vector bias_zero; + std::vector rhs_packed; + std::vector lhs_packed; +}; +static thread_local KaiTlsBuffers g_kai_tls; + size_t MLASCALL ArmKleidiAI::MlasGemmPackBSize( @@ -51,7 +61,6 @@ Return Value: // Compute the number of bytes required to hold the packed buffer. // size_t bytes = 0; - if (TransA == CblasNoTrans) { switch (TransB) { case CblasNoTrans: @@ -125,15 +134,15 @@ Return Value: const size_t sr = UseSME2 ? kai_get_sr_matmul_clamp_f32_f32p2vlx1_f32p2vlx1biasf32_sme2_mopa() : kai_get_sr_matmul_clamp_f32_f32p2vlx1_f32p2vlx1b_2vlx2vl_sme_mopa(); - // pass zeroed bias values - const std::vector bias(N); + // Ensure size and zero the used span. + g_kai_tls.bias_zero.resize(N, 0.0f); switch (TransB) { case CblasNoTrans: - kai_run_rhs_pack_kxn_f32p2vlx1biasf32_f32_f32_sme(1, N, K, nr, kr, sr, ldb * sizeof(float), B, bias.data(), nullptr, PackedB, 0, nullptr); + kai_run_rhs_pack_kxn_f32p2vlx1biasf32_f32_f32_sme(1, N, K, nr, kr, sr, ldb * sizeof(float), B, g_kai_tls.bias_zero.data(), nullptr, PackedB, 0, nullptr); break; case CblasTrans: - kai_run_rhs_pack_nxk_f32p2vlx1biasf32_f32_f32_sme(1, N, K, nr, kr, sr, ldb * sizeof(float), B, bias.data(), nullptr, PackedB, 0, nullptr); + kai_run_rhs_pack_nxk_f32p2vlx1biasf32_f32_f32_sme(1, N, K, nr, kr, sr, ldb * sizeof(float), B, g_kai_tls.bias_zero.data(), nullptr, PackedB, 0, nullptr); break; default: return false; @@ -225,22 +234,29 @@ Return Value: size_t n_step = UseSME2 ? kai_get_n_step_matmul_clamp_f32_f32p2vlx1_f32p2vlx1biasf32_sme2_mopa() : kai_get_n_step_matmul_clamp_f32_f32p2vlx1_f32p2vlx1b_2vlx2vl_sme_mopa(); - if (M < m_step && N < n_step && !Data->BIsPacked) { + if ((M < m_step || N < n_step) && !Data->BIsPacked) { // Fallback to MLAS return false; } - std::vector KaiPackedData; - KaiPackedData.resize(BatchSize); - size_t LhsPackedStride = 0; std::byte* LhsPackedData = nullptr; LhsPackedStride = kai_get_lhs_packed_size_lhs_pack_f32p2vlx1_f32_sme(M, K, mr, kr, sr); - auto LhsPacked = std::make_unique(LhsPackedStride * BatchSize); - LhsPackedData = LhsPacked.get(); - std::unique_ptr RhsPacked{nullptr}; + size_t lhs_resize = 0; + if(mul_overflow_size_t_builtin(LhsPackedStride, BatchSize, &lhs_resize)) + { + // size_t wraparound detected for LhsPackedStride, fallback to MLAS + return false; + } + + g_kai_tls.lhs_packed.resize(lhs_resize); + LhsPackedData = g_kai_tls.lhs_packed.data(); + + // RHS packed buffer: use TLS reusable vector to minimize allocations + size_t RhsPackedStride = 0; + std::byte* RhsPackedData = nullptr; // It is assumed all B batches require packing or not if (Data[0].BIsPacked) { @@ -248,36 +264,31 @@ Return Value: MlasTrySimpleParallel(ThreadPool, BatchSize, [&](ptrdiff_t batch_idx) { std::byte* LhsPackedPtr = &(LhsPackedData[LhsPackedStride * batch_idx]); kai_run_lhs_pack_f32p2vlx1_f32_sme(M, K, mr, kr, sr, 0, Data[batch_idx].A, Data[batch_idx].lda * sizeof(float), LhsPackedPtr); - KaiPackedData[batch_idx].A = reinterpret_cast(LhsPackedPtr); - KaiPackedData[batch_idx].B = Data[batch_idx].B; }); } else { // Multithread pack lhs and rhs - size_t RhsPackedStride = 0; - std::byte* RhsPackedData = nullptr; - RhsPackedStride = ArmKleidiAI::MlasGemmPackBSize(TransA, TransB, N, K); - RhsPacked = std::make_unique(RhsPackedStride * BatchSize); - RhsPackedData = RhsPacked.get(); + size_t rhs_resize = 0; + if (mul_overflow_size_t_builtin(RhsPackedStride, BatchSize, &rhs_resize)) + { + // size_t wraparound detected for RhsPackedStride, fallback to MLAS + return false; 
+ } + + g_kai_tls.rhs_packed.resize(rhs_resize); + RhsPackedData = g_kai_tls.rhs_packed.data(); MlasTrySimpleParallel(ThreadPool, BatchSize * 2, [&](ptrdiff_t batch_idx) { - // lhs odd, rhs even if (batch_idx & 0x1) { batch_idx >>= 1; - std::byte* LhsPackedPtr = &(LhsPackedData[LhsPackedStride * batch_idx]); - kai_run_lhs_pack_f32p2vlx1_f32_sme(M, K, mr, kr, sr, 0, Data[batch_idx].A, Data[batch_idx].lda * sizeof(float), LhsPackedPtr); - - KaiPackedData[batch_idx].A = reinterpret_cast(LhsPackedPtr); } else { batch_idx >>= 1; - std::byte* RhsPackedPtr = &(RhsPackedData[RhsPackedStride * batch_idx]); - - ArmKleidiAI::MlasGemmPackB(TransA, TransB, N, K, reinterpret_cast(Data[batch_idx].B), Data[batch_idx].ldb, RhsPackedPtr); - - KaiPackedData[batch_idx].B = reinterpret_cast(RhsPackedPtr); + ArmKleidiAI::MlasGemmPackB(TransA, TransB, N, K, + reinterpret_cast(Data[batch_idx].B), + Data[batch_idx].ldb, RhsPackedPtr); } }); } @@ -303,6 +314,14 @@ Return Value: dim[1] = MlasDivRoundup(M, m_step); dim[2] = MlasDivRoundup(N, n_step); + // Pre-check maximum tile size to avoid per-iteration overflow inside the parallel loop. + // Any TileSizeM/TileSizeN used below will be <= m_step/n_step respectively. + size_t max_tile_elems = 0; + if (mul_overflow_size_t_builtin(m_step, n_step, &max_tile_elems)) { + // size_t wraparound detected for tile size, fallback to MLAS + return false; + } + MlasTrySimpleParallel(ThreadPool, static_cast(dim[0] * dim[1] * dim[2]), [=](ptrdiff_t tid) { // compute B,M,N index from iteration index ptrdiff_t BIdx = tid / (dim[1] * dim[2]); @@ -314,18 +333,18 @@ Return Value: UseSME2 ? kai_get_rhs_packed_offset_matmul_clamp_f32_f32p2vlx1_f32p2vlx1biasf32_sme2_mopa(NIdx * n_step, K) : kai_get_rhs_packed_offset_matmul_clamp_f32_f32p2vlx1_f32p2vlx1b_2vlx2vl_sme_mopa(NIdx * n_step, K); - auto BTile = reinterpret_cast( - reinterpret_cast(KaiPackedData[BIdx].B) + rhs_packed_offset - ); + const std::byte* B_base = Data[0].BIsPacked + ? reinterpret_cast(Data[BIdx].B) + : (RhsPackedData + RhsPackedStride * BIdx); + auto BTile = reinterpret_cast(B_base + rhs_packed_offset); // Get lhs tile, A const size_t lhs_packed_offset = UseSME2 ? kai_get_lhs_packed_offset_matmul_clamp_f32_f32p2vlx1_f32p2vlx1biasf32_sme2_mopa(MIdx * m_step, K) : kai_get_lhs_packed_offset_matmul_clamp_f32_f32p2vlx1_f32p2vlx1b_2vlx2vl_sme_mopa(MIdx * m_step, K); - auto ATile = reinterpret_cast( - reinterpret_cast(KaiPackedData[BIdx].A) + lhs_packed_offset - ); + const std::byte* A_base = LhsPackedData + LhsPackedStride * BIdx; + auto ATile = reinterpret_cast(A_base + lhs_packed_offset); auto TileSizeM = (MIdx + 1) * m_step > M ? (M - MIdx * m_step) : m_step; auto TileSizeN = (NIdx + 1) * n_step > N ? 
(N - NIdx * n_step) : n_step; @@ -336,9 +355,14 @@ Return Value: MIdx * m_step * Data[BIdx].ldc * sizeof(float) + NIdx * n_step * sizeof(float) ); - // Allocate temporary buffer for raw A*B result - std::vector OutputTile(TileSizeM * TileSizeN, 0.0f); - float* temp_tile = OutputTile.data(); + // Allocate temporary buffer for raw A*B result (TLS reusable buffer) + size_t tile_elems = TileSizeM * TileSizeN; + + // resize the tile to the required size + g_kai_tls.output_tile.resize(tile_elems); + + float* temp_tile = g_kai_tls.output_tile.data(); + std::fill_n(temp_tile, tile_elems, 0.0f); if (UseSME2) { kai_run_matmul_clamp_f32_f32p2vlx1_f32p2vlx1biasf32_sme2_mopa( diff --git a/onnxruntime/core/providers/migraphx/migraphx_execution_provider.cc b/onnxruntime/core/providers/migraphx/migraphx_execution_provider.cc index a59347841be95..55f901164bdac 100644 --- a/onnxruntime/core/providers/migraphx/migraphx_execution_provider.cc +++ b/onnxruntime/core/providers/migraphx/migraphx_execution_provider.cc @@ -268,6 +268,7 @@ static bool IsTypeSupported(const NodeArg* node_arg) { case ONNX_NAMESPACE::TensorProto_DataType::TensorProto_DataType_FLOAT16: case ONNX_NAMESPACE::TensorProto_DataType::TensorProto_DataType_BFLOAT16: case ONNX_NAMESPACE::TensorProto_DataType::TensorProto_DataType_FLOAT: + case ONNX_NAMESPACE::TensorProto_DataType::TensorProto_DataType_FLOAT4E2M1: case ONNX_NAMESPACE::TensorProto_DataType::TensorProto_DataType_FLOAT8E4M3FN: case ONNX_NAMESPACE::TensorProto_DataType::TensorProto_DataType_FLOAT8E4M3FNUZ: case ONNX_NAMESPACE::TensorProto_DataType::TensorProto_DataType_FLOAT8E5M2: @@ -318,6 +319,9 @@ static bool getMIGraphXType(ONNXTensorElementDataType type, case ONNX_TENSOR_ELEMENT_DATA_TYPE_FLOAT8E5M2FNUZ: mgx_type = migraphx_shape_fp8e5m2fnuz_type; break; + case ONNX_TENSOR_ELEMENT_DATA_TYPE_FLOAT4E2M1: + mgx_type = migraphx_shape_fp4x2_type; + break; case ONNX_TENSOR_ELEMENT_DATA_TYPE_INT4: mgx_type = migraphx_shape_int8_type; break; @@ -949,6 +953,8 @@ GetUnsupportedNodeIndices(const GraphViewer& graph_viewer, "QLinearAdd", "QLinearConv", "QLinearMatMul", + "QLinearAveragePool", + "QLinearGlobalAveragePool", "QuantizeLinear", "QuickGelu", "DynamicQuantizeLinear", diff --git a/onnxruntime/core/providers/tensorrt/tensorrt_execution_provider.cc b/onnxruntime/core/providers/tensorrt/tensorrt_execution_provider.cc index 508d932459bf9..cd0c0e4bffdb5 100644 --- a/onnxruntime/core/providers/tensorrt/tensorrt_execution_provider.cc +++ b/onnxruntime/core/providers/tensorrt/tensorrt_execution_provider.cc @@ -3976,6 +3976,10 @@ Status TensorrtExecutionProvider::CreateNodeComputeInfoFromGraph(const GraphView // Destroy the IExecutionContext objects before destroying an engine object, otherwise it will lead to undefined behavior. trt_state->context->reset(); trt_state->engine->reset(); + + // Clear dds output allocator map since the engine and context will be recreated. 
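One detail behind the reusable thread_local output tile above: std::vector::resize() value-initializes only newly created elements, so a scratch buffer that shrinks and grows across tiles can retain stale values from a previous tile unless the span in use is cleared, hence the explicit std::fill_n. Minimal illustration with a hypothetical AcquireZeroedTile helper:

    #include <algorithm>
    #include <cstddef>
    #include <vector>

    thread_local std::vector<float> scratch;

    // Returns a pointer to `elems` floats that are guaranteed to be zero.
    float* AcquireZeroedTile(std::size_t elems) {
      scratch.resize(elems);                      // does not re-zero elements that already existed
      std::fill_n(scratch.data(), elems, 0.0f);   // so the used span is cleared explicitly
      return scratch.data();
    }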
+ dds_output_allocator_map.clear(); + auto trt_config = std::unique_ptr(trt_builder->createBuilderConfig()); if (max_workspace_size_ > 0) { trt_config->setMemoryPoolLimit(nvinfer1::MemoryPoolType::kWORKSPACE, max_workspace_size_); diff --git a/onnxruntime/core/providers/webgpu/compute_context.h b/onnxruntime/core/providers/webgpu/compute_context.h index 315115390ff23..c4a88754deffe 100644 --- a/onnxruntime/core/providers/webgpu/compute_context.h +++ b/onnxruntime/core/providers/webgpu/compute_context.h @@ -8,6 +8,7 @@ #include #include "core/framework/execution_provider.h" +#include "core/providers/webgpu/webgpu_execution_provider.h" #include "core/providers/webgpu/program.h" #include "core/providers/webgpu/webgpu_context.h" @@ -16,7 +17,6 @@ namespace onnxruntime { class Tensor; -class WebGpuExecutionProvider; namespace webgpu { @@ -42,6 +42,9 @@ class ComputeContext { inline bool HasFeature(wgpu::FeatureName feature) const { return webgpu_context_.DeviceHasFeature(feature); } + inline bool IsGraphCaptureEnabled() const { + return ep_.IsGraphCaptureEnabled(); + } #if !defined(__wasm__) inline const wgpu::AdapterPropertiesSubgroupMatrixConfigs& SubgroupMatrixConfigs() const { return webgpu_context_.SubgroupMatrixConfigs(); diff --git a/onnxruntime/core/providers/webgpu/shader_variable.cc b/onnxruntime/core/providers/webgpu/shader_variable.cc index 5998c22a0d2ca..aa1f6c9a0ec0b 100644 --- a/onnxruntime/core/providers/webgpu/shader_variable.cc +++ b/onnxruntime/core/providers/webgpu/shader_variable.cc @@ -378,7 +378,7 @@ std::string ShaderVariableHelper::SetByOffsetImpl(std::string_view offset, std:: ORT_THROW("Invalid type"); break; case onnxruntime::webgpu::ProgramVariableDataType::Int64: - ss << name_ << "[" << offset << "]=vec2(u32(" << value << "), select(0u, 0xFFFFFFFFu, " << value << " < 0));"; + ss << name_ << "[" << offset << "]=vec2(u32(" << value << "), select(0u, 0xFFFFFFFFu, i32(" << value << ") < 0));"; break; case onnxruntime::webgpu::ProgramVariableDataType::Uint64: ss << name_ << "[" << offset << "]=vec2(u32(" << value << "), 0u);"; diff --git a/onnxruntime/core/providers/webgpu/tensor/cast.cc b/onnxruntime/core/providers/webgpu/tensor/cast.cc index 313a96ba25509..daf4aa323c12e 100644 --- a/onnxruntime/core/providers/webgpu/tensor/cast.cc +++ b/onnxruntime/core/providers/webgpu/tensor/cast.cc @@ -11,75 +11,29 @@ namespace onnxruntime { namespace webgpu { namespace { -const std::vector& CastOpTypeConstraints() { - // currently support boolean, integer and float types that explicitly allowed in WGSL: +const std::vector& CastOpTypeConstraints(bool enable_graph_capture) { + // Base types that are always supported - boolean, integer and float types that explicitly allowed in WGSL: // https://gpuweb.github.io/gpuweb/wgsl/#plain-types-section - // - static std::vector types{ + static std::vector base_types{ DataTypeImpl::GetTensorType(), DataTypeImpl::GetTensorType(), DataTypeImpl::GetTensorType(), DataTypeImpl::GetTensorType(), DataTypeImpl::GetTensorType()}; - return types; + + if (enable_graph_capture) { + static std::vector types_with_int64 = []() { + auto types = base_types; + types.push_back(DataTypeImpl::GetTensorType()); + return types; + }(); + return types_with_int64; + } else { + return base_types; + } } } // namespace -ONNX_OPERATOR_VERSIONED_KERNEL_EX( - Cast, - kOnnxDomain, - 6, 8, - kWebGpuExecutionProvider, - (*KernelDefBuilder::Create()) - .TypeConstraint("T1", CastOpTypeConstraints()) - .TypeConstraint("T2", CastOpTypeConstraints()), - Cast); 
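WGSL has no 64-bit integer type, so an int64 element is emulated as two 32-bit words: the low word is the value bit-cast to u32 and the high word is all ones when the (32-bit) value is negative, which is the sign extension the corrected select() above produces. The same packing in a standalone C++ sketch:

    #include <cstdint>
    #include <utility>

    // Pack a 32-bit signed value into the (low, high) u32 pair used to emulate int64 storage.
    std::pair<uint32_t, uint32_t> PackAsInt64Words(int32_t value) {
      const uint32_t low = static_cast<uint32_t>(value);
      const uint32_t high = value < 0 ? 0xFFFFFFFFu : 0u;  // sign-extend into the high word
      return {low, high};
    }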
-ONNX_OPERATOR_VERSIONED_KERNEL_EX( - Cast, - kOnnxDomain, - 9, 12, - kWebGpuExecutionProvider, - (*KernelDefBuilder::Create()) - .TypeConstraint("T1", CastOpTypeConstraints()) - .TypeConstraint("T2", CastOpTypeConstraints()), - Cast); -ONNX_OPERATOR_VERSIONED_KERNEL_EX( - Cast, - kOnnxDomain, - 13, 18, - kWebGpuExecutionProvider, - (*KernelDefBuilder::Create()) - .TypeConstraint("T1", CastOpTypeConstraints()) - .TypeConstraint("T2", CastOpTypeConstraints()), - Cast); -ONNX_OPERATOR_VERSIONED_KERNEL_EX( - Cast, - kOnnxDomain, - 19, 20, - kWebGpuExecutionProvider, - (*KernelDefBuilder::Create()) - .TypeConstraint("T1", CastOpTypeConstraints()) - .TypeConstraint("T2", CastOpTypeConstraints()), - Cast); -ONNX_OPERATOR_VERSIONED_KERNEL_EX( - Cast, - kOnnxDomain, - 21, 22, - kWebGpuExecutionProvider, - (*KernelDefBuilder::Create()) - .TypeConstraint("T1", CastOpTypeConstraints()) - .TypeConstraint("T2", CastOpTypeConstraints()), - Cast); -ONNX_OPERATOR_KERNEL_EX( - Cast, - kOnnxDomain, - 23, - kWebGpuExecutionProvider, - (*KernelDefBuilder::Create()) - .TypeConstraint("T1", CastOpTypeConstraints()) - .TypeConstraint("T2", CastOpTypeConstraints()), - Cast); - Status Cast::ComputeInternal(ComputeContext& context) const { const auto* input_tensor = context.Input(0); auto* output_tensor = context.Output(0, input_tensor->Shape()); @@ -87,12 +41,17 @@ Status Cast::ComputeInternal(ComputeContext& context) const { if (size == 0) { return Status::OK(); } + bool is_from_int64 = input_tensor->DataType() == DataTypeImpl::GetType(); + const int in_components = is_from_int64 ? 1 : 4; + const int out_components = to_ == ONNX_NAMESPACE::TensorProto_DataType_INT64 ? 1 : 4; uint32_t vec_size = onnxruntime::narrow((size + 3) / 4); + uint32_t in_vec_size = onnxruntime::narrow(in_components == 1 ? size : vec_size); + uint32_t out_vec_size = onnxruntime::narrow(out_components == 1 ? 
size : vec_size); - CastProgram program{to_}; + CastProgram program{to_, is_from_int64}; program - .AddInput({input_tensor, ProgramTensorMetadataDependency::Type, {vec_size}, 4}) - .AddOutput({output_tensor, ProgramTensorMetadataDependency::None, {vec_size}, 4}) + .AddInput({input_tensor, ProgramTensorMetadataDependency::Type, {in_vec_size}, in_components}) + .AddOutput({output_tensor, ProgramTensorMetadataDependency::None, {out_vec_size}, out_components}) .SetDispatchGroupSize((vec_size + WORKGROUP_SIZE - 1) / WORKGROUP_SIZE) .AddUniformVariables({ {static_cast(vec_size)}, @@ -121,15 +80,78 @@ Status CastProgram::GenerateShaderCode(ShaderHelper& sh) const { case ONNX_NAMESPACE::TensorProto_DataType_BOOL: expression = "vec4(a)"; break; + case ONNX_NAMESPACE::TensorProto_DataType_INT64: + expression = "int32(a)"; + break; default: ORT_NOT_IMPLEMENTED("Cast to type ", to_, " is not supported."); } - sh.MainFunctionBody() << sh.GuardAgainstOutOfBoundsWorkgroupSizes("uniforms.vec_size") - << " let a = " << input.GetByOffset("global_idx") << ";\n " - << output.SetByOffset("global_idx", expression); + + sh.MainFunctionBody() << sh.GuardAgainstOutOfBoundsWorkgroupSizes("uniforms.vec_size"); + if (is_from_int64_) { + sh.MainFunctionBody() << " let a0 = " << input.GetByOffset("global_idx * 4") << ";\n" + << " let a1 = " << input.GetByOffset("global_idx * 4 + 1") << ";\n" + << " let a2 = " << input.GetByOffset("global_idx * 4 + 2") << ";\n" + << " let a3 = " << input.GetByOffset("global_idx * 4 + 3") << ";\n" + << " let a = vec4(a0, a1, a2, a3);\n"; + } else { + sh.MainFunctionBody() << " let a = " << input.GetByOffset("global_idx") << ";\n"; + } + if (to_ == ONNX_NAMESPACE::TensorProto_DataType_INT64) { + sh.MainFunctionBody() << output.SetByOffset("global_idx * 4", "a.x") << "\n" + << output.SetByOffset("global_idx * 4 + 1", "a.y") << "\n" + << output.SetByOffset("global_idx * 4 + 2", "a.z") << "\n" + << output.SetByOffset("global_idx * 4 + 3", "a.w") << "\n"; + } else { + sh.MainFunctionBody() << output.SetByOffset("global_idx", expression); + } return Status::OK(); } +template +KernelCreateInfo CreateCastKernelInfo(bool enable_graph_capture) { + const auto& type_constraints = CastOpTypeConstraints(enable_graph_capture); + + KernelCreateFn kernel_create_fn = [](FuncManager&, const OpKernelInfo& info, std::unique_ptr& out) -> Status { + out = std::make_unique(info); + return Status::OK(); + }; + + if constexpr (StartVersion == EndVersion) { + // Non-versioned kernel + return { + KernelDefBuilder() + .SetName("Cast") + .SetDomain(kOnnxDomain) + .SinceVersion(StartVersion) + .Provider(kWebGpuExecutionProvider) + .TypeConstraint("T1", type_constraints) + .TypeConstraint("T2", type_constraints) + .Build(), + kernel_create_fn}; + } else { + // Versioned kernel + return { + KernelDefBuilder() + .SetName("Cast") + .SetDomain(kOnnxDomain) + .SinceVersion(StartVersion, EndVersion) + .Provider(kWebGpuExecutionProvider) + .TypeConstraint("T1", type_constraints) + .TypeConstraint("T2", type_constraints) + .Build(), + kernel_create_fn}; + } +} + +// Explicit template instantiations +template KernelCreateInfo CreateCastKernelInfo<6, 8>(bool); +template KernelCreateInfo CreateCastKernelInfo<9, 12>(bool); +template KernelCreateInfo CreateCastKernelInfo<13, 18>(bool); +template KernelCreateInfo CreateCastKernelInfo<19, 20>(bool); +template KernelCreateInfo CreateCastKernelInfo<21, 22>(bool); +template KernelCreateInfo CreateCastKernelInfo<23>(bool); + } // namespace webgpu } // namespace onnxruntime diff --git 
a/onnxruntime/core/providers/webgpu/tensor/cast.h b/onnxruntime/core/providers/webgpu/tensor/cast.h index 925cd200f0aba..7dfb50e3241c8 100644 --- a/onnxruntime/core/providers/webgpu/tensor/cast.h +++ b/onnxruntime/core/providers/webgpu/tensor/cast.h @@ -3,6 +3,8 @@ #pragma once +#include "core/framework/kernel_registry.h" +#include "core/framework/op_kernel.h" #include "core/providers/webgpu/webgpu_kernel.h" namespace onnxruntime { @@ -10,7 +12,7 @@ namespace webgpu { class CastProgram final : public Program { public: - CastProgram(int32_t to) : Program{"Cast"}, to_{to} {} + CastProgram(int32_t to, bool is_from_int64) : Program{"Cast"}, to_{to}, is_from_int64_{is_from_int64} {} Status GenerateShaderCode(ShaderHelper& sh) const override; @@ -18,6 +20,7 @@ class CastProgram final : public Program { private: int32_t to_; + bool is_from_int64_; }; class Cast final : public WebGpuKernel { @@ -37,5 +40,9 @@ class Cast final : public WebGpuKernel { int32_t to_; }; +// Create Cast kernel info with appropriate type constraints based on graph capture support +template +KernelCreateInfo CreateCastKernelInfo(bool enable_graph_capture); + } // namespace webgpu } // namespace onnxruntime diff --git a/onnxruntime/core/providers/webgpu/tensor/gather_nd.cc b/onnxruntime/core/providers/webgpu/tensor/gather_nd.cc index 7c3aced3f0295..cab1dc03848b9 100644 --- a/onnxruntime/core/providers/webgpu/tensor/gather_nd.cc +++ b/onnxruntime/core/providers/webgpu/tensor/gather_nd.cc @@ -43,7 +43,7 @@ Status GatherNDProgram::GenerateShaderCode(ShaderHelper& shader) const { data_dim += indices_innerest_dim_; for (uint32_t i = 0; i < static_cast(data.Rank() - data_dim); i++) { - shader.MainFunctionBody() << " " << data.IndicesSet("data_indices", data_dim, output.IndicesGet("output_indices", indices.Rank() - 1 + i)) << "\n"; + shader.MainFunctionBody() << " " << data.IndicesSet("data_indices", data_dim + i, output.IndicesGet("output_indices", indices.Rank() - 1 + i)) << "\n"; } shader.MainFunctionBody() << " " << output.SetByOffset("global_idx", data.GetByIndices("data_indices")); diff --git a/onnxruntime/core/providers/webgpu/webgpu_execution_provider.cc b/onnxruntime/core/providers/webgpu/webgpu_execution_provider.cc index bbb3fbdd221d3..0f7607ac1dbfe 100644 --- a/onnxruntime/core/providers/webgpu/webgpu_execution_provider.cc +++ b/onnxruntime/core/providers/webgpu/webgpu_execution_provider.cc @@ -28,6 +28,7 @@ #include "core/providers/webgpu/data_transfer.h" #include "core/providers/webgpu/external_data_loader.h" #include "core/providers/webgpu/webgpu_profiler.h" +#include "core/providers/webgpu/tensor/cast.h" namespace onnxruntime { @@ -417,7 +418,7 @@ class ONNX_OPERATOR_VERSIONED_KERNEL_CLASS_NAME(kWebGpuExecutionProvider, kOnnxD class ONNX_OPERATOR_VERSIONED_KERNEL_CLASS_NAME(kWebGpuExecutionProvider, kOnnxDomain, 16, 17, ScatterND); class ONNX_OPERATOR_KERNEL_CLASS_NAME(kWebGpuExecutionProvider, kOnnxDomain, 18, ScatterND); -std::unique_ptr RegisterKernels() { +std::unique_ptr RegisterKernels(bool enable_graph_capture = false) { auto kernel_registry = std::make_unique(); static const BuildKernelCreateInfoFn function_table[] = { @@ -464,13 +465,6 @@ std::unique_ptr RegisterKernels() { KERNEL_CREATE_INFO(13, Tanh), KERNEL_CREATE_INFO(1, Not), - KERNEL_CREATE_INFO_VERSIONED(6, 8, Cast), - KERNEL_CREATE_INFO_VERSIONED(9, 12, Cast), - KERNEL_CREATE_INFO_VERSIONED(13, 18, Cast), - KERNEL_CREATE_INFO_VERSIONED(19, 20, Cast), - KERNEL_CREATE_INFO_VERSIONED(21, 22, Cast), - KERNEL_CREATE_INFO(23, Cast), - // // activations 
BuildKernelCreateInfo, BuildKernelCreateInfo, @@ -771,6 +765,14 @@ std::unique_ptr RegisterKernels() { } } + // Register Cast kernels with conditional int64 support based on graph capture + ORT_THROW_IF_ERROR(kernel_registry->Register(CreateCastKernelInfo<6, 8>(enable_graph_capture))); + ORT_THROW_IF_ERROR(kernel_registry->Register(CreateCastKernelInfo<9, 12>(enable_graph_capture))); + ORT_THROW_IF_ERROR(kernel_registry->Register(CreateCastKernelInfo<13, 18>(enable_graph_capture))); + ORT_THROW_IF_ERROR(kernel_registry->Register(CreateCastKernelInfo<19, 20>(enable_graph_capture))); + ORT_THROW_IF_ERROR(kernel_registry->Register(CreateCastKernelInfo<21, 22>(enable_graph_capture))); + ORT_THROW_IF_ERROR(kernel_registry->Register(CreateCastKernelInfo<23>(enable_graph_capture))); + #ifndef DISABLE_CONTRIB_OPS Status status = ::onnxruntime::contrib::webgpu::RegisterWebGpuContribKernels(*kernel_registry); ORT_ENFORCE(status.IsOK(), "Failed to register WebGPU contrib kernels: " + status.ErrorMessage()); @@ -869,9 +871,13 @@ std::vector> WebGpuExecutionProvider::GetCapa } std::shared_ptr WebGpuExecutionProvider::GetKernelRegistry() const { - static std::shared_ptr registry = webgpu::RegisterKernels(); - - return registry; + if (enable_graph_capture_) { + static std::shared_ptr registry = webgpu::RegisterKernels(true); + return registry; + } else { + static std::shared_ptr registry = webgpu::RegisterKernels(false); + return registry; + } } std::unique_ptr WebGpuExecutionProvider::GetDataTransfer() const { diff --git a/onnxruntime/core/providers/webnn/builders/helper.h b/onnxruntime/core/providers/webnn/builders/helper.h index baedb98a34c28..fbabc23504636 100644 --- a/onnxruntime/core/providers/webnn/builders/helper.h +++ b/onnxruntime/core/providers/webnn/builders/helper.h @@ -38,7 +38,7 @@ WebnnDeviceType DeviceTypeFromString(const std::string_view& device_type); // Collects all the initializer tensors in the subGraph and its ancestor graphs. InitializedTensorSet CollectAllInitializedTensors(const GraphViewer& graph_viewer); -inline std::vector HandleNegativeAxes(const std::vector& axes, size_t input_size) { +inline std::vector HandleNegativeAxes(const gsl::span axes, size_t input_size) { std::vector new_axes(axes.size()); for (size_t i = 0; i < axes.size(); ++i) { new_axes[i] = HandleNegativeAxis(axes[i], input_size); diff --git a/onnxruntime/core/providers/webnn/builders/impl/gemm_op_builder.cc b/onnxruntime/core/providers/webnn/builders/impl/gemm_op_builder.cc index 0ea927967d989..5a80f01c17236 100644 --- a/onnxruntime/core/providers/webnn/builders/impl/gemm_op_builder.cc +++ b/onnxruntime/core/providers/webnn/builders/impl/gemm_op_builder.cc @@ -250,29 +250,6 @@ bool GemmOpBuilder::IsOpSupportedImpl(const GraphViewer&, std::vector c_shape; if (!GetShape(*input_defs[c_idx], c_shape, logger)) return false; - - size_t c_dim = c_shape.size(); - - if (c_dim > 1) { - // TODO: Supports other shape of C. - // Currently WebNN implementation in Chromium only supports 1-D C. - return false; - } - if (c_dim == 0) { - LOGS(logger, VERBOSE) << "C of Gemm is a scalar"; - } else { - auto c_size = c_shape[c_dim - 1]; - NodeAttrHelper helper(node); - const auto transB = helper.Get("transB", 0); - if (c_size != (transB == 0 ? b_shape[1] : b_shape[0])) { - LOGS(logger, VERBOSE) << "C of Gemm must be a vector of b_shape[" - << (transB == 0 ? 
"1" : "0") << "]" - << " b_shape: [" << b_shape[0] << ", " << b_shape[1] << "]" - << " c_size: " << c_size; - - return false; - } - } } } diff --git a/onnxruntime/core/providers/webnn/builders/impl/reduction_op_builder.cc b/onnxruntime/core/providers/webnn/builders/impl/reduction_op_builder.cc index 6ea9b0a440d93..d07e636d578b1 100644 --- a/onnxruntime/core/providers/webnn/builders/impl/reduction_op_builder.cc +++ b/onnxruntime/core/providers/webnn/builders/impl/reduction_op_builder.cc @@ -19,6 +19,8 @@ namespace webnn { class ReductionOpBuilder : public BaseOpBuilder { // Add operator related. public: + // Allow axes potentially being empty inputs that are ignored during processing. + ReductionOpBuilder() : BaseOpBuilder(/*allow empty inputs*/ true) {} void AddInitializersToSkip(ModelBuilder& model_builder, const Node& node) const override; // Add operator related. @@ -37,6 +39,7 @@ void ReductionOpBuilder::AddInitializersToSkip(ModelBuilder& model_builder, cons const auto& input_defs = node.InputDefs(); if (input_defs.size() > 1) { model_builder.AddInitializerToSkip(input_defs[1]->Name()); // axes + model_builder.AddInputToSkip(input_defs[1]->Name()); // axes } } @@ -53,71 +56,50 @@ Status ReductionOpBuilder::AddToModelBuilderImpl(ModelBuilder& model_builder, NodeAttrHelper helper(node); const auto keep_dims = helper.Get("keepdims", 1); + emscripten::val options = emscripten::val::object(); options.set("label", node.Name()); options.set("keepDimensions", keep_dims == 1); - std::vector axes_data; - - emscripten::val output = emscripten::val::object(); + std::vector axes_data; const auto opset = node.SinceVersion(); const auto& op_type = node.OpType(); if (opset >= 18 || (op_type == "ReduceSum" && opset >= 13)) { // 'axes' is an optional input. - const auto noop_with_empty_axes = helper.Get("noop_with_empty_axes", 0); - if (!GetTensorName(input_defs, 1).empty()) { - // Optional input axes is provided, use axes initializer data. - const auto& initializers(model_builder.GetInitializerTensors()); - const auto& axes_tensor = *initializers.at(input_defs[1]->Name()); - Initializer axes_initializer(axes_tensor); - const auto axes_data_span = axes_initializer.DataAsSpan(); - std::transform( - axes_data_span.begin(), axes_data_span.end(), std::back_inserter(axes_data), - [input_rank](int64_t axis) -> int32_t { return SafeInt(HandleNegativeAxis(axis, input_rank)); }); - } else { - if (noop_with_empty_axes) { - // When axes is empty and this attribute is set to true, input tensor will not be reduced. - output = input; - model_builder.AddOperand(node.OutputDefs()[0]->Name(), std::move(output)); - return Status::OK(); + std::vector axes_shape; + if (TensorExists(input_defs, 1)) { + ORT_RETURN_IF_NOT(GetShape(*input_defs[1], axes_shape, logger), "Cannot get shape of input axes"); + if (axes_shape[0] != 0) { + // Optional input axes is provided and we already ensure it is an initializer. + // Use that initializer data. 
+ const auto& initializers(model_builder.GetInitializerTensors()); + const auto& axes_tensor = *initializers.at(input_defs[1]->Name()); + Initializer axes_initializer(axes_tensor); + const auto axes_data_span = axes_initializer.DataAsSpan(); + axes_data = HandleNegativeAxes(axes_data_span, input_rank); } } } else { if (helper.HasAttr("axes")) { - auto axes = helper.Get("axes", std::vector{}); - std::transform( - axes.begin(), axes.end(), std::back_inserter(axes_data), - [input_rank](int64_t axis) -> int32_t { return SafeInt(HandleNegativeAxis(axis, input_rank)); }); + axes_data = GetResolvedAxes(helper, input_rank); } } - if (axes_data.size() > 0) { - options.set("axes", emscripten::val::array(axes_data)); - } - if (op_type == "ReduceL1") { - output = model_builder.GetBuilder().call("reduceL1", input, options); - } else if (op_type == "ReduceL2") { - output = model_builder.GetBuilder().call("reduceL2", input, options); - } else if (op_type == "ReduceLogSum") { - output = model_builder.GetBuilder().call("reduceLogSum", input, options); - } else if (op_type == "ReduceLogSumExp") { - output = model_builder.GetBuilder().call("reduceLogSumExp", input, options); - } else if (op_type == "ReduceMax") { - output = model_builder.GetBuilder().call("reduceMax", input, options); - } else if (op_type == "ReduceMean") { - output = model_builder.GetBuilder().call("reduceMean", input, options); - } else if (op_type == "ReduceMin") { - output = model_builder.GetBuilder().call("reduceMin", input, options); - } else if (op_type == "ReduceProd") { - output = model_builder.GetBuilder().call("reduceProduct", input, options); - } else if (op_type == "ReduceSum") { - output = model_builder.GetBuilder().call("reduceSum", input, options); - } else if (op_type == "ReduceSumSquare") { - output = model_builder.GetBuilder().call("reduceSumSquare", input, options); - } else { - return ORT_MAKE_STATUS(ONNXRUNTIME, INVALID_ARGUMENT, "ReductionOpBuilder, unknown op: ", op_type); + // When axes is not provided or is empty, check the 'noop_with_empty_axes' attribute: + // - If it is false, perform reduction over all dimensions. + // (In WebNN, this means the 'axes' option is not set.) + // - If it is true, no reduction is applied, but other operations are still performed. + // (In WebNN, this requires setting 'axes' to an empty array.) + if (!axes_data.empty() || helper.Get("noop_with_empty_axes", 0) == 1) { + options.set("axes", emscripten::val::array(GetNarrowedIntFromInt64(axes_data))); } + const std::string_view webnn_op_type = GetWebNNOpType(op_type); + ORT_RETURN_IF(webnn_op_type.empty(), "Cannot get WebNN op type"); + + emscripten::val output = model_builder.GetBuilder().call( + std::string(webnn_op_type).c_str(), input, options); + model_builder.AddOperand(node.OutputDefs()[0]->Name(), std::move(output)); return Status::OK(); } @@ -128,11 +110,25 @@ bool ReductionOpBuilder::IsOpSupportedImpl(const GraphViewer& graph_viewer, const WebnnDeviceType /* device_type */, const logging::Logger& logger) const { const auto& input_defs = node.InputDefs(); - const std::string axes_name = GetTensorName(input_defs, 1); - // If the optional input 'axes' is provided, it must be an initializer. 
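HandleNegativeAxes maps each negative axis into the [0, rank) range by adding the input rank, which is what the span-based overload applied to the axes initializer data above relies on. A small equivalent sketch (hypothetical NormalizeAxes, not the ORT helper, and without any validation of out-of-range axes):

    #include <cstddef>
    #include <cstdint>
    #include <vector>

    std::vector<int64_t> NormalizeAxes(const std::vector<int64_t>& axes, std::size_t rank) {
      std::vector<int64_t> normalized;
      normalized.reserve(axes.size());
      for (int64_t axis : axes) {
        // A negative axis counts from the end, so -1 becomes rank - 1, and so on.
        normalized.push_back(axis < 0 ? axis + static_cast<int64_t>(rank) : axis);
      }
      return normalized;
    }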
- if (!axes_name.empty() && !graph_viewer.GetConstantInitializer(axes_name)) { - LOGS(logger, VERBOSE) << "Input axes of " << node.OpType() << " must be a constant"; - return false; + + if (TensorExists(input_defs, 1)) { + std::vector axes_shape; + if (!GetShape(*input_defs[1], axes_shape, logger)) { + LOGS(logger, VERBOSE) << "Cannot get shape of input axes"; + return false; + } + + if (axes_shape.size() != 1) { + LOGS(logger, VERBOSE) << "Input axes of " << node.OpType() << " must be 1D"; + return false; + } + + const std::string axes_name = GetTensorName(input_defs, 1); + // If the optional input 'axes' is provided and not empty, it must be an initializer. + if (axes_shape[0] != 0 && !graph_viewer.GetConstantInitializer(axes_name)) { + LOGS(logger, VERBOSE) << "Input axes of " << node.OpType() << " must be a constant"; + return false; + } } return true; diff --git a/onnxruntime/python/onnxruntime_validation.py b/onnxruntime/python/onnxruntime_validation.py index 4a72916d3e485..6912d19897d67 100644 --- a/onnxruntime/python/onnxruntime_validation.py +++ b/onnxruntime/python/onnxruntime_validation.py @@ -23,9 +23,9 @@ def check_distro_info(): __my_distro__ = __my_system__ __my_distro_ver__ = platform.release().lower() - if __my_distro_ver__ not in ["10", "11"]: + if __my_distro_ver__ not in ["10", "11", "2016server", "2019server", "2022server", "2025server"]: warnings.warn( - f"Unsupported Windows version ({__my_distro_ver__}). ONNX Runtime supports Windows 10 and above, only." + f"Unsupported Windows version ({__my_distro_ver__}). ONNX Runtime supports Windows 10 and above, or Windows Server 2016 and above." ) elif __my_system__ == "linux": """Although the 'platform' python module for getting Distro information works well on standard OS images diff --git a/onnxruntime/python/tools/tensorrt/perf/benchmark.py b/onnxruntime/python/tools/tensorrt/perf/benchmark.py index d6b39a6b2aeb4..66ab0c44f8814 100644 --- a/onnxruntime/python/tools/tensorrt/perf/benchmark.py +++ b/onnxruntime/python/tools/tensorrt/perf/benchmark.py @@ -613,7 +613,7 @@ def validate(all_ref_outputs, all_outputs, rtol, atol, percent_mismatch): for ref_o, o in zip(ref_output, output, strict=False): # abs(desired-actual) < rtol * abs(desired) + atol try: - np.testing.assert_allclose(ref_o, o, rtol, atol) + np.testing.assert_allclose(o, ref_o, rtol, atol) except Exception as e: if percentage_in_allowed_threshold(e, percent_mismatch): continue diff --git a/onnxruntime/test/contrib_ops/gather_block_quantized_op_test.cc b/onnxruntime/test/contrib_ops/gather_block_quantized_op_test.cc index 574ec49da67ea..3bf37ea193245 100644 --- a/onnxruntime/test/contrib_ops/gather_block_quantized_op_test.cc +++ b/onnxruntime/test/contrib_ops/gather_block_quantized_op_test.cc @@ -82,7 +82,7 @@ void CheckDataAndShape(const std::vector& data, const std::vector& s ORT_ENFORCE(static_cast(data.size()) == total_elements, "Data size does not match the shape", "Data size: ", data.size(), ", Expected size: ", total_elements, - ", Shape: ", VectorToString(shape), " Name:", name, " Type:", typeid(T).name()); + ", Shape: ", VectorToString(shape), " Name:", name); } // Combinations: types, gather_axis, quantize_axis, block_size, indices, scale shape vs data shape diff --git a/onnxruntime/test/ir/graph_test.cc b/onnxruntime/test/ir/graph_test.cc index 4fd9830440846..7371ad5cf0ded 100644 --- a/onnxruntime/test/ir/graph_test.cc +++ b/onnxruntime/test/ir/graph_test.cc @@ -2,13 +2,17 @@ // Licensed under the MIT License. 
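The benchmark argument swap above matters because the tolerance test behind np.testing.assert_allclose is asymmetric: it checks |actual - desired| <= atol + rtol * |desired|, so the reference output has to be passed as the desired argument for the relative term to be scaled by the reference values. The equivalent scalar check, sketched in C++ for illustration:

    #include <cmath>

    // Mirrors the per-element closeness criterion noted in the benchmark comment.
    bool AllClose(double actual, double desired, double rtol, double atol) {
      return std::fabs(actual - desired) <= atol + rtol * std::fabs(desired);
    }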
#include +#include #include "core/common/inlined_containers.h" #include "core/common/span_utils.h" #include "core/framework/tensorprotoutils.h" #include "core/graph/graph_viewer.h" #include "core/graph/model.h" #include "core/graph/op.h" +#include "core/session/inference_session.h" +#include "core/session/environment.h" #include "test/providers/provider_test_utils.h" +#include "test/test_environment.h" #include "gtest/gtest.h" #include "gmock/gmock.h" #include "onnx/defs/function.h" @@ -2573,5 +2577,259 @@ TEST_F(GraphTest, GraphConstruction_MemoryEfficientTopologicalSort_SubgraphGener #endif +// Test for shape inference with in-memory external data (issue #26261) +// This tests the fix for a regression where Constant nodes with large tensors (>127 bytes) +// stored as in-memory external data would cause shape inference to fail +TEST_F(GraphTest, ShapeInferenceWithInMemoryExternalData) { + // Create a model with a Constant node that produces a tensor larger than kSmallTensorExternalDataThreshold (127 bytes) + // This will trigger the in-memory externalization path + ModelProto model_proto; + model_proto.set_ir_version(ONNX_NAMESPACE::Version::IR_VERSION); + auto* opset = model_proto.add_opset_import(); + opset->set_version(17); + + auto* graph_proto = model_proto.mutable_graph(); + graph_proto->set_name("test_graph"); + + // Create a Constant node with a tensor of 16 INT64 values (128 bytes, just over the 127 threshold) + auto* constant_node = graph_proto->add_node(); + constant_node->set_op_type("Constant"); + constant_node->set_name("const_node"); + constant_node->add_output("const_output"); + + // Add the value attribute with a tensor + auto* attr = constant_node->add_attribute(); + attr->set_name("value"); + attr->set_type(ONNX_NAMESPACE::AttributeProto_AttributeType_TENSOR); + auto* tensor = attr->mutable_t(); + tensor->set_data_type(ONNX_NAMESPACE::TensorProto_DataType_INT64); + tensor->add_dims(16); // 16 elements * 8 bytes = 128 bytes + // Each split will be size 1, totaling 16 + for (int64_t i = 0; i < 16; ++i) { + tensor->add_int64_data(1); + } + + // Create a Split node that uses the constant as input + // Split requires constant input for the 'split' parameter, which triggers shape inference + auto* split_node = graph_proto->add_node(); + split_node->set_op_type("Split"); + split_node->set_name("split_node"); + split_node->add_input("input_data"); + split_node->add_input("const_output"); // Use constant as split sizes + for (int i = 0; i < 16; ++i) { + split_node->add_output("split_output_" + std::to_string(i)); + } + + // Add axis attribute + auto* axis_attr = split_node->add_attribute(); + axis_attr->set_name("axis"); + axis_attr->set_type(ONNX_NAMESPACE::AttributeProto_AttributeType_INT); + axis_attr->set_i(0); + + // Add graph input + auto* input = graph_proto->add_input(); + input->set_name("input_data"); + auto* input_type = input->mutable_type()->mutable_tensor_type(); + input_type->set_elem_type(ONNX_NAMESPACE::TensorProto_DataType_FLOAT); + input_type->mutable_shape()->add_dim()->set_dim_value(16); + input_type->mutable_shape()->add_dim()->set_dim_value(10); + + // Add graph outputs + for (int i = 0; i < 16; ++i) { + auto* output = graph_proto->add_output(); + output->set_name("split_output_" + std::to_string(i)); + } + + // Load the model - this should succeed with the fix + // Before the fix, this would fail with: + // "Cannot parse data from external tensors. 
Please load external data into raw data for tensor" + std::shared_ptr model; + ASSERT_STATUS_OK(Model::Load(std::move(model_proto), model, nullptr, *logger_)); + + // Verify the graph was properly constructed + Graph& graph = model->MainGraph(); + ASSERT_STATUS_OK(graph.Resolve()); + + // Verify the constant node was converted to an initializer + const ONNX_NAMESPACE::TensorProto* initializer = nullptr; + ASSERT_TRUE(graph.GetInitializedTensor("const_output", initializer)); + ASSERT_NE(initializer, nullptr); + + // Verify the Split node can access the constant data during shape inference + const Node* split_node_ptr = nullptr; + for (const auto& node : graph.Nodes()) { + if (node.Name() == "split_node") { + split_node_ptr = &node; + break; + } + } + ASSERT_NE(split_node_ptr, nullptr); + + // Verify outputs are properly shaped + ASSERT_EQ(split_node_ptr->OutputDefs().size(), 16u); +} + +// Test for shape inference with in-memory external data using InferenceSession +// This test more accurately reproduces the issue by going through the full session initialization +// which includes graph optimizations that trigger the in-memory externalization +TEST_F(GraphTest, ShapeInferenceWithInMemoryExternalDataViaSession) { + // Create the same model as above + ModelProto model_proto; + model_proto.set_ir_version(ONNX_NAMESPACE::Version::IR_VERSION); + auto* opset = model_proto.add_opset_import(); + opset->set_version(17); + + auto* graph_proto = model_proto.mutable_graph(); + graph_proto->set_name("test_graph"); + + // Create a Constant node with a tensor of 16 INT64 values (128 bytes) + auto* constant_node = graph_proto->add_node(); + constant_node->set_op_type("Constant"); + constant_node->set_name("const_node"); + constant_node->add_output("const_output"); + + auto* attr = constant_node->add_attribute(); + attr->set_name("value"); + attr->set_type(ONNX_NAMESPACE::AttributeProto_AttributeType_TENSOR); + auto* tensor = attr->mutable_t(); + tensor->set_data_type(ONNX_NAMESPACE::TensorProto_DataType_INT64); + tensor->add_dims(16); + for (int64_t i = 0; i < 16; ++i) { + tensor->add_int64_data(1); + } + + // Create a Split node + auto* split_node = graph_proto->add_node(); + split_node->set_op_type("Split"); + split_node->set_name("split_node"); + split_node->add_input("input_data"); + split_node->add_input("const_output"); + for (int i = 0; i < 16; ++i) { + split_node->add_output("split_output_" + std::to_string(i)); + } + + auto* axis_attr = split_node->add_attribute(); + axis_attr->set_name("axis"); + axis_attr->set_type(ONNX_NAMESPACE::AttributeProto_AttributeType_INT); + axis_attr->set_i(0); + + // Add graph input + auto* input = graph_proto->add_input(); + input->set_name("input_data"); + auto* input_type = input->mutable_type()->mutable_tensor_type(); + input_type->set_elem_type(ONNX_NAMESPACE::TensorProto_DataType_FLOAT); + input_type->mutable_shape()->add_dim()->set_dim_value(16); + input_type->mutable_shape()->add_dim()->set_dim_value(10); + + // Add graph outputs + for (int i = 0; i < 16; ++i) { + auto* output = graph_proto->add_output(); + output->set_name("split_output_" + std::to_string(i)); + } + + // Save to a temporary file + const std::string model_path = "test_in_memory_external_data.onnx"; + { + std::ofstream file(model_path, std::ios::binary); + ASSERT_TRUE(file.is_open()); + ASSERT_TRUE(model_proto.SerializeToOstream(&file)); + } + + // Test with ORT_DISABLE_ALL optimization which should trigger the bug without the fix + SessionOptions so; + so.graph_optimization_level = 
TransformerLevel::Default; // This triggers the issue + so.session_logid = "GraphTest.ShapeInferenceWithInMemoryExternalDataViaSession"; + + InferenceSession session_object{so, GetEnvironment()}; + + // This should succeed with the fix, fail without it + ASSERT_STATUS_OK(session_object.Load(model_path)); + ASSERT_STATUS_OK(session_object.Initialize()); + + // Clean up + std::remove(model_path.c_str()); +} + +// Test that explicitly triggers the in-memory externalization and then shape inference +// This test directly reproduces the bug scenario +TEST_F(GraphTest, ShapeInferenceAfterInitializerExternalization) { + // Create a model with a Split node that depends on a constant initializer + ModelProto model_proto; + model_proto.set_ir_version(ONNX_NAMESPACE::Version::IR_VERSION); + auto* opset = model_proto.add_opset_import(); + opset->set_version(17); + + auto* graph_proto = model_proto.mutable_graph(); + graph_proto->set_name("test_graph"); + + // Create initializer directly (not as Constant node) with 128 bytes + auto* initializer = graph_proto->add_initializer(); + initializer->set_name("split_sizes"); + initializer->set_data_type(ONNX_NAMESPACE::TensorProto_DataType_INT64); + initializer->add_dims(16); // 16 * 8 = 128 bytes + for (int64_t i = 0; i < 16; ++i) { + initializer->add_int64_data(1); + } + + // Create a Split node that uses this initializer + auto* split_node = graph_proto->add_node(); + split_node->set_op_type("Split"); + split_node->set_name("split_node"); + split_node->add_input("input_data"); + split_node->add_input("split_sizes"); // Uses the large initializer + for (int i = 0; i < 16; ++i) { + split_node->add_output("split_output_" + std::to_string(i)); + } + + auto* axis_attr = split_node->add_attribute(); + axis_attr->set_name("axis"); + axis_attr->set_type(ONNX_NAMESPACE::AttributeProto_AttributeType_INT); + axis_attr->set_i(0); + + // Add graph input + auto* input = graph_proto->add_input(); + input->set_name("input_data"); + auto* input_type = input->mutable_type()->mutable_tensor_type(); + input_type->set_elem_type(ONNX_NAMESPACE::TensorProto_DataType_FLOAT); + input_type->mutable_shape()->add_dim()->set_dim_value(16); + input_type->mutable_shape()->add_dim()->set_dim_value(10); + + // Add graph outputs + for (int i = 0; i < 16; ++i) { + auto* output = graph_proto->add_output(); + output->set_name("split_output_" + std::to_string(i)); + } + + // Load model + std::shared_ptr model; + ASSERT_STATUS_OK(Model::Load(std::move(model_proto), model, nullptr, *logger_)); + + Graph& graph = model->MainGraph(); + // First resolve should succeed + ASSERT_STATUS_OK(graph.Resolve()); + + // Now trigger the in-memory externalization + // This converts initializers > 127 bytes to OrtValues with external data references + Status convert_status = graph.ConvertInitializersIntoOrtValues(); + ASSERT_TRUE(convert_status.IsOK()) << "ConvertInitializersIntoOrtValues failed: " << convert_status.ErrorMessage(); + + // Check if the initializer was actually externalized + const ONNX_NAMESPACE::TensorProto* initializer_after = nullptr; + ASSERT_TRUE(graph.GetInitializedTensor("split_sizes", initializer_after)); + ASSERT_NE(initializer_after, nullptr); + // Debug: verify it was externalized + ASSERT_TRUE(utils::HasExternalDataInMemory(*initializer_after)) + << "Initializer was not externalized to in-memory external data"; + + // Mark the graph as needing resolve to force shape inference to run again + graph.SetGraphResolveNeeded(); + + // Resolve again - this should trigger shape inference with 
the externalized initializer + // Without the fix, this will fail with "Cannot parse data from external tensors" + // With the fix, getInputData() materializes the external data for shape inference + Status second_resolve = graph.Resolve(); + ASSERT_TRUE(second_resolve.IsOK()) << "Second resolve failed: " << second_resolve.ErrorMessage(); +} + } // namespace test } // namespace onnxruntime diff --git a/onnxruntime/test/onnx/TestCase.cc b/onnxruntime/test/onnx/TestCase.cc index 6df98ff505fa1..cbb25bb9b629e 100644 --- a/onnxruntime/test/onnx/TestCase.cc +++ b/onnxruntime/test/onnx/TestCase.cc @@ -1435,9 +1435,22 @@ std::unique_ptr> GetBrokenTests(const std::string& provider broken_tests->insert({"scatter_elements_with_negative_indices", "unknown version"}); // Fails since ONNX==1.19.0 broken_tests->insert({"l2normalization_axis_0", "unknown version"}); + broken_tests->insert({"attention_3d_gqa", "unknown version"}); + broken_tests->insert({"attention_3d_gqa_attn_mask", "unknown version"}); + broken_tests->insert({"attention_3d_gqa_causal", "unknown version"}); + broken_tests->insert({"attention_3d_gqa_scaled", "unknown version"}); + broken_tests->insert({"attention_3d_gqa_softcap", "unknown version"}); + broken_tests->insert({"attention_3d_gqa_with_past_and_present", "unknown version"}); + broken_tests->insert({"attention_4d_gqa", "unknown version"}); + broken_tests->insert({"attention_4d_gqa_attn_mask", "unknown version"}); + broken_tests->insert({"attention_4d_gqa_causal", "unknown version"}); + broken_tests->insert({"attention_4d_gqa_scaled", "unknown version"}); + broken_tests->insert({"attention_4d_gqa_softcap", "unknown version"}); + broken_tests->insert({"attention_4d_gqa_with_past_and_present", "unknown version"}); + broken_tests->insert({"attention_4d_gqa_with_past_and_present_fp16", "unknown version"}); + broken_tests->insert({"attention_4d_with_past_and_present_qk_matmul_bias_3d_mask_causal", "unknown version"}); + broken_tests->insert({"attention_4d_with_past_and_present_qk_matmul_bias_4d_mask_causal", "unknown version"}); broken_tests->insert({"attention_4d_diff_heads_mask4d_padded_kv", "need nonpad_kv_seqlen "}); - broken_tests->insert({"attention_4d_with_past_and_present_qk_matmul_bias_3d_mask_causal", "attention op implementation is wrong"}); - broken_tests->insert({"attention_4d_with_past_and_present_qk_matmul_bias_4d_mask_causal", "attention op implementation is wrong"}); } #ifdef DISABLE_CONTRIB_OPS diff --git a/onnxruntime/test/onnx/main.cc b/onnxruntime/test/onnx/main.cc index b6f2cb2683677..463634b370d4c 100644 --- a/onnxruntime/test/onnx/main.cc +++ b/onnxruntime/test/onnx/main.cc @@ -795,24 +795,6 @@ select from 'TF8', 'TF16', 'UINT8', 'FLOAT', 'ITENSOR'. 
\n)"); // Please make no more changes to the list static const ORTCHAR_T* immutable_broken_tests[] = { - // pending ONNX update - ORT_TSTR("attention_3d_gqa"), - ORT_TSTR("attention_3d_gqa_attn_mask"), - ORT_TSTR("attention_3d_gqa_causal"), - ORT_TSTR("attention_3d_gqa_scaled"), - ORT_TSTR("attention_3d_gqa_softcap"), - ORT_TSTR("attention_3d_gqa_with_past_and_present"), - ORT_TSTR("attention_4d_gqa"), - ORT_TSTR("attention_4d_gqa_attn_mask"), - ORT_TSTR("attention_4d_gqa_causal"), - ORT_TSTR("attention_4d_gqa_scaled"), - ORT_TSTR("attention_4d_gqa_softcap"), - ORT_TSTR("attention_4d_gqa_with_past_and_present"), - ORT_TSTR("attention_4d_diff_heads_mask4d_padded_kv"), - ORT_TSTR("attention_4d_gqa_with_past_and_present_fp16"), - ORT_TSTR("attention_4d_with_past_and_present_qk_matmul_bias_3d_mask_causal"), - ORT_TSTR("attention_4d_with_past_and_present_qk_matmul_bias_4d_mask_causal"), - // unsupported case ORT_TSTR("AvgPool1d"), ORT_TSTR("AvgPool1d_stride"), ORT_TSTR("AvgPool2d"), diff --git a/onnxruntime/test/providers/cpu/model_tests.cc b/onnxruntime/test/providers/cpu/model_tests.cc index cf49601e6c671..ca1a3104e0bed 100644 --- a/onnxruntime/test/providers/cpu/model_tests.cc +++ b/onnxruntime/test/providers/cpu/model_tests.cc @@ -678,7 +678,14 @@ ::std::vector<::std::basic_string> GetParameterStrings() { ORT_TSTR("fp16_coreml_FNS-Candy"), ORT_TSTR("fp16_test_tiny_yolov2"), ORT_TSTR("fp16_test_shufflenet"), - ORT_TSTR("keras2coreml_SimpleRNN_ImageNet")}; + ORT_TSTR("keras2coreml_SimpleRNN_ImageNet"), + // models from model zoo. #26274: cuDNN frontend no valid engine + ORT_TSTR("YOLOv3"), + ORT_TSTR("YOLOv3-12"), + ORT_TSTR("YOLOv4"), + ORT_TSTR("SSD-MobilenetV1"), + ORT_TSTR("SSD-MobilenetV1-12")}; + // For ROCm EP, also disable the following tests due to flakiness, // mainly with precision issue and random memory access fault. 
static const ORTCHAR_T* rocm_disabled_tests[] = {ORT_TSTR("bvlc_alexnet"), diff --git a/onnxruntime/test/providers/cpu/tensor/cast_op_test.cc b/onnxruntime/test/providers/cpu/tensor/cast_op_test.cc index 8f2eac2d05792..ed67b531ef394 100644 --- a/onnxruntime/test/providers/cpu/tensor/cast_op_test.cc +++ b/onnxruntime/test/providers/cpu/tensor/cast_op_test.cc @@ -1480,7 +1480,7 @@ template void CastOpTestFloatFloat4(std::vector<int64_t> shape, std::vector<float> float_data, bool is_fp4_input = false) { - size_t num_pairs = float_data.size() / 2; + int num_pairs = static_cast<int>(float_data.size()) / 2; int num_fp4_elements = static_cast<int>((float_data.size() + 1) / 2); bool is_odd_count = (float_data.size() % 2 != 0); diff --git a/onnxruntime/test/providers/tensorrt/tensorrt_basic_test.cc b/onnxruntime/test/providers/tensorrt/tensorrt_basic_test.cc index 327dfab96c2d1..a746493d779f8 100644 --- a/onnxruntime/test/providers/tensorrt/tensorrt_basic_test.cc +++ b/onnxruntime/test/providers/tensorrt/tensorrt_basic_test.cc @@ -713,6 +713,52 @@ TEST(TensorrtExecutionProviderTest, TRTPluginsCustomOpTest) { ASSERT_TRUE(status.IsOK()); } +TEST(TensorrtExecutionProviderTest, DDSOutputTest) { + PathString model_name = ORT_TSTR("testdata/ort_github_issue_26272_dds.onnx"); + SessionOptions so; + so.session_logid = "TensorrtExecutionProviderRunWithDDSOutput"; + RunOptions run_options; + run_options.run_tag = so.session_logid; + InferenceSession session_object{so, GetEnvironment()}; + auto cuda_provider = DefaultCudaExecutionProvider(); + auto cuda_allocator = cuda_provider->CreatePreferredAllocators()[1]; + std::vector<int64_t> dims_op_x = {3, 4}; + std::vector<float> values_op_x(12, 0.f); // 12=3*4 + OrtValue ml_value_x; + CreateMLValue(cuda_allocator, dims_op_x, values_op_x, &ml_value_x); + + NameMLValMap feeds; + feeds.insert(std::make_pair("data", ml_value_x)); + + // prepare outputs + std::vector<std::string> output_names; + output_names.push_back("output"); + std::vector<OrtValue> fetches; + + OrtTensorRTProviderOptionsV2 params; + std::unique_ptr<IExecutionProvider> execution_provider = TensorrtExecutionProviderWithOptions(&params); + EXPECT_TRUE(session_object.RegisterExecutionProvider(std::move(execution_provider)).IsOK()); + auto status = session_object.Load(model_name); + ASSERT_TRUE(status.IsOK()); + status = session_object.Initialize(); + ASSERT_TRUE(status.IsOK()); + + // First pass run + status = session_object.Run(run_options, feeds, output_names, &fetches); + ASSERT_TRUE(status.IsOK()); + + // Second pass run with new shape + dims_op_x = {6, 4}; + values_op_x.resize(24, 0.f); // 24=6*4 + CreateMLValue(cuda_allocator, dims_op_x, values_op_x, &ml_value_x); + feeds.clear(); + + feeds.insert(std::make_pair("data", ml_value_x)); + + status = session_object.Run(run_options, feeds, output_names, &fetches); + ASSERT_TRUE(status.IsOK()); +} + TEST_P(TensorrtExecutionProviderCacheTest, Run) { // GetParam() returns the parameter of following format: // ##cache type##_##input shape type## diff --git a/onnxruntime/test/python/onnx_backend_test_series.py b/onnxruntime/test/python/onnx_backend_test_series.py index 72c6a5664f395..d2e9557f633b0 100644 --- a/onnxruntime/test/python/onnx_backend_test_series.py +++ b/onnxruntime/test/python/onnx_backend_test_series.py @@ -43,13 +43,13 @@ def assert_similar_outputs(cls, ref_outputs, outputs, rtol, atol, model_dir=None """ def assert_similar_array(ref_output, output): - np.testing.assert_equal(ref_output.dtype, output.dtype) + np.testing.assert_equal(output.dtype, ref_output.dtype) if ref_output.dtype == object: -
np.testing.assert_array_equal(ref_output, output) + np.testing.assert_array_equal(output, ref_output) else: - np.testing.assert_allclose(ref_output, output, rtol=rtol, atol=atol) + np.testing.assert_allclose(output, ref_output, rtol=rtol, atol=atol) - np.testing.assert_equal(len(ref_outputs), len(outputs)) + np.testing.assert_equal(len(outputs), len(ref_outputs)) for i in range(len(outputs)): # pylint: disable=consider-using-enumerate if isinstance(outputs[i], list): for j in range(len(outputs[i])): diff --git a/onnxruntime/test/python/onnxruntime_test_python.py b/onnxruntime/test/python/onnxruntime_test_python.py index e44adcdb9827f..7f003453add89 100644 --- a/onnxruntime/test/python/onnxruntime_test_python.py +++ b/onnxruntime/test/python/onnxruntime_test_python.py @@ -54,7 +54,7 @@ def run_model(self, session_object, run_options): input_name = session_object.get_inputs()[0].name res = session_object.run([], {input_name: x}, run_options=run_options) output_expected = np.array([[1.0, 4.0], [9.0, 16.0], [25.0, 36.0]], dtype=np.float32) - np.testing.assert_allclose(output_expected, res[0], rtol=1e-05, atol=1e-08) + np.testing.assert_allclose(res[0], output_expected, rtol=1e-05, atol=1e-08) def run_model_with_input(self, session_object, input_name, input_value, iter_num, queue): for _ in range(iter_num): @@ -714,7 +714,7 @@ def test_run_model(self): res = sess.run([outputs[0].name], {inputs[0].name: x}) output_expected = np.array([[1.0, 4.0], [9.0, 16.0], [25.0, 36.0]], dtype=np.float32) - np.testing.assert_allclose(output_expected, res[0], rtol=1e-05, atol=1e-08) + np.testing.assert_allclose(res[0], output_expected, rtol=1e-05, atol=1e-08) def test_run_async(self): event = threading.Event() @@ -733,7 +733,7 @@ def callback(res: np.ndarray, data: MyData, err: str) -> None: self.assertEqual(len(err), 0) self.assertEqual(len(res), 1) self.assertEqual(data.get_id(), 123456) - np.testing.assert_allclose(output_expected, res[0], rtol=1e-05, atol=1e-08) + np.testing.assert_allclose(res[0], output_expected, rtol=1e-05, atol=1e-08) event.set() so = onnxrt.SessionOptions() @@ -762,7 +762,7 @@ def test_run_model_from_bytes(self): self.assertEqual(output_shape, [3, 2]) res = sess.run([output_name], {input_name: x}) output_expected = np.array([[1.0, 4.0], [9.0, 16.0], [25.0, 36.0]], dtype=np.float32) - np.testing.assert_allclose(output_expected, res[0], rtol=1e-05, atol=1e-08) + np.testing.assert_allclose(res[0], output_expected, rtol=1e-05, atol=1e-08) def test_run_model2(self): sess = onnxrt.InferenceSession(get_name("matmul_1.onnx"), providers=onnxrt.get_available_providers()) @@ -777,7 +777,7 @@ def test_run_model2(self): self.assertEqual(output_shape, [3, 1]) res = sess.run([output_name], {input_name: x}) output_expected = np.array([[5.0], [11.0], [17.0]], dtype=np.float32) - np.testing.assert_allclose(output_expected, res[0], rtol=1e-05, atol=1e-08) + np.testing.assert_allclose(res[0], output_expected, rtol=1e-05, atol=1e-08) def test_run_model2_contiguous(self): sess = onnxrt.InferenceSession(get_name("matmul_1.onnx"), providers=onnxrt.get_available_providers()) @@ -792,10 +792,10 @@ def test_run_model2_contiguous(self): self.assertEqual(output_shape, [3, 1]) res = sess.run([output_name], {input_name: x}) output_expected = np.array([[5.0], [11.0], [17.0]], dtype=np.float32) - np.testing.assert_allclose(output_expected, res[0], rtol=1e-05, atol=1e-08) + np.testing.assert_allclose(res[0], output_expected, rtol=1e-05, atol=1e-08) xcontiguous = np.ascontiguousarray(x) rescontiguous = 
sess.run([output_name], {input_name: xcontiguous}) - np.testing.assert_allclose(output_expected, rescontiguous[0], rtol=1e-05, atol=1e-08) + np.testing.assert_allclose(rescontiguous[0], output_expected, rtol=1e-05, atol=1e-08) def test_run_model_multiple_threads(self): # Skip this test for a "pure" DML onnxruntime python wheel. @@ -860,14 +860,14 @@ def test_list_as_input(self): input_name = sess.get_inputs()[0].name res = sess.run([], {input_name: x.tolist()}) output_expected = np.array([[1.0, 4.0], [9.0, 16.0], [25.0, 36.0]], dtype=np.float32) - np.testing.assert_allclose(output_expected, res[0], rtol=1e-05, atol=1e-08) + np.testing.assert_allclose(res[0], output_expected, rtol=1e-05, atol=1e-08) def test_string_list_as_input(self): sess = onnxrt.InferenceSession(get_name("identity_string.onnx"), providers=available_providers_without_tvm) x = np.array(["this", "is", "identity", "test"], dtype=str).reshape((2, 2)) x_name = sess.get_inputs()[0].name res = sess.run([], {x_name: x.tolist()}) - np.testing.assert_equal(x, res[0]) + np.testing.assert_equal(res[0], x) def test_run_device(self): device = onnxrt.get_device() @@ -888,7 +888,7 @@ def test_run_model_symbolic_input(self): self.assertEqual(output_shape, ["None", 1]) res = sess.run([output_name], {input_name: x}) output_expected = np.array([[5.0], [11.0], [17.0]], dtype=np.float32) - np.testing.assert_allclose(output_expected, res[0], rtol=1e-05, atol=1e-08) + np.testing.assert_allclose(res[0], output_expected, rtol=1e-05, atol=1e-08) def test_boolean_inputs(self): sess = onnxrt.InferenceSession(get_name("logicaland.onnx"), providers=available_providers) @@ -920,7 +920,7 @@ def test_boolean_inputs(self): output_expected = np.array([[True, False], [False, False]], dtype=bool) res = sess.run([output_name], {a_name: a, b_name: b}) - np.testing.assert_equal(output_expected, res[0]) + np.testing.assert_equal(res[0], output_expected) def test_string_input1(self): sess = onnxrt.InferenceSession(get_name("identity_string.onnx"), providers=available_providers_without_tvm) @@ -941,7 +941,7 @@ def test_string_input1(self): self.assertEqual(output_type, "tensor(string)") res = sess.run([output_name], {x_name: x}) - np.testing.assert_equal(x, res[0]) + np.testing.assert_equal(res[0], x) def test_string_input2(self): sess = onnxrt.InferenceSession(get_name("identity_string.onnx"), providers=available_providers_without_tvm) @@ -962,7 +962,7 @@ def test_string_input2(self): self.assertEqual(output_type, "tensor(string)") res = sess.run([output_name], {x_name: x}) - np.testing.assert_equal(x, res[0]) + np.testing.assert_equal(res[0], x) def test_input_bytes(self): sess = onnxrt.InferenceSession(get_name("identity_string.onnx"), providers=available_providers_without_tvm) @@ -983,7 +983,7 @@ def test_input_bytes(self): self.assertEqual(output_type, "tensor(string)") res = sess.run([output_name], {x_name: x}) - np.testing.assert_equal(x, res[0].astype("|S8")) + np.testing.assert_equal(res[0].astype("|S8"), x) def test_input_object(self): sess = onnxrt.InferenceSession(get_name("identity_string.onnx"), providers=available_providers_without_tvm) @@ -1004,7 +1004,7 @@ def test_input_object(self): self.assertEqual(output_type, "tensor(string)") res = sess.run([output_name], {x_name: x}) - np.testing.assert_equal(x, res[0]) + np.testing.assert_equal(res[0], x) def test_input_void(self): sess = onnxrt.InferenceSession(get_name("identity_string.onnx"), providers=available_providers_without_tvm) @@ -1029,7 +1029,7 @@ def test_input_void(self): res = 
sess.run([output_name], {x_name: x}) expr = np.array([["must", "have"], ["same", "size"]], dtype=object) - np.testing.assert_equal(expr, res[0]) + np.testing.assert_equal(res[0], expr) def test_raise_wrong_num_inputs(self): with self.assertRaises(ValueError) as context: @@ -1164,7 +1164,7 @@ def test_sequence_construct(self): }, ) - np.testing.assert_array_equal(output_expected, res[0]) + np.testing.assert_array_equal(res[0], output_expected) def test_sequence_insert(self): opt = onnxrt.SessionOptions() @@ -1194,7 +1194,7 @@ def test_sequence_insert(self): "input_seq": [], }, ) - np.testing.assert_array_equal(output_expected, res[0]) + np.testing.assert_array_equal(res[0], output_expected) def test_ort_execution_mode(self): opt = onnxrt.SessionOptions() @@ -1375,7 +1375,7 @@ def test_register_custom_ops_library(self): input_1 = np.zeros((3, 5)).astype(np.float32) res = sess1.run([output_name], {input_name_0: input_0, input_name_1: input_1}) output_expected = np.ones((3, 5)).astype(np.float32) - np.testing.assert_allclose(output_expected, res[0], rtol=1e-05, atol=1e-08) + np.testing.assert_allclose(res[0], output_expected, rtol=1e-05, atol=1e-08) # Create an alias of SessionOptions instance # We will use this alias to construct another InferenceSession @@ -1969,7 +1969,7 @@ def test_adater_export_read(self): self.assertTrue(value.is_tensor()) self.assertEqual(expected_val.element_type(), value.element_type()) self.assertEqual(expected_val.shape(), value.shape()) - np.testing.assert_allclose(expected_val.numpy(), value.numpy()) + np.testing.assert_allclose(value.numpy(), expected_val.numpy()) def test_run_with_adapter(self): model_path = get_name("lora/two_params_lora_model.onnx") diff --git a/onnxruntime/test/python/onnxruntime_test_python_autoep.py b/onnxruntime/test/python/onnxruntime_test_python_autoep.py index d66951bd66f3d..a24269a312e9b 100644 --- a/onnxruntime/test/python/onnxruntime_test_python_autoep.py +++ b/onnxruntime/test/python/onnxruntime_test_python_autoep.py @@ -66,7 +66,7 @@ def test_cuda_ep_register_and_inference(self): input_name = sess.get_inputs()[0].name res = sess.run([], {input_name: x}) output_expected = np.array([[1.0, 4.0], [9.0, 16.0], [25.0, 36.0]], dtype=np.float32) - np.testing.assert_allclose(output_expected, res[0], rtol=1e-05, atol=1e-08) + np.testing.assert_allclose(res[0], output_expected, rtol=1e-05, atol=1e-08) del sess # Delete session before unregistering library self.unregister_execution_provider_library(ep_name) @@ -98,7 +98,7 @@ def test_cuda_prefer_gpu_and_inference(self): input_name = sess.get_inputs()[0].name res = sess.run([], {input_name: x}) output_expected = np.array([[1.0, 4.0], [9.0, 16.0], [25.0, 36.0]], dtype=np.float32) - np.testing.assert_allclose(output_expected, res[0], rtol=1e-05, atol=1e-08) + np.testing.assert_allclose(res[0], output_expected, rtol=1e-05, atol=1e-08) del sess # Delete session before unregistering library self.unregister_execution_provider_library(ep_name) @@ -146,7 +146,7 @@ def my_delegate( input_name = sess.get_inputs()[0].name res = sess.run([], {input_name: x}) output_expected = np.array([[1.0, 4.0], [9.0, 16.0], [25.0, 36.0]], dtype=np.float32) - np.testing.assert_allclose(output_expected, res[0], rtol=1e-05, atol=1e-08) + np.testing.assert_allclose(res[0], output_expected, rtol=1e-05, atol=1e-08) del sess # Delete session before unregistering library self.unregister_execution_provider_library(ep_name) @@ -249,7 +249,7 @@ def test_example_plugin_ep_devices(self): input_name = sess.get_inputs()[0].name res = 
sess.run([], {input_name: x}) output_expected = np.array([[1.0, 4.0], [9.0, 16.0], [25.0, 36.0]], dtype=np.float32) - np.testing.assert_allclose(output_expected, res[0], rtol=1e-05, atol=1e-08) + np.testing.assert_allclose(res[0], output_expected, rtol=1e-05, atol=1e-08) del sess # Delete session before unregistering library self.unregister_execution_provider_library(ep_name) @@ -282,11 +282,11 @@ def test_example_plugin_ep_data_transfer(self): gpu_value = onnxrt.OrtValue.ortvalue_from_numpy(data, "gpu", 0, 0xBE57) # copy back to CPU cpu_data = gpu_value.numpy() - np.testing.assert_equal(data, cpu_data) + np.testing.assert_equal(cpu_data, data) gpu_value.update_inplace(data2) # update the fake GPU data cpu_data_2 = gpu_value.numpy() # copy back to CPU - np.testing.assert_equal(data2, cpu_data_2) + np.testing.assert_equal(cpu_data_2, data2) gpu_value = None # Delete OrtValue before unregistering library as the allocator will be destroyed. @@ -336,8 +336,8 @@ def test_copy_tensors(self): del b_device # Verify the contents - np.testing.assert_array_equal(a, a_cpu_copy.numpy()) - np.testing.assert_array_equal(b, b_cpu_copy.numpy()) + np.testing.assert_array_equal(a_cpu_copy.numpy(), a) + np.testing.assert_array_equal(b_cpu_copy.numpy(), b) self.unregister_execution_provider_library(ep_name) diff --git a/onnxruntime/test/python/onnxruntime_test_python_backend.py b/onnxruntime/test/python/onnxruntime_test_python_backend.py index 6ed7dfe59b1f6..416d9b6edecd1 100644 --- a/onnxruntime/test/python/onnxruntime_test_python_backend.py +++ b/onnxruntime/test/python/onnxruntime_test_python_backend.py @@ -19,7 +19,7 @@ def test_run_model(self): x = np.array([[1.0, 2.0], [3.0, 4.0], [5.0, 6.0]], dtype=np.float32) res = rep.run(x) output_expected = np.array([[1.0, 4.0], [9.0, 16.0], [25.0, 36.0]], dtype=np.float32) - np.testing.assert_allclose(output_expected, res[0], rtol=1e-05, atol=1e-08) + np.testing.assert_allclose(res[0], output_expected, rtol=1e-05, atol=1e-08) def test_allocation_plan_works_with_only_execute_path_to_fetches_option(self): """ diff --git a/onnxruntime/test/python/onnxruntime_test_python_backend_mlops.py b/onnxruntime/test/python/onnxruntime_test_python_backend_mlops.py index c245699e211d4..9e3c1acbc923b 100644 --- a/onnxruntime/test/python/onnxruntime_test_python_backend_mlops.py +++ b/onnxruntime/test/python/onnxruntime_test_python_backend_mlops.py @@ -23,8 +23,8 @@ def check_list_of_map_to_float(testcase, expected_rows, actual_rows): for i in range(num_rows): # use np.testing.assert_allclose so we can specify the tolerance np.testing.assert_allclose( - [expected_rows[i][key] for key in sorted_keys], [actual_rows[i][key] for key in sorted_keys], + [expected_rows[i][key] for key in sorted_keys], rtol=1e-05, atol=1e-07, ) @@ -37,7 +37,7 @@ def test_run_model_non_tensor(self): x = {0: 25.0, 1: 5.13, 2: 0.0, 3: 0.453, 4: 5.966} res = rep.run(x) output_expected = np.array([[49.752754]], dtype=np.float32) - np.testing.assert_allclose(output_expected, res[0], rtol=1e-05, atol=1e-08) + np.testing.assert_allclose(res[0], output_expected, rtol=1e-05, atol=1e-08) def test_run_model_proto(self): name = datasets.get_example("logreg_iris.onnx") @@ -47,7 +47,7 @@ def test_run_model_proto(self): x = np.array([[1.0, 2.0], [3.0, 4.0], [5.0, 6.0]], dtype=np.float32) res = rep.run(x) output_expected = np.array([0, 0, 0], dtype=np.float32) - np.testing.assert_allclose(output_expected, res[0], rtol=1e-05, atol=1e-08) + np.testing.assert_allclose(res[0], output_expected, rtol=1e-05, atol=1e-08) 
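Illustrative sketch (not part of the patch): the argument swaps throughout these Python test files follow from numpy's convention that np.testing.assert_allclose(actual, desired, rtol, atol) checks abs(actual - desired) <= atol + rtol * abs(desired), so the relative tolerance is scaled by the second ("desired") argument and the failure message labels each side correctly only when the reference output is passed there. A minimal demonstration of that asymmetry:

    import numpy as np

    expected = np.array([105.0])  # reference output; scales rtol when passed as "desired"
    actual = np.array([100.0])    # output under test

    # |100 - 105| = 5 <= atol + rtol * |105| = 5.04 -> passes with the reference second
    np.testing.assert_allclose(actual, expected, rtol=0.048, atol=0.0)

    # Swapped order: 5 <= rtol * |100| = 4.8 -> raises AssertionError
    try:
        np.testing.assert_allclose(expected, actual, rtol=0.048, atol=0.0)
    except AssertionError:
        pass  # tolerance is now scaled by the array under test, so the same data fails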
output_expected = [ {0: 0.950599730014801, 1: 0.027834169566631317, 2: 0.02156602405011654}, { @@ -72,7 +72,7 @@ def test_run_model_proto_api(self): outputs = ort_backend.run_model(model, inputs) output_expected = np.array([0, 0, 0], dtype=np.float32) - np.testing.assert_allclose(output_expected, outputs[0], rtol=1e-05, atol=1e-08) + np.testing.assert_allclose(outputs[0], output_expected, rtol=1e-05, atol=1e-08) output_expected = [ {0: 0.950599730014801, 1: 0.027834169566631317, 2: 0.02156602405011654}, { diff --git a/onnxruntime/test/python/onnxruntime_test_python_cudagraph.py b/onnxruntime/test/python/onnxruntime_test_python_cudagraph.py index 5ab2fe8939f6a..d6c1dd9cff3f3 100644 --- a/onnxruntime/test/python/onnxruntime_test_python_cudagraph.py +++ b/onnxruntime/test/python/onnxruntime_test_python_cudagraph.py @@ -63,18 +63,18 @@ class TestInferenceSessionWithCudaGraph(unittest.TestCase): def test_ort_value_update_in_place(self): x0 = np.array([[1.0, 2.0], [3.0, 4.0], [5.0, 6.0]], dtype=np.float32) ortvalue_cpu = onnxrt.OrtValue.ortvalue_from_numpy(x0) - np.testing.assert_allclose(x0, ortvalue_cpu.numpy()) + np.testing.assert_allclose(ortvalue_cpu.numpy(), x0) x1 = np.array([[10.0, 20.0], [30.0, 40.0], [50.0, 60.0]], dtype=np.float32) ortvalue_cpu.update_inplace(x1) - np.testing.assert_allclose(x1, ortvalue_cpu.numpy()) + np.testing.assert_allclose(ortvalue_cpu.numpy(), x1) if "CUDAExecutionProvider" in onnxrt.get_available_providers(): ortvalue_gpu = onnxrt.OrtValue.ortvalue_from_numpy(x0, "cuda", 0) - np.testing.assert_allclose(x0, ortvalue_gpu.numpy()) + np.testing.assert_allclose(ortvalue_gpu.numpy(), x0) ortvalue_gpu.update_inplace(x1) - np.testing.assert_allclose(x1, ortvalue_gpu.numpy()) + np.testing.assert_allclose(ortvalue_gpu.numpy(), x1) def test_select_ep_to_run_cuda_graph(self): if "TensorrtExecutionProvider" in onnxrt.get_available_providers(): @@ -105,11 +105,11 @@ def run_model_with_cuda_graph(self, providers): # One regular run for the necessary memory allocation and cuda graph capturing session.run_with_iobinding(io_binding, ro) expected_y = np.array([[5.0], [11.0], [17.0]] * INPUT_SIZE, dtype=np.float32) - np.testing.assert_allclose(expected_y, y_ortvalue.numpy(), rtol=1e-05, atol=1e-05) + np.testing.assert_allclose(y_ortvalue.numpy(), expected_y, rtol=1e-05, atol=1e-05) # After capturing, CUDA graph replay happens from this Run onwards session.run_with_iobinding(io_binding, ro) - np.testing.assert_allclose(expected_y, y_ortvalue.numpy(), rtol=1e-05, atol=1e-05) + np.testing.assert_allclose(y_ortvalue.numpy(), expected_y, rtol=1e-05, atol=1e-05) # Update input and then replay CUDA graph x_ortvalue.update_inplace( @@ -120,8 +120,8 @@ def run_model_with_cuda_graph(self, providers): ) session.run_with_iobinding(io_binding, ro) np.testing.assert_allclose( - np.array([[50.0], [110.0], [170.0]] * INPUT_SIZE, dtype=np.float32), y_ortvalue.numpy(), + np.array([[50.0], [110.0], [170.0]] * INPUT_SIZE, dtype=np.float32), rtol=1e-05, atol=1e-05, ) @@ -162,7 +162,7 @@ def run_model_with_cuda_graph_annotation(self, providers): session.run_with_iobinding(io_bindings[i], ro) io_bindings[i].synchronize_outputs() expected_y = np.array(expected_y_base[: i + 1][:] * INPUT_SIZE, dtype=np.float32) - np.testing.assert_allclose(expected_y, y_ortvalues[i].numpy(), rtol=1e-05, atol=1e-05) + np.testing.assert_allclose(y_ortvalues[i].numpy(), expected_y, rtol=1e-05, atol=1e-05) del ro ro = onnxrt.RunOptions() @@ -176,7 +176,7 @@ def run_model_with_cuda_graph_annotation(self, providers): 
session.run_with_iobinding(io_bindings[i], ro) io_bindings[i].synchronize_outputs() expected_y = np.array(expected_y_base_mul_10[: i + 1][:] * INPUT_SIZE, dtype=np.float32) - np.testing.assert_allclose(expected_y, y_ortvalues[i].numpy(), rtol=1e-05, atol=1e-05) + np.testing.assert_allclose(y_ortvalues[i].numpy(), expected_y, rtol=1e-05, atol=1e-05) def test_arena_with_cuda_graph(self): if "CUDAExecutionProvider" in onnxrt.get_available_providers(): @@ -214,7 +214,7 @@ def test_arena_with_cuda_graph(self): session.run_with_iobinding(io_binding) output = cuda_graph_helper.get_output("softmaxout_1") - np.testing.assert_allclose(expected_output, output, rtol=1e-02, atol=1e-02) + np.testing.assert_allclose(output, expected_output, rtol=1e-02, atol=1e-02) if __name__ == "__main__": diff --git a/onnxruntime/test/python/onnxruntime_test_python_dmlgraph.py b/onnxruntime/test/python/onnxruntime_test_python_dmlgraph.py index 033eae1cb4c8d..4a6aa7b63d9c3 100644 --- a/onnxruntime/test/python/onnxruntime_test_python_dmlgraph.py +++ b/onnxruntime/test/python/onnxruntime_test_python_dmlgraph.py @@ -63,18 +63,18 @@ class TestInferenceSessionWithDmlGraph(unittest.TestCase): def test_ort_value_update_in_place(self): x0 = np.array([[1.0, 2.0], [3.0, 4.0], [5.0, 6.0]], dtype=np.float32) ortvalue_cpu = onnxrt.OrtValue.ortvalue_from_numpy(x0) - np.testing.assert_allclose(x0, ortvalue_cpu.numpy()) + np.testing.assert_allclose(ortvalue_cpu.numpy(), x0) x1 = np.array([[10.0, 20.0], [30.0, 40.0], [50.0, 60.0]], dtype=np.float32) ortvalue_cpu.update_inplace(x1) - np.testing.assert_allclose(x1, ortvalue_cpu.numpy()) + np.testing.assert_allclose(ortvalue_cpu.numpy(), x1) if "DmlExecutionProvider" in onnxrt.get_available_providers(): ortvalue_gpu = onnxrt.OrtValue.ortvalue_from_numpy(x0, "dml", 0) - np.testing.assert_allclose(x0, ortvalue_gpu.numpy()) + np.testing.assert_allclose(ortvalue_gpu.numpy(), x0) ortvalue_gpu.update_inplace(x1) - np.testing.assert_allclose(x1, ortvalue_gpu.numpy()) + np.testing.assert_allclose(ortvalue_gpu.numpy(), x1) def test_select_ep_to_run_dml_graph(self): if "DmlExecutionProvider" in onnxrt.get_available_providers(): @@ -104,11 +104,11 @@ def run_model_with_dml_graph(self, providers): # One regular run for the necessary memory allocation and dml graph capturing session.run_with_iobinding(io_binding, ro) expected_y = np.array([[5.0], [11.0], [17.0]] * INPUT_SIZE, dtype=np.float32) - np.testing.assert_allclose(expected_y, y_ortvalue.numpy(), rtol=1e-05, atol=1e-05) + np.testing.assert_allclose(y_ortvalue.numpy(), expected_y, rtol=1e-05, atol=1e-05) # After capturing, DML graph replay happens from this Run onwards session.run_with_iobinding(io_binding, ro) - np.testing.assert_allclose(expected_y, y_ortvalue.numpy(), rtol=1e-05, atol=1e-05) + np.testing.assert_allclose(y_ortvalue.numpy(), expected_y, rtol=1e-05, atol=1e-05) # Update input and then replay DML graph x_ortvalue.update_inplace( @@ -119,8 +119,8 @@ def run_model_with_dml_graph(self, providers): ) session.run_with_iobinding(io_binding, ro) np.testing.assert_allclose( - np.array([[50.0], [110.0], [170.0]] * INPUT_SIZE, dtype=np.float32), y_ortvalue.numpy(), + np.array([[50.0], [110.0], [170.0]] * INPUT_SIZE, dtype=np.float32), rtol=1e-05, atol=1e-05, ) @@ -163,7 +163,7 @@ def run_model_with_dml_graph_annotation(self, providers): session.run_with_iobinding(io_bindings[i], ro) io_bindings[i].synchronize_outputs() expected_y = np.array(expected_y_base[: i + 1][:] * INPUT_SIZE, dtype=np.float32) - np.testing.assert_allclose(expected_y, 
y_ortvalues[i].numpy(), rtol=1e-05, atol=1e-05) + np.testing.assert_allclose(y_ortvalues[i].numpy(), expected_y, rtol=1e-05, atol=1e-05) del ro ro = onnxrt.RunOptions() @@ -177,7 +177,7 @@ def run_model_with_dml_graph_annotation(self, providers): session.run_with_iobinding(io_bindings[i], ro) io_bindings[i].synchronize_outputs() expected_y = np.array(expected_y_base_mul_10[: i + 1][:] * INPUT_SIZE, dtype=np.float32) - np.testing.assert_allclose(expected_y, y_ortvalues[i].numpy(), rtol=1e-05, atol=1e-05) + np.testing.assert_allclose(y_ortvalues[i].numpy(), expected_y, rtol=1e-05, atol=1e-05) if __name__ == "__main__": diff --git a/onnxruntime/test/python/onnxruntime_test_python_mlops.py b/onnxruntime/test/python/onnxruntime_test_python_mlops.py index 8b6b029c57752..70b8c0fc0b980 100644 --- a/onnxruntime/test/python/onnxruntime_test_python_mlops.py +++ b/onnxruntime/test/python/onnxruntime_test_python_mlops.py @@ -80,7 +80,7 @@ def test_dict_vectorizer(self): x = {0: 25.0, 1: 5.13, 2: 0.0, 3: 0.453, 4: 5.966} res = sess.run([output_name], {input_name: x}) output_expected = np.array([[49.752754]], dtype=np.float32) - np.testing.assert_allclose(output_expected, res[0], rtol=1e-05, atol=1e-08) + np.testing.assert_allclose(res[0], output_expected, rtol=1e-05, atol=1e-08) xwrong = x.copy() xwrong["a"] = 5.6 @@ -96,17 +96,17 @@ def test_dict_vectorizer(self): x = {np.int64(k): np.float32(v) for k, v in x.items()} res = sess.run([output_name], {input_name: x}) output_expected = np.array([[49.752754]], dtype=np.float32) - np.testing.assert_allclose(output_expected, res[0], rtol=1e-05, atol=1e-08) + np.testing.assert_allclose(res[0], output_expected, rtol=1e-05, atol=1e-08) x = {np.int64(k): np.float64(v) for k, v in x.items()} res = sess.run([output_name], {input_name: x}) output_expected = np.array([[49.752754]], dtype=np.float32) - np.testing.assert_allclose(output_expected, res[0], rtol=1e-05, atol=1e-08) + np.testing.assert_allclose(res[0], output_expected, rtol=1e-05, atol=1e-08) x = {np.int32(k): np.float64(v) for k, v in x.items()} res = sess.run([output_name], {input_name: x}) output_expected = np.array([[49.752754]], dtype=np.float32) - np.testing.assert_allclose(output_expected, res[0], rtol=1e-05, atol=1e-08) + np.testing.assert_allclose(res[0], output_expected, rtol=1e-05, atol=1e-08) def test_label_encoder(self): sess = onnxrt.InferenceSession(get_name("LabelEncoder.onnx"), providers=onnxrt.get_available_providers()) @@ -127,18 +127,18 @@ def test_label_encoder(self): x = np.array([["4"]]) res = sess.run([output_name], {input_name: x}) output_expected = np.array([[3]], dtype=np.int64) - np.testing.assert_allclose(output_expected, res[0], rtol=1e-05, atol=1e-08) + np.testing.assert_allclose(res[0], output_expected, rtol=1e-05, atol=1e-08) # Python type x = np.array(["4"], ndmin=2) res = sess.run([output_name], {input_name: x}) output_expected = np.array([3], ndmin=2, dtype=np.int64) - np.testing.assert_allclose(output_expected, res[0], rtol=1e-05, atol=1e-08) + np.testing.assert_allclose(res[0], output_expected, rtol=1e-05, atol=1e-08) x = np.array(["4"], ndmin=2, dtype=object) res = sess.run([output_name], {input_name: x}) output_expected = np.array([3], ndmin=2, dtype=np.int64) - np.testing.assert_allclose(output_expected, res[0], rtol=1e-05, atol=1e-08) + np.testing.assert_allclose(res[0], output_expected, rtol=1e-05, atol=1e-08) def test_run_model_mlnet(self): available_providers = onnxrt.get_available_providers() diff --git 
a/onnxruntime/test/python/onnxruntime_test_python_nv_tensorrt_rtx_ep_tests.py b/onnxruntime/test/python/onnxruntime_test_python_nv_tensorrt_rtx_ep_tests.py index d5c80a4a1f4ba..034f0288e2508 100644 --- a/onnxruntime/test/python/onnxruntime_test_python_nv_tensorrt_rtx_ep_tests.py +++ b/onnxruntime/test/python/onnxruntime_test_python_nv_tensorrt_rtx_ep_tests.py @@ -99,7 +99,7 @@ def test_nv_tensorrt_rtx_ep_register_and_inference(self): input_name = sess.get_inputs()[0].name res = sess.run([], {input_name: x}) output_expected = np.array([[1.0, 4.0], [9.0, 16.0], [25.0, 36.0]], dtype=np.float32) - np.testing.assert_allclose(output_expected, res[0], rtol=1e-05, atol=1e-08) + np.testing.assert_allclose(res[0], output_expected, rtol=1e-05, atol=1e-08) def test_nv_tensorrt_rtx_ep_prefer_gpu_and_inference(self): """ @@ -117,7 +117,7 @@ def test_nv_tensorrt_rtx_ep_prefer_gpu_and_inference(self): input_name = sess.get_inputs()[0].name res = sess.run([], {input_name: x}) output_expected = np.array([[1.0, 4.0], [9.0, 16.0], [25.0, 36.0]], dtype=np.float32) - np.testing.assert_allclose(output_expected, res[0], rtol=1e-05, atol=1e-08) + np.testing.assert_allclose(res[0], output_expected, rtol=1e-05, atol=1e-08) def test_nv_tensorrt_rtx_ep_selection_delegate_and_inference(self): """ @@ -152,7 +152,7 @@ def my_delegate( input_name = sess.get_inputs()[0].name res = sess.run([], {input_name: x}) output_expected = np.array([[1.0, 4.0], [9.0, 16.0], [25.0, 36.0]], dtype=np.float32) - np.testing.assert_allclose(output_expected, res[0], rtol=1e-05, atol=1e-08) + np.testing.assert_allclose(res[0], output_expected, rtol=1e-05, atol=1e-08) def test_bind_input_only(self): """ diff --git a/onnxruntime/test/python/quantization/test_fusions.py b/onnxruntime/test/python/quantization/test_fusions.py index bea110e566fb9..f02f4da4eb0fb 100644 --- a/onnxruntime/test/python/quantization/test_fusions.py +++ b/onnxruntime/test/python/quantization/test_fusions.py @@ -34,8 +34,8 @@ def check_fused_model_correctness(self, orig_model, fused_model, inputs, rtol=1e for idx, expected_output in enumerate(orig_results): actual_output = fused_results[idx] np.testing.assert_allclose( - expected_output, actual_output, + expected_output, rtol=rtol, atol=atol, err_msg=f"Fused model output {idx} differs", diff --git a/onnxruntime/test/python/quantization/test_qdq_loss_debug.py b/onnxruntime/test/python/quantization/test_qdq_loss_debug.py index 5d70641547eae..20b40fc157c16 100644 --- a/onnxruntime/test/python/quantization/test_qdq_loss_debug.py +++ b/onnxruntime/test/python/quantization/test_qdq_loss_debug.py @@ -156,7 +156,7 @@ def test_saved_tensors_match_internal_tensors(self): for expected, actual in zip(model_outputs, test_outputs, strict=False): exp = expected.reshape(-1) act = actual.reshape(-1) - np.testing.assert_equal(exp, act) + np.testing.assert_equal(act, exp) def test_create_activation_matching_present(self): float_model_path = str(Path(self._tmp_model_dir.name) / "float_model2.onnx") diff --git a/onnxruntime/test/python/quantization/test_quantizeblockwise_bnb4.py b/onnxruntime/test/python/quantization/test_quantizeblockwise_bnb4.py index a8f7591186766..906bf7aab8698 100644 --- a/onnxruntime/test/python/quantization/test_quantizeblockwise_bnb4.py +++ b/onnxruntime/test/python/quantization/test_quantizeblockwise_bnb4.py @@ -131,8 +131,8 @@ def test_quantize_blockwise_bnb4(self): matrix_float = np.random.uniform(-1, 1, (k, n)).astype(type) quant_value_ref, absmax_ref = quantize_blockwise_bnb4_ref(matrix_float, block_size, 
quant_type) quant_value, absmax = quantize_blockwise_bnb4_target(matrix_float, block_size, quant_type) - np.testing.assert_allclose(quant_value_ref, quant_value) - np.testing.assert_allclose(absmax_ref, absmax) + np.testing.assert_allclose(quant_value, quant_value_ref) + np.testing.assert_allclose(absmax, absmax_ref) if __name__ == "__main__": diff --git a/onnxruntime/test/testdata/custom_op_local_function/custom_op_test_local_function.py b/onnxruntime/test/testdata/custom_op_local_function/custom_op_test_local_function.py index 7916d93c3e531..1dedc475c9962 100644 --- a/onnxruntime/test/testdata/custom_op_local_function/custom_op_test_local_function.py +++ b/onnxruntime/test/testdata/custom_op_local_function/custom_op_test_local_function.py @@ -40,7 +40,7 @@ def test_basic_all(self): x = np.arange(2**2).reshape((2,) * 2).astype(np.float32) t = np.arange(8).reshape((2, 4)).astype(np.float32) got = sess.run(None, {"X": x})[0] - np.testing.assert_allclose(t, got, atol=1e-5) + np.testing.assert_allclose(got, t, atol=1e-5) if __name__ == "__main__": diff --git a/onnxruntime/test/testdata/onnx_backend_test_series_filters.jsonc b/onnxruntime/test/testdata/onnx_backend_test_series_filters.jsonc index f5f6a3ae3bc39..0558d008a2275 100644 --- a/onnxruntime/test/testdata/onnx_backend_test_series_filters.jsonc +++ b/onnxruntime/test/testdata/onnx_backend_test_series_filters.jsonc @@ -31,23 +31,12 @@ "current_failing_tests": [ "^test_adagrad", "^test_adagrad_multiple", - "^test_attention_4d_diff_heads_mask4d_padded_kv*", // pending onnx update - "^test_attention_3d_gqa*", // pending onnx update - "^test_attention_3d_gqa_causal", // pending onnx update - "^test_attention_3d_gqa_scaled", // pending onnx update - "^test_attention_3d_gqa_softcap", // pending onnx update - "^test_attention_3d_gqa_with_past_and_present", // pending onnx update - "^test_attention_4d_gqa*", // pending onnx update - "^test_attention_4d_gqa_causal", // pending onnx update - "^test_attention_4d_gqa_scaled", // pending onnx update - "^test_attention_4d_gqa_softcap", // pending onnx update - "^test_attention_4d_gqa_with_past_and_present", // pending onnx update - "^test_attention_*causal*", // pending onnx update - "^test_attention_4d_with_past_and_present_qk_matmul_bias_3d_mask_causal*", // pending onnx update - "^test_attention_4d_with_past_and_present_qk_matmul_bias_4d_mask_causal*", // pending onnx update - "^test_attention_4d_attn_mask_3d_causal_expanded*", // pending onnx update "^test_attention_4d_fp16*", // precision issue: 1 / 192 mismatched elements "^test_attention_4d_fp16_expanded*", // precision issue: 3 / 192 mismatched elements + "^test_attention_4d_gqa_with_past_and_present_fp16_expanded*", // webgpu mismatched elements 38 / 576 + "^test_attention_4d_with_past_and_present_qk_matmul_bias_3d_mask_causal_expanded*", // webgpu + "^test_attention_4d_attn_mask_3d_causal_expanded*", // webgpu + "^test_attention_4d_diff_heads_mask4d_padded_kv*", // Need nonpad_kv_seqlen "^test_l2normalization*", // LpNormalization(22) not implemented "^test_l1normalization*", // LpNormalization(22) not implemented "^test_lpnormalization*", // LpNormalization(22) not implemented @@ -123,13 +112,9 @@ "^test_if_opt", "^test_loop16_seq_none", "^test_identity_opt", - // rotary dim should be fixed in onnx==1.19.1 - "^test_rotary_embedding_no_position_ids_rotary_dim", - "^test_rotary_embedding_with_interleaved_rotary_dim", - "^test_rotary_embedding_with_rotary_dim", - "^test_rotary_embedding_3d_input_expanded", - 
"^test_rotary_embedding_interleaved_expanded", - "^test_rotary_embedding_no_position_ids_interleaved_expanded", + "^test_rotary_embedding_3d_input_expanded", // win cuda fail + "^test_rotary_embedding_interleaved_expanded", // win cuda fail + "^test_rotary_embedding_no_position_ids_interleaved_expanded", // win cuda fail "^test_rotary_embedding_expanded", //webgpu "^test_rotary_embedding_no_position_ids_expanded", //webgpu // Following tests are for opset 16 ops and are not yet implemented in ORT diff --git a/onnxruntime/test/testdata/ort_github_issue_26272.py b/onnxruntime/test/testdata/ort_github_issue_26272.py new file mode 100644 index 0000000000000..fa381e5df1094 --- /dev/null +++ b/onnxruntime/test/testdata/ort_github_issue_26272.py @@ -0,0 +1,26 @@ +import onnx +from onnx import TensorProto, helper + +# Create a simple ONNX model with DDS output +input = helper.make_tensor_value_info("data", TensorProto.FLOAT, ["d1", "d2"]) +output = helper.make_tensor_value_info("output", TensorProto.FLOAT, ["nzr"]) + +nonzeros_node = helper.make_node("NonZero", ["data"], ["nonzeros"], "nonzeros_node") +transpose_node = helper.make_node("Transpose", ["nonzeros"], ["nonzeros_t"], "transpose_node") +gathernd_node = helper.make_node("GatherND", ["data", "nonzeros_t"], ["output"], "gathernd_node") + +value_info = [ + helper.make_tensor_value_info("nonzeros", TensorProto.INT64, [2, "nzr"]), + helper.make_tensor_value_info("nonzeros_t", TensorProto.INT64, ["nzr", 2]), +] + +graph = helper.make_graph( + [nonzeros_node, transpose_node, gathernd_node], + "test_graph", + [input], + [output], + value_info=value_info, +) + +model = helper.make_model(graph) +onnx.save(model, "ort_github_issue_26272_dds.onnx") diff --git a/onnxruntime/test/testdata/ort_github_issue_26272_dds.onnx b/onnxruntime/test/testdata/ort_github_issue_26272_dds.onnx new file mode 100644 index 0000000000000..371f99c537898 --- /dev/null +++ b/onnxruntime/test/testdata/ort_github_issue_26272_dds.onnx @@ -0,0 +1,28 @@ + +:“ +( +datanonzeros nonzeros_node"NonZero +1 +nonzeros +nonzeros_ttranspose_node" Transpose +3 +data + +nonzeros_toutput gathernd_node"GatherND +test_graphZ +data + +d1 +d2b +output +  +nzrj +nonzeros + + +nzrj + +nonzeros_t + +nzr +B \ No newline at end of file diff --git a/tools/ci_build/build.py b/tools/ci_build/build.py index 327caf83c7850..591be538ac873 100644 --- a/tools/ci_build/build.py +++ b/tools/ci_build/build.py @@ -1015,6 +1015,9 @@ def generate_build_tree( if path_to_protoc_exe: cmake_args += [f"-DONNX_CUSTOM_PROTOC_EXECUTABLE={path_to_protoc_exe}"] + if args.cmake_deps_mirror_dir: + cmake_args += [f"-Donnxruntime_CMAKE_DEPS_MIRROR_DIR={args.cmake_deps_mirror_dir}"] + if args.fuzz_testing: if not ( args.build_shared_lib @@ -1330,7 +1333,7 @@ def build_targets(args, cmake_path, build_dir, configs, num_parallel_jobs, targe cmd_args.extend(["--target", *targets]) build_tool_args = [] - if num_parallel_jobs != 1: + if num_parallel_jobs != 0: if is_windows() and args.cmake_generator != "Ninja" and not args.build_wasm: # https://github.com/Microsoft/checkedc-clang/wiki/Parallel-builds-of-clang-on-Windows suggests # not maxing out CL_MPCount @@ -1748,7 +1751,7 @@ def run_onnxruntime_tests(args, source_dir, ctest_path, build_dir, configs): # Install cpu only version of torch when cuda is not enabled in Linux. 
extra = [] if args.use_cuda and is_linux() else ["--index-url", "https://download.pytorch.org/whl/cpu"] run_subprocess( - [sys.executable, "-m", "pip", "install", "torch", *extra], + [sys.executable, "-m", "pip", "install", "torch==2.8.0", "torchvision==0.23.0", *extra], cwd=cwd, dll_path=dll_path, python_path=python_path, diff --git a/tools/ci_build/build_args.py b/tools/ci_build/build_args.py index c5454903474d1..05d5052067b2e 100644 --- a/tools/ci_build/build_args.py +++ b/tools/ci_build/build_args.py @@ -204,6 +204,7 @@ def add_testing_args(parser: argparse.ArgumentParser) -> None: help="Run onnx_test_runner against test data. Only used in ONNX Runtime's CI pipelines", ) parser.add_argument("--path_to_protoc_exe", help="Path to protoc executable.") + parser.add_argument("--cmake_deps_mirror_dir", help="Path to the local mirror of cmake dependencies.") parser.add_argument("--fuzz_testing", action="store_true", help="Enable Fuzz testing.") parser.add_argument( "--enable_symbolic_shape_infer_tests", diff --git a/tools/ci_build/github/azure-pipelines/build-perf-test-binaries-pipeline.yml b/tools/ci_build/github/azure-pipelines/build-perf-test-binaries-pipeline.yml index 53b62762319ba..e54216fe4ef4e 100644 --- a/tools/ci_build/github/azure-pipelines/build-perf-test-binaries-pipeline.yml +++ b/tools/ci_build/github/azure-pipelines/build-perf-test-binaries-pipeline.yml @@ -31,5 +31,5 @@ stages: machine_pool: 'onnxruntime-Ubuntu2404-AMD-CPU' extra_build_arg: '' cmake_build_type: Release - cuda_version: 12.2 + cuda_version: 12.8 docker_base_image: onnxruntimebuildcache.azurecr.io/internal/azureml/onnxruntime/build/cuda12_x64_almalinux8_gcc12:20250714.2 \ No newline at end of file diff --git a/tools/ci_build/github/azure-pipelines/c-api-noopenmp-packaging-pipelines.yml b/tools/ci_build/github/azure-pipelines/c-api-noopenmp-packaging-pipelines.yml index 91736752e22d4..086d65c93062b 100644 --- a/tools/ci_build/github/azure-pipelines/c-api-noopenmp-packaging-pipelines.yml +++ b/tools/ci_build/github/azure-pipelines/c-api-noopenmp-packaging-pipelines.yml @@ -73,12 +73,12 @@ variables: - name: ReleaseVersionSuffix value: '' - name: win_trt_version - value: 12.2 + value: 12.8 - name: win_trt_home value: $(Agent.TempDirectory)\${{ variables.win_trt_folder_cuda12 }} - name: win_cuda_home - value: $(Agent.TempDirectory)\v12.2 + value: $(Agent.TempDirectory)\v12.8 extends: # The pipeline extends the 1ES PT which will inject different SDL and compliance tasks. # For non-production pipelines, use "Unofficial" as defined below. 
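Illustrative sketch (not part of the patch): the new --cmake_deps_mirror_dir option registered in build_args.py above is simply threaded through build.py into a CMake cache define, as the build.py hunk shows. Roughly, the pattern is the following, where the mirror path is a made-up example:

    import argparse

    parser = argparse.ArgumentParser()
    parser.add_argument("--cmake_deps_mirror_dir", help="Path to the local mirror of cmake dependencies.")
    args = parser.parse_args(["--cmake_deps_mirror_dir", "/data/deps_mirror"])  # hypothetical path

    cmake_args = []
    if args.cmake_deps_mirror_dir:
        cmake_args += [f"-Donnxruntime_CMAKE_DEPS_MIRROR_DIR={args.cmake_deps_mirror_dir}"]
    print(cmake_args)  # ['-Donnxruntime_CMAKE_DEPS_MIRROR_DIR=/data/deps_mirror']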
@@ -142,7 +142,7 @@ extends:
       - template: stages/nuget-combine-cuda-stage.yml
         parameters:
-          CudaVersion: 12.2
+          CudaVersion: 12.8
           RunOnnxRuntimeTests: ${{ parameters.RunOnnxRuntimeTests }}
           UseIncreasedTimeoutForTests: ${{ parameters.UseIncreasedTimeoutForTests }}
           win_trt_home: ${{ variables.win_trt_home }}
diff --git a/tools/ci_build/github/azure-pipelines/c-api-noopenmp-test-pipelines.yml b/tools/ci_build/github/azure-pipelines/c-api-noopenmp-test-pipelines.yml
index 46363c07b3e3e..7e107c33ed8c0 100644
--- a/tools/ci_build/github/azure-pipelines/c-api-noopenmp-test-pipelines.yml
+++ b/tools/ci_build/github/azure-pipelines/c-api-noopenmp-test-pipelines.yml
@@ -127,7 +127,7 @@ stages:
       NugetPackageName: 'Microsoft.ML.OnnxRuntime.Gpu'
       ArtifactSuffix: 'GPU'
       StageSuffix: 'GPU'
-      CudaVersion: 12.2
+      CudaVersion: 12.8
   - template: nuget/templates/test_win.yml
     parameters:
@@ -136,7 +136,7 @@ stages:
      ArtifactSuffix: 'GPU'
      StageSuffix: 'GPU'
      MoreSuffix: '_Windows'
-     CudaVersion: 12.2
+     CudaVersion: 12.8
   - template: nuget/templates/test_linux.yml
     parameters:
@@ -144,7 +144,7 @@ stages:
      ArtifactSuffix: 'GPU'
      StageSuffix: 'GPU'
      NugetPackageName: 'Microsoft.ML.OnnxRuntime.Gpu'
-     CudaVersion: 12.2
+     CudaVersion: 12.8
   - template: nuget/templates/test_linux.yml
     parameters:
@@ -153,7 +153,7 @@ stages:
      StageSuffix: 'GPU'
      MoreSuffix: '_Linux'
      NugetPackageName: 'Microsoft.ML.OnnxRuntime.Gpu.Linux'
-     CudaVersion: 12.2
+     CudaVersion: 12.8
@@ -202,7 +202,7 @@ stages:
     - template: templates/jobs/download_win_gpu_library.yml
       parameters:
-        CudaVersion: 12.2
+        CudaVersion: 12.8
         DownloadCUDA: true
         DownloadTRT: true
@@ -257,7 +257,7 @@ stages:
     - template: templates/jobs/download_win_gpu_library.yml
       parameters:
-        CudaVersion: 12.2
+        CudaVersion: 12.8
         DownloadCUDA: true
         DownloadTRT: true
diff --git a/tools/ci_build/github/azure-pipelines/cuda-packaging-pipeline.yml b/tools/ci_build/github/azure-pipelines/cuda-packaging-pipeline.yml
index 5535d7b4f264d..d7fc0efbf45ea 100644
--- a/tools/ci_build/github/azure-pipelines/cuda-packaging-pipeline.yml
+++ b/tools/ci_build/github/azure-pipelines/cuda-packaging-pipeline.yml
@@ -48,9 +48,9 @@ parameters:
 - name: CudaVersion
   displayName: CUDA version
   type: string
-  default: '12.2'
+  default: '12.8'
   values:
-    - 12.2
+    - 12.8
 variables:
   - template: templates/common-variables.yml
@@ -59,13 +59,13 @@ variables:
   - name: win_trt_home
     ${{ if eq(parameters.CudaVersion, '11.8') }}:
       value: $(Agent.TempDirectory)\${{ variables.win_trt_folder_cuda11 }}
-    ${{ if eq(parameters.CudaVersion, '12.2') }}:
+    ${{ if eq(parameters.CudaVersion, '12.8') }}:
       value: $(Agent.TempDirectory)\${{ variables.win_trt_folder_cuda12 }}
   - name: win_cuda_home
     ${{ if eq(parameters.CudaVersion, '11.8') }}:
       value: $(Agent.TempDirectory)\v11.8
-    ${{ if eq(parameters.CudaVersion, '12.2') }}:
-      value: $(Agent.TempDirectory)\v12.2
+    ${{ if eq(parameters.CudaVersion, '12.8') }}:
+      value: $(Agent.TempDirectory)\v12.8
 resources:
   repositories:
diff --git a/tools/ci_build/github/azure-pipelines/custom-nuget-packaging-pipeline.yml b/tools/ci_build/github/azure-pipelines/custom-nuget-packaging-pipeline.yml
index 1ad6f411d9848..5ce6ec278b1e7 100644
--- a/tools/ci_build/github/azure-pipelines/custom-nuget-packaging-pipeline.yml
+++ b/tools/ci_build/github/azure-pipelines/custom-nuget-packaging-pipeline.yml
@@ -1,7 +1,7 @@
 parameters:
 - name: CudaVersion
   type: string
-  default: '12.2'
+  default: '12.8'
 - name: QnnSdk
   displayName: QNN SDK Version
@@ -40,8 +40,8 @@ variables:
   - name: win_cuda_home
     ${{ if eq(parameters.CudaVersion, '11.8') }}:
       value: $(Agent.TempDirectory)\v11.8
-    ${{ if eq(parameters.CudaVersion, '12.2') }}:
-      value: $(Agent.TempDirectory)\v12.2
+    ${{ if eq(parameters.CudaVersion, '12.8') }}:
+      value: $(Agent.TempDirectory)\v12.8
 resources:
   repositories:
@@ -178,9 +178,6 @@ extends:
             inputs:
               targetType: 'inline'
               script: |
-                mkdir -p $(Build.BinariesDirectory)/osx-x64
-                Move-Item -Path $(Build.BinariesDirectory)/osx/onnxruntime-osx-x86_64* -Destination $(Build.BinariesDirectory)/osx-x64
-
                 mkdir -p $(Build.BinariesDirectory)/osx-arm64
                 Move-Item -Path $(Build.BinariesDirectory)/osx/onnxruntime-osx-arm64* -Destination $(Build.BinariesDirectory)/osx-arm64
@@ -200,12 +197,6 @@ extends:
                 foreach ($dir in $dirs) {
                   Write-Host "Directory: $($dir.FullName)"
                 }
-                $osx_x64_archive = Get-ChildItem -Path $(Build.BinariesDirectory)/osx-x64 -Filter onnxruntime-osx-x86_64*
-                if ($osx_x64_archive.Count -eq 0) {
-                  Write-Host "No osx-x64 archive found."
-                } else {
-                  Write-Host "osx-x64 archive found: $($osx_x64_archive[0].FullName)"
-                }
                 $osx_arm64_archive = Get-ChildItem -Path $(Build.BinariesDirectory)/osx-arm64 -Filter onnxruntime-osx-arm64*
                 if ($osx_arm64_archive.Count -eq 0) {
                   Write-Host "No osx-arm64 archive found."
@@ -233,13 +224,10 @@ extends:
               script: |
                 Expand-Archive -Path $(Build.BinariesDirectory)/win-x64/onnxruntime-win-x64-cuda*.zip -DestinationPath $(Build.BinariesDirectory)/win-x64
                 Expand-Archive -Path $(Build.BinariesDirectory)/win-arm64/onnxruntime-win-arm64x-qnn*.zip -DestinationPath $(Build.BinariesDirectory)/win-arm64
-                $osx_x64_archive = (Get-ChildItem -Path $(Build.BinariesDirectory)/osx-x64 -Filter onnxruntime-osx-x86_64*)[0].FullName
                 $osx_arm64_archive = (Get-ChildItem -Path $(Build.BinariesDirectory)/osx-arm64 -Filter onnxruntime-osx-arm64*)[0].FullName
-                tar -xzf $osx_x64_archive -C $(Build.BinariesDirectory)/osx-x64 2>$null
                 tar -xzf $osx_arm64_archive -C $(Build.BinariesDirectory)/osx-arm64 2>$null
                 $win_x64 = (Get-ChildItem -Path $(Build.BinariesDirectory)/win-x64 -Filter onnxruntime-win-x64-cuda*)[0].FullName
                 $win_arm64 = (Get-ChildItem -Path $(Build.BinariesDirectory)/win-arm64 -Filter onnxruntime-win-arm64x-qnn*)[0].FullName
-                $osx_x64 = (Get-ChildItem -Path $(Build.BinariesDirectory)/osx-x64 -Filter onnxruntime-osx-x86_64*)[0].FullName
                 $osx_arm64 = (Get-ChildItem -Path $(Build.BinariesDirectory)/osx-arm64 -Filter onnxruntime-osx-arm64*)[0].FullName
                 Write-Host "##vso[task.setvariable variable=win_x64;]$win_x64"
                 Write-Host "##vso[task.setvariable variable=win_arm64;]$win_arm64"
diff --git a/tools/ci_build/github/azure-pipelines/jar_package_testing.yml b/tools/ci_build/github/azure-pipelines/jar_package_testing.yml
index d387c07d6dc6e..463c02203e21a 100644
--- a/tools/ci_build/github/azure-pipelines/jar_package_testing.yml
+++ b/tools/ci_build/github/azure-pipelines/jar_package_testing.yml
@@ -40,7 +40,7 @@ stages:
     - template: templates/jobs/download_win_gpu_library.yml
       parameters:
-        CudaVersion: 12.2
+        CudaVersion: 12.8
         DownloadCUDA: true
         DownloadTRT: true
@@ -105,7 +105,7 @@ stages:
     - name: runCodesignValidationInjection
       value: false
     - name: docker_base_image
-      value: onnxruntimebuildcache.azurecr.io/internal/azureml/onnxruntime/build/cuda12_x64_almalinux8_gcc12:20250724.1
+      value: onnxruntimebuildcache.azurecr.io/internal/azureml/onnxruntime/build/cuda12_x64_almalinux8_gcc12:20251008.2
     timeoutInMinutes: 60
     steps:
     - checkout: self
diff --git a/tools/ci_build/github/azure-pipelines/linux-gpu-tensorrt-cuda-minimal-ci-pipeline.yml b/tools/ci_build/github/azure-pipelines/linux-gpu-tensorrt-cuda-minimal-ci-pipeline.yml
index 0410001d77d13..5e6671e3797ce 100644
--- a/tools/ci_build/github/azure-pipelines/linux-gpu-tensorrt-cuda-minimal-ci-pipeline.yml
+++ b/tools/ci_build/github/azure-pipelines/linux-gpu-tensorrt-cuda-minimal-ci-pipeline.yml
@@ -31,21 +31,21 @@ parameters:
 - name: CudaVersion
   displayName: CUDA version
   type: string
-  default: '12.2'
+  default: '12.8'
   values:
-    - 12.2
+    - 12.8
 variables:
   - template: templates/common-variables.yml
   - name: docker_base_image
     ${{ if eq(parameters.CudaVersion, '11.8') }}:
-      value: onnxruntimebuildcache.azurecr.io/internal/azureml/onnxruntime/build/cuda11_x64_almalinux8_gcc11:20250724.1
-    ${{ if eq(parameters.CudaVersion, '12.2') }}:
-      value: onnxruntimebuildcache.azurecr.io/internal/azureml/onnxruntime/build/cuda12_x64_almalinux8_gcc12:20250724.1
+      value: onnxruntimebuildcache.azurecr.io/internal/azureml/onnxruntime/build/cuda11_x64_almalinux8_gcc11:20251008.2
+    ${{ if eq(parameters.CudaVersion, '12.8') }}:
+      value: onnxruntimebuildcache.azurecr.io/internal/azureml/onnxruntime/build/cuda12_x64_almalinux8_gcc12:20251008.2
   - name: linux_trt_version
     ${{ if eq(parameters.CudaVersion, '11.8') }}:
       value: ${{ variables.linux_trt_version_cuda11 }}
-    ${{ if eq(parameters.CudaVersion, '12.2') }}:
+    ${{ if eq(parameters.CudaVersion, '12.8') }}:
       value: ${{ variables.linux_trt_version_cuda12 }}
 jobs:
diff --git a/tools/ci_build/github/azure-pipelines/nuget/templates/test_linux.yml b/tools/ci_build/github/azure-pipelines/nuget/templates/test_linux.yml
index 89ce3f3c86727..b60ef7576184e 100644
--- a/tools/ci_build/github/azure-pipelines/nuget/templates/test_linux.yml
+++ b/tools/ci_build/github/azure-pipelines/nuget/templates/test_linux.yml
@@ -58,9 +58,9 @@ stages:
         parameters:
           Dockerfile: tools/ci_build/github/linux/docker/Dockerfile.package_ubuntu_2004_gpu
           Context: tools/ci_build/github/linux/docker/
-          ${{ if eq(parameters.CudaVersion, '12.2') }}:
+          ${{ if eq(parameters.CudaVersion, '12.8') }}:
            DockerBuildArgs: "
-             --build-arg BASEIMAGE=nvidia/cuda:12.2.2-devel-ubuntu20.04
+             --build-arg BASEIMAGE=nvidia/cuda:12.8.1-cudnn-devel-ubuntu20.04
              --build-arg TRT_VERSION=${{ replace(variables.linux_trt_version_cuda12, '-1.', '-1+') }}
              --build-arg BUILD_UID=$( id -u )
              "
@@ -107,4 +107,4 @@ stages:
           DisableContribOps: $(DisableContribOps)
           DisableMlOps: $(DisableMlOps)
           IsReleaseBuild: $(IsReleaseBuild)
-          PACKAGENAME: ${{ parameters.NugetPackageName }}
\ No newline at end of file
+          PACKAGENAME: ${{ parameters.NugetPackageName }}
diff --git a/tools/ci_build/github/azure-pipelines/post-merge-jobs.yml b/tools/ci_build/github/azure-pipelines/post-merge-jobs.yml
index deb8b84bf19b8..fdfafd4d9a179 100644
--- a/tools/ci_build/github/azure-pipelines/post-merge-jobs.yml
+++ b/tools/ci_build/github/azure-pipelines/post-merge-jobs.yml
@@ -2,16 +2,16 @@ parameters:
 - name: CudaVersion
   displayName: CUDA version
   type: string
-  default: '12.2'
+  default: '12.8'
   values:
-    - 12.2
+    - 12.8
 variables:
   - template: templates/common-variables.yml
   - name: win_trt_folder
     ${{ if eq(parameters.CudaVersion, '11.8') }}:
       value: ${{ variables.win_trt_folder_cuda11 }}
-    ${{ if eq(parameters.CudaVersion, '12.2') }}:
+    ${{ if eq(parameters.CudaVersion, '12.8') }}:
       value: ${{ variables.win_trt_folder_cuda12 }}
 stages:
diff --git a/tools/ci_build/github/azure-pipelines/py-cuda-package-test-pipeline.yml b/tools/ci_build/github/azure-pipelines/py-cuda-package-test-pipeline.yml
index c2c89686a077e..02b6a6df76611 100644
--- a/tools/ci_build/github/azure-pipelines/py-cuda-package-test-pipeline.yml
+++ b/tools/ci_build/github/azure-pipelines/py-cuda-package-test-pipeline.yml
@@ -18,8 +18,8 @@ stages:
       machine_pool: 'Onnxruntime-Linux-GPU'
       python_wheel_suffix: '_gpu'
       timeout: 480
-      docker_base_image: onnxruntimebuildcache.azurecr.io/internal/azureml/onnxruntime/build/cuda12_x64_almalinux8_gcc12:20250724.1
-      cuda_version: '12.2'
+      docker_base_image: onnxruntimebuildcache.azurecr.io/internal/azureml/onnxruntime/build/cuda12_x64_almalinux8_gcc12:20251008.2
+      cuda_version: '12.8'
   - stage: Republish_Wheels
     dependsOn:
diff --git a/tools/ci_build/github/azure-pipelines/py-cuda-packaging-pipeline.yml b/tools/ci_build/github/azure-pipelines/py-cuda-packaging-pipeline.yml
index 4c536bad45368..290af4a3e4449 100644
--- a/tools/ci_build/github/azure-pipelines/py-cuda-packaging-pipeline.yml
+++ b/tools/ci_build/github/azure-pipelines/py-cuda-packaging-pipeline.yml
@@ -49,4 +49,4 @@ extends:
     - template: stages/py-gpu-packaging-stage.yml
       parameters:
         cmake_build_type: ${{ parameters.cmake_build_type }}
-        cuda_version: '12.2'
+        cuda_version: '12.8'
diff --git a/tools/ci_build/github/azure-pipelines/stages/jobs/py-linux-cuda-package-test-job.yml b/tools/ci_build/github/azure-pipelines/stages/jobs/py-linux-cuda-package-test-job.yml
index 858de4d173484..b53aee639372d 100644
--- a/tools/ci_build/github/azure-pipelines/stages/jobs/py-linux-cuda-package-test-job.yml
+++ b/tools/ci_build/github/azure-pipelines/stages/jobs/py-linux-cuda-package-test-job.yml
@@ -2,9 +2,9 @@ parameters:
 - name: CudaVersion
   displayName: 'CUDA version'
   type: string
-  default: '12.2'
+  default: '12.8'
   values:
-    - 12.2
+    - 12.8
 - name: machine_pool
   type: string
@@ -44,13 +44,13 @@ jobs:
     - template: ../../templates/common-variables.yml
     - name: docker_base_image
       ${{ if eq(parameters.CudaVersion, '11.8') }}:
-        value: onnxruntimebuildcache.azurecr.io/internal/azureml/onnxruntime/build/cuda11_x64_almalinux8_gcc11:20250724.1
-      ${{ if eq(parameters.CudaVersion, '12.2') }}:
-        value: onnxruntimebuildcache.azurecr.io/internal/azureml/onnxruntime/build/cuda12_x64_almalinux8_gcc12:20250724.1
+        value: onnxruntimebuildcache.azurecr.io/internal/azureml/onnxruntime/build/cuda11_x64_almalinux8_gcc11:20251008.2
+      ${{ if eq(parameters.CudaVersion, '12.8') }}:
+        value: onnxruntimebuildcache.azurecr.io/internal/azureml/onnxruntime/build/cuda12_x64_almalinux8_gcc12:20251008.2
     - name: linux_trt_version
       ${{ if eq(parameters.CudaVersion, '11.8') }}:
         value: ${{ variables.linux_trt_version_cuda11 }}
-      ${{ if eq(parameters.CudaVersion, '12.2') }}:
+      ${{ if eq(parameters.CudaVersion, '12.8') }}:
         value: ${{ variables.linux_trt_version_cuda12 }}
   pool: ${{ parameters.machine_pool }}
   steps:
@@ -105,4 +105,4 @@ jobs:
       inputs:
         targetType: filePath
         filePath: tools/ci_build/github/linux/run_python_dockertest.sh
-        arguments: -d GPU -c ${{parameters.cmake_build_type}} -i onnxruntimecuda${{ replace(parameters.CudaVersion, '.', '') }}xtrt86buildx86_64 -u 12.2
+        arguments: -d GPU -c ${{parameters.cmake_build_type}} -i onnxruntimecuda${{ replace(parameters.CudaVersion, '.', '') }}xtrt86buildx86_64 -u 12.8
diff --git a/tools/ci_build/github/azure-pipelines/stages/nodejs-linux-packaging-stage.yml b/tools/ci_build/github/azure-pipelines/stages/nodejs-linux-packaging-stage.yml
index bca95a4a2fd02..8cbb81ba89c12 100644
--- a/tools/ci_build/github/azure-pipelines/stages/nodejs-linux-packaging-stage.yml
+++ b/tools/ci_build/github/azure-pipelines/stages/nodejs-linux-packaging-stage.yml
@@ -1,7 +1,7 @@
 parameters:
 - name: CudaVersion
   type: string
-  default: '12.2'
+  default: '12.8'
 stages:
 - stage: Linux_Nodejs_Packaging_x64
@@ -20,14 +20,14 @@ stages:
     - name: CUDA_VERSION_MAJOR
       ${{ if eq(parameters.CudaVersion, '11.8') }}:
         value: '11'
-      ${{ if eq(parameters.CudaVersion, '12.2') }}:
+      ${{ if eq(parameters.CudaVersion, '12.8') }}:
         value: '12'
     - name: CUDA_VERSION
       value: ${{ parameters.CudaVersion }}
     - name: linux_trt_version
       ${{ if eq(parameters.CudaVersion, '11.8') }}:
         value: ${{ variables.linux_trt_version_cuda11 }}
-      ${{ if eq(parameters.CudaVersion, '12.2') }}:
+      ${{ if eq(parameters.CudaVersion, '12.8') }}:
         value: ${{ variables.linux_trt_version_cuda12 }}
     steps:
     - checkout: self
diff --git a/tools/ci_build/github/azure-pipelines/stages/nuget-linux-cuda-packaging-stage.yml b/tools/ci_build/github/azure-pipelines/stages/nuget-linux-cuda-packaging-stage.yml
index 121e80fca1021..b1e5f541b90e0 100644
--- a/tools/ci_build/github/azure-pipelines/stages/nuget-linux-cuda-packaging-stage.yml
+++ b/tools/ci_build/github/azure-pipelines/stages/nuget-linux-cuda-packaging-stage.yml
@@ -1,7 +1,7 @@
 parameters:
 - name: CudaVersion
   type: string
-  default: '12.2'
+  default: '12.8'
 - name: buildJava
   type: boolean
 - name: buildNodejs
@@ -22,7 +22,7 @@ stages:
     - name: CUDA_VERSION_MAJOR
       ${{ if eq(parameters.CudaVersion, '11.8') }}:
         value: '11'
-      ${{ if eq(parameters.CudaVersion, '12.2') }}:
+      ${{ if eq(parameters.CudaVersion, '12.8') }}:
         value: '12'
     - name: CUDA_VERSION
       value: ${{ parameters.CudaVersion }}
@@ -74,14 +74,14 @@ stages:
     - name: CUDA_VERSION_MAJOR
       ${{ if eq(parameters.CudaVersion, '11.8') }}:
         value: '11'
-      ${{ if eq(parameters.CudaVersion, '12.2') }}:
+      ${{ if eq(parameters.CudaVersion, '12.8') }}:
         value: '12'
     - name: CUDA_VERSION
       value: ${{ parameters.CudaVersion }}
     - name: linux_trt_version
       ${{ if eq(parameters.CudaVersion, '11.8') }}:
         value: ${{ variables.linux_trt_version_cuda11 }}
-      ${{ if eq(parameters.CudaVersion, '12.2') }}:
+      ${{ if eq(parameters.CudaVersion, '12.8') }}:
         value: ${{ variables.linux_trt_version_cuda12 }}
     steps:
     - checkout: self
@@ -140,12 +140,12 @@ stages:
     - name: CUDA_VERSION_MAJOR
       ${{ if eq(parameters.CudaVersion, '11.8') }}:
         value: '11'
-      ${{ if eq(parameters.CudaVersion, '12.2') }}:
+      ${{ if eq(parameters.CudaVersion, '12.8') }}:
         value: '12'
     - name: linux_trt_version
       ${{ if eq(parameters.CudaVersion, '11.8') }}:
         value: ${{ variables.linux_trt_version_cuda11 }}
-      ${{ if eq(parameters.CudaVersion, '12.2') }}:
+      ${{ if eq(parameters.CudaVersion, '12.8') }}:
         value: ${{ variables.linux_trt_version_cuda12 }}
     steps:
     - checkout: self # due to checkout multiple repos, the root directory is $(Build.SourcesDirectory)/onnxruntime
diff --git a/tools/ci_build/github/azure-pipelines/stages/py-gpu-packaging-stage.yml b/tools/ci_build/github/azure-pipelines/stages/py-gpu-packaging-stage.yml
index d3d4b8f5b64d5..3c5cf591039e0 100644
--- a/tools/ci_build/github/azure-pipelines/stages/py-gpu-packaging-stage.yml
+++ b/tools/ci_build/github/azure-pipelines/stages/py-gpu-packaging-stage.yml
@@ -19,9 +19,9 @@ parameters:
 - name: cuda_version
   type: string
   displayName: 'CUDA version. Windows Only.'
-  default: '12.2'
+  default: '12.8'
   values:
-    - 12.2
+    - 12.8
 - name: PythonVersions
   type: object
@@ -48,4 +48,4 @@ stages:
       extra_build_arg: ${{ parameters.build_py_parameters }}
       cmake_build_type: ${{ parameters.cmake_build_type }}
       cuda_version: ${{ parameters.cuda_version }}
-      docker_base_image: onnxruntimebuildcache.azurecr.io/internal/azureml/onnxruntime/build/cuda12_x64_almalinux8_gcc12:20250724.1
+      docker_base_image: onnxruntimebuildcache.azurecr.io/internal/azureml/onnxruntime/build/cuda12_x64_almalinux8_gcc12:20251008.2
diff --git a/tools/ci_build/github/azure-pipelines/stages/py-linux-gpu-stage.yml b/tools/ci_build/github/azure-pipelines/stages/py-linux-gpu-stage.yml
index 715470eb9f012..ab1fb919af413 100644
--- a/tools/ci_build/github/azure-pipelines/stages/py-linux-gpu-stage.yml
+++ b/tools/ci_build/github/azure-pipelines/stages/py-linux-gpu-stage.yml
@@ -22,9 +22,9 @@ parameters:
 - name: cuda_version
   type: string
-  default: '12.2'
+  default: '12.8'
   values:
-    - 12.2
+    - 12.8
 stages:
 - stage: Linux_py_GPU_Wheels_${{ parameters.arch }}
@@ -55,7 +55,7 @@ stages:
     - name: trt_version
       ${{ if eq(parameters.cuda_version, '11.8') }}:
         value: ${{ variables.linux_trt_version_cuda11 }}
-      ${{ if eq(parameters.cuda_version, '12.2') }}:
+      ${{ if eq(parameters.cuda_version, '12.8') }}:
         value: ${{ variables.linux_trt_version_cuda12 }}
     steps:
     - checkout: self
diff --git a/tools/ci_build/github/azure-pipelines/stages/py-win-gpu-stage.yml b/tools/ci_build/github/azure-pipelines/stages/py-win-gpu-stage.yml
index e2683c04f21f2..c3957fc8341de 100644
--- a/tools/ci_build/github/azure-pipelines/stages/py-win-gpu-stage.yml
+++ b/tools/ci_build/github/azure-pipelines/stages/py-win-gpu-stage.yml
@@ -20,9 +20,9 @@ parameters:
   default: ''
 - name: CudaVersion
   type: string
-  default: '12.2'
+  default: '12.8'
   values:
-    - 12.2
+    - 12.8
 - name: cmake_build_type
   type: string
@@ -47,7 +47,7 @@ stages:
     workspace:
       clean: all
     pool:
-      name: onnxruntime-Win-CPU-2022
+      name: onnxruntime-Win-CPU-VS2022-Latest
       os: windows
     templateContext:
       sdl:
@@ -76,7 +76,7 @@ stages:
     - name: win_trt_folder
       ${{ if eq(parameters.CudaVersion, '11.8') }}:
         value: ${{ variables.win_trt_folder_cuda11 }}
-      ${{ if eq(parameters.CudaVersion, '12.2') }}:
+      ${{ if eq(parameters.CudaVersion, '12.8') }}:
         value: ${{ variables.win_trt_folder_cuda12 }}
     - name: trt_build_flag
       ${{ if eq(parameters.use_tensorrt, true) }}:
@@ -119,7 +119,7 @@ stages:
           --cmake_generator "$(VSGenerator)"
           --enable_pybind
           --enable_onnx_tests
-          --parallel 8 --use_vcpkg --use_vcpkg_ms_internal_asset_cache --use_binskim_compliant_compile_flags --update --build --msvc_toolset 14.40
+          --parallel 8 --use_vcpkg --use_vcpkg_ms_internal_asset_cache --use_binskim_compliant_compile_flags --update --build
           $(TelemetryOption) ${{ parameters.BUILD_PY_PARAMETERS }} ${{ parameters.EP_BUILD_FLAGS }} ${{ variables.trt_build_flag }}
         workingDirectory: '$(Build.BinariesDirectory)'
diff --git a/tools/ci_build/github/azure-pipelines/templates/jobs/download_win_gpu_library.yml b/tools/ci_build/github/azure-pipelines/templates/jobs/download_win_gpu_library.yml
index 681138a5ab3d1..be213337091e8 100644
--- a/tools/ci_build/github/azure-pipelines/templates/jobs/download_win_gpu_library.yml
+++ b/tools/ci_build/github/azure-pipelines/templates/jobs/download_win_gpu_library.yml
@@ -7,10 +7,10 @@ parameters:
   default: false
 - name: CudaVersion
   type: string
-  default: '12.2'
+  default: '12.8'
   values:
     - 11.8
-    - 12.2
+    - 12.8
 - name: TrtVersion
   type: string
   default: '10.9.0.34'
@@ -46,11 +46,11 @@ steps:
     - powershell: |
         Write-Host "##vso[task.setvariable variable=trtCudaVersion;]11.8"
       displayName: Set trtCudaVersion
-  - ${{ if and(eq(parameters.CudaVersion, '12.2'), eq(parameters.TrtVersion, '8.6.1.6')) }}:
+  - ${{ if and(eq(parameters.CudaVersion, '12.8'), eq(parameters.TrtVersion, '8.6.1.6')) }}:
     - powershell: |
         Write-Host "##vso[task.setvariable variable=trtCudaVersion;]12.0"
       displayName: Set trtCudaVersion
-  - ${{ if and(eq(parameters.CudaVersion, '12.2'), eq(parameters.TrtVersion, '10.9.0.34')) }}:
+  - ${{ if and(eq(parameters.CudaVersion, '12.8'), eq(parameters.TrtVersion, '10.9.0.34')) }}:
    - powershell: |
        Write-Host "##vso[task.setvariable variable=trtCudaVersion;]12.8"
      displayName: Set trtCudaVersion
diff --git a/tools/ci_build/github/azure-pipelines/templates/jobs/set-winenv.yml b/tools/ci_build/github/azure-pipelines/templates/jobs/set-winenv.yml
index 96436883fb8b8..d7c940cda30f4 100644
--- a/tools/ci_build/github/azure-pipelines/templates/jobs/set-winenv.yml
+++ b/tools/ci_build/github/azure-pipelines/templates/jobs/set-winenv.yml
@@ -9,7 +9,7 @@ parameters:
   default: false
 - name: PrimaryCUDAVersion
   type: string
-  default: '12.2'
+  default: '12.8'
 # - name: SecondaryCUDAVersion
 #   type: string
 #   default: '11.8'
diff --git a/tools/ci_build/github/azure-pipelines/templates/py-packaging-linux-test-cuda.yml b/tools/ci_build/github/azure-pipelines/templates/py-packaging-linux-test-cuda.yml
index 1415586521f30..263f73a9e29b0 100644
--- a/tools/ci_build/github/azure-pipelines/templates/py-packaging-linux-test-cuda.yml
+++ b/tools/ci_build/github/azure-pipelines/templates/py-packaging-linux-test-cuda.yml
@@ -18,9 +18,9 @@ parameters:
 - name: cuda_version
   type: string
-  default: '12.2'
+  default: '12.8'
   values:
-    - 12.2
+    - 12.8
 # TODO: Ideally it should fetch information from the build that triggers it
 - name: cmake_build_type
@@ -46,7 +46,7 @@ jobs:
     - name: trt_version
       ${{ if eq(parameters.cuda_version, '11.8') }}:
         value: ${{ variables.linux_trt_version_cuda11 }}
-      ${{ if eq(parameters.cuda_version, '12.2') }}:
+      ${{ if eq(parameters.cuda_version, '12.8') }}:
         value: ${{ variables.linux_trt_version_cuda12 }}
   workspace:
     clean: all
diff --git a/tools/ci_build/github/azure-pipelines/templates/win-ci.yml b/tools/ci_build/github/azure-pipelines/templates/win-ci.yml
index 0310735d94b2e..ca698123a04e7 100644
--- a/tools/ci_build/github/azure-pipelines/templates/win-ci.yml
+++ b/tools/ci_build/github/azure-pipelines/templates/win-ci.yml
@@ -78,7 +78,7 @@ parameters:
   default: '11.8'
   values:
     - 11.8
-    - 12.2
+    - 12.8
 - name: SpecificArtifact
   displayName: Use Specific Artifact
@@ -136,7 +136,7 @@ stages:
     ${{ if contains(parameters.ort_build_pool_name, 'GPU') }}:
       pool:
-        name: onnxruntime-Win-CPU-2022
+        name: onnxruntime-Win-CPU-VS2022-Latest
         os: windows
     ${{ else }}:
       pool:
diff --git a/tools/ci_build/github/azure-pipelines/win-gpu-doc-gen-ci-pipeline.yml b/tools/ci_build/github/azure-pipelines/win-gpu-doc-gen-ci-pipeline.yml
index c20f4a2c1bd19..8b320b0ceb4ac 100644
--- a/tools/ci_build/github/azure-pipelines/win-gpu-doc-gen-ci-pipeline.yml
+++ b/tools/ci_build/github/azure-pipelines/win-gpu-doc-gen-ci-pipeline.yml
@@ -32,10 +32,10 @@ parameters:
 - name: CudaVersion
   displayName: CUDA version
   type: string
-  default: '12.2'
+  default: '12.8'
   values:
     - 11.8
-    - 12.2
+    - 12.8
 stages:
 - stage: kernelDocumentation
diff --git a/tools/ci_build/github/azure-pipelines/win-gpu-tensorrt-cuda-minimal-ci-pipeline.yml b/tools/ci_build/github/azure-pipelines/win-gpu-tensorrt-cuda-minimal-ci-pipeline.yml
index c12bb3552920c..08953749f6527 100644
--- a/tools/ci_build/github/azure-pipelines/win-gpu-tensorrt-cuda-minimal-ci-pipeline.yml
+++ b/tools/ci_build/github/azure-pipelines/win-gpu-tensorrt-cuda-minimal-ci-pipeline.yml
@@ -31,16 +31,16 @@ parameters:
 - name: CudaVersion
   displayName: CUDA version
   type: string
-  default: '12.2'
+  default: '12.8'
   values:
-    - 12.2
+    - 12.8
 variables:
   - template: templates/common-variables.yml
   - name: win_trt_folder
     ${{ if eq(parameters.CudaVersion, '11.8') }}:
       value: ${{ variables.win_trt_folder_cuda11 }}
-    ${{ if eq(parameters.CudaVersion, '12.2') }}:
+    ${{ if eq(parameters.CudaVersion, '12.8') }}:
       value: ${{ variables.win_trt_folder_cuda12 }}
 jobs:
diff --git a/tools/ci_build/github/linux/docker/Dockerfile.manylinux2_28_cpu b/tools/ci_build/github/linux/docker/Dockerfile.manylinux2_28_cpu
index 2a65e7c26b20b..a277286866e41 100644
--- a/tools/ci_build/github/linux/docker/Dockerfile.manylinux2_28_cpu
+++ b/tools/ci_build/github/linux/docker/Dockerfile.manylinux2_28_cpu
@@ -1,4 +1,4 @@
-ARG BASEIMAGE=onnxruntimebuildcache.azurecr.io/internal/azureml/onnxruntime/build/cpu_x64_almalinux8_gcc14:20250724.1
+ARG BASEIMAGE=onnxruntimebuildcache.azurecr.io/internal/azureml/onnxruntime/build/cpu_x64_almalinux8_gcc14:20251008.2
 FROM $BASEIMAGE
 ENV JAVA_HOME=/usr/lib/jvm/msopenjdk-17
diff --git a/tools/ci_build/github/linux/docker/Dockerfile.manylinux2_28_rocm b/tools/ci_build/github/linux/docker/Dockerfile.manylinux2_28_rocm
index 3337af3be6074..5410bd64036ce 100644
--- a/tools/ci_build/github/linux/docker/Dockerfile.manylinux2_28_rocm
+++ b/tools/ci_build/github/linux/docker/Dockerfile.manylinux2_28_rocm
@@ -1,4 +1,4 @@
-ARG BASEIMAGE=onnxruntimebuildcache.azurecr.io/internal/azureml/onnxruntime/build/cpu_x64_almalinux8_gcc14:20250724.1
+ARG BASEIMAGE=onnxruntimebuildcache.azurecr.io/internal/azureml/onnxruntime/build/cpu_x64_almalinux8_gcc14:20251008.2
 FROM $BASEIMAGE
 ARG ROCM_VERSION=6.2.3
diff --git a/tools/ci_build/github/linux/docker/Dockerfile.manylinux2_28_webgpu b/tools/ci_build/github/linux/docker/Dockerfile.manylinux2_28_webgpu
index 0007a4e06f7c0..07ad8e933baf0 100644
--- a/tools/ci_build/github/linux/docker/Dockerfile.manylinux2_28_webgpu
+++ b/tools/ci_build/github/linux/docker/Dockerfile.manylinux2_28_webgpu
@@ -1,4 +1,4 @@
-ARG BASEIMAGE=onnxruntimebuildcache.azurecr.io/internal/azureml/onnxruntime/build/cpu_x64_almalinux8_gcc14:20250724.1
+ARG BASEIMAGE=onnxruntimebuildcache.azurecr.io/internal/azureml/onnxruntime/build/cpu_x64_almalinux8_gcc14:20251008.2
 FROM $BASEIMAGE
 ENV JAVA_HOME=/usr/lib/jvm/msopenjdk-17
diff --git a/tools/ci_build/github/linux/docker/Dockerfile.package_ubuntu_2004_gpu b/tools/ci_build/github/linux/docker/Dockerfile.package_ubuntu_2004_gpu
index 8a84b9b940306..5d98c25b535af 100644
--- a/tools/ci_build/github/linux/docker/Dockerfile.package_ubuntu_2004_gpu
+++ b/tools/ci_build/github/linux/docker/Dockerfile.package_ubuntu_2004_gpu
@@ -5,7 +5,7 @@
 # Dockerfile to run ONNXRuntime with TensorRT integration
 # Build base image with required system packages
-ARG BASEIMAGE=nvidia/cuda:12.2.2-cudnn8-devel-ubuntu20.04
+ARG BASEIMAGE=nvidia/cuda:12.8.1-cudnn-devel-ubuntu20.04
 ARG TRT_VERSION=10.9.0.34-1+cuda12.8
 ARG LD_LIBRARY_PATH_ARG=/usr/local/lib64:/usr/local/cuda/lib64
 FROM $BASEIMAGE AS base
diff --git a/tools/ci_build/github/linux/docker/inference/aarch64/default/cpu/Dockerfile b/tools/ci_build/github/linux/docker/inference/aarch64/default/cpu/Dockerfile
index 8b2083c2ccfc1..cef2d11780969 100644
--- a/tools/ci_build/github/linux/docker/inference/aarch64/default/cpu/Dockerfile
+++ b/tools/ci_build/github/linux/docker/inference/aarch64/default/cpu/Dockerfile
@@ -2,7 +2,7 @@
 # Licensed under the MIT License.
 # This file is used by Zip-Nuget Packaging NoContribOps Pipeline,Zip-Nuget-Java Packaging Pipeline
-ARG BASEIMAGE=onnxruntimebuildcache.azurecr.io/internal/azureml/onnxruntime/build/cpu_aarch64_almalinux8_gcc14_dotnet:20250724.1
+ARG BASEIMAGE=onnxruntimebuildcache.azurecr.io/internal/azureml/onnxruntime/build/cpu_aarch64_almalinux8_gcc14_dotnet:20251008.2
 FROM $BASEIMAGE
 ENV LANG=en_US.UTF-8
diff --git a/tools/ci_build/github/linux/docker/inference/aarch64/python/cpu/Dockerfile b/tools/ci_build/github/linux/docker/inference/aarch64/python/cpu/Dockerfile
index f5143d5ac9ab9..79d99d08dcc4e 100644
--- a/tools/ci_build/github/linux/docker/inference/aarch64/python/cpu/Dockerfile
+++ b/tools/ci_build/github/linux/docker/inference/aarch64/python/cpu/Dockerfile
@@ -1,4 +1,4 @@
-ARG BASEIMAGE=onnxruntimebuildcache.azurecr.io/internal/azureml/onnxruntime/build/cpu_aarch64_almalinux8_gcc14:20250724.1
+ARG BASEIMAGE=onnxruntimebuildcache.azurecr.io/internal/azureml/onnxruntime/build/cpu_aarch64_almalinux8_gcc14:20251008.2
 FROM $BASEIMAGE
 ADD scripts /tmp/scripts
diff --git a/tools/ci_build/github/linux/docker/inference/aarch64/python/cpu/scripts/requirements.txt b/tools/ci_build/github/linux/docker/inference/aarch64/python/cpu/scripts/requirements.txt
index bae6f4cb51816..1b1dadeaf8db2 100644
--- a/tools/ci_build/github/linux/docker/inference/aarch64/python/cpu/scripts/requirements.txt
+++ b/tools/ci_build/github/linux/docker/inference/aarch64/python/cpu/scripts/requirements.txt
@@ -3,7 +3,7 @@ mypy
 pytest
 setuptools>=68.2.2
 wheel
-onnx==1.19.0
+onnx==1.19.1
 protobuf==4.25.8
 sympy==1.14
 flatbuffers
diff --git a/tools/ci_build/github/linux/docker/inference/x86_64/default/cpu/Dockerfile b/tools/ci_build/github/linux/docker/inference/x86_64/default/cpu/Dockerfile
index cfc2ce7079148..72d98206f9205 100644
--- a/tools/ci_build/github/linux/docker/inference/x86_64/default/cpu/Dockerfile
+++ b/tools/ci_build/github/linux/docker/inference/x86_64/default/cpu/Dockerfile
@@ -2,7 +2,7 @@
 # Licensed under the MIT License.
 # This file is used by Zip-Nuget Packaging NoContribOps Pipeline,Zip-Nuget-Java Packaging Pipeline
-ARG BASEIMAGE=onnxruntimebuildcache.azurecr.io/internal/azureml/onnxruntime/build/cpu_x64_almalinux8_gcc14_dotnet:20250724.1
+ARG BASEIMAGE=onnxruntimebuildcache.azurecr.io/internal/azureml/onnxruntime/build/cpu_x64_almalinux8_gcc14_dotnet:20251008.2
 FROM $BASEIMAGE
 ENV LANG=en_US.UTF-8
diff --git a/tools/ci_build/github/linux/docker/inference/x86_64/default/cuda12/Dockerfile b/tools/ci_build/github/linux/docker/inference/x86_64/default/cuda12/Dockerfile
index 8401393a661b1..85f4a074e30bf 100644
--- a/tools/ci_build/github/linux/docker/inference/x86_64/default/cuda12/Dockerfile
+++ b/tools/ci_build/github/linux/docker/inference/x86_64/default/cuda12/Dockerfile
@@ -2,7 +2,7 @@
 # Copyright (c) Microsoft Corporation.
 # Licensed under the MIT License.
 # This file is used by Zip-Nuget Packaging NoContribOps Pipeline,Zip-Nuget-Java Packaging Pipeline
-ARG BASEIMAGE=onnxruntimebuildcache.azurecr.io/internal/azureml/onnxruntime/build/cuda12_x64_almalinux8_gcc12_dotnet:20250724.1
+ARG BASEIMAGE=onnxruntimebuildcache.azurecr.io/internal/azureml/onnxruntime/build/cuda12_x64_almalinux8_gcc12_dotnet:20251008.2
 FROM $BASEIMAGE
 ARG TRT_VERSION
diff --git a/tools/ci_build/github/linux/docker/inference/x86_64/python/cpu/Dockerfile b/tools/ci_build/github/linux/docker/inference/x86_64/python/cpu/Dockerfile
index b923febc1227f..81ba47f397f91 100644
--- a/tools/ci_build/github/linux/docker/inference/x86_64/python/cpu/Dockerfile
+++ b/tools/ci_build/github/linux/docker/inference/x86_64/python/cpu/Dockerfile
@@ -1,4 +1,4 @@
-ARG BASEIMAGE=onnxruntimebuildcache.azurecr.io/internal/azureml/onnxruntime/build/cpu_x64_almalinux8_gcc14:20250724.1
+ARG BASEIMAGE=onnxruntimebuildcache.azurecr.io/internal/azureml/onnxruntime/build/cpu_x64_almalinux8_gcc14:20251008.2
 FROM $BASEIMAGE
 ADD scripts /tmp/scripts
diff --git a/tools/ci_build/github/linux/docker/inference/x86_64/python/openvino/Dockerfile b/tools/ci_build/github/linux/docker/inference/x86_64/python/openvino/Dockerfile
index f3341f32a768d..5ad1023bfb5b2 100644
--- a/tools/ci_build/github/linux/docker/inference/x86_64/python/openvino/Dockerfile
+++ b/tools/ci_build/github/linux/docker/inference/x86_64/python/openvino/Dockerfile
@@ -1,5 +1,5 @@
 # Use the specified UBI8 base image with GCC 14
-ARG BASEIMAGE="onnxruntimebuildcache.azurecr.io/internal/azureml/onnxruntime/build/cpu_x64_almalinux8_gcc14:20250724.1"
+ARG BASEIMAGE="onnxruntimebuildcache.azurecr.io/internal/azureml/onnxruntime/build/cpu_x64_almalinux8_gcc14:20251008.2"
 FROM ${BASEIMAGE}
 ARG BUILD_UID=1000
diff --git a/tools/ci_build/github/linux/docker/scripts/lort/requirements.txt b/tools/ci_build/github/linux/docker/scripts/lort/requirements.txt
index 2871f5cab2ea2..dc394ff50f4f9 100644
--- a/tools/ci_build/github/linux/docker/scripts/lort/requirements.txt
+++ b/tools/ci_build/github/linux/docker/scripts/lort/requirements.txt
@@ -3,7 +3,7 @@ beartype==0.15.0
 flatbuffers
 cerberus
 h5py
-onnx==1.19.0
+onnx==1.19.1
 # Python dependencies required for pytorch development
 astunparse
 expecttest!=0.2.0
diff --git a/tools/ci_build/github/linux/docker/scripts/manylinux/requirements.txt b/tools/ci_build/github/linux/docker/scripts/manylinux/requirements.txt
index 381d42831e715..2d89aece56340 100644
--- a/tools/ci_build/github/linux/docker/scripts/manylinux/requirements.txt
+++ b/tools/ci_build/github/linux/docker/scripts/manylinux/requirements.txt
@@ -3,7 +3,7 @@ mypy
 pytest
 setuptools>=68.2.2
 wheel
-onnx==1.19.0
+onnx==1.19.1
 protobuf==4.25.1
 sympy==1.14
 flatbuffers
diff --git a/tools/ci_build/github/linux/docker/scripts/requirements.txt b/tools/ci_build/github/linux/docker/scripts/requirements.txt
index 4cc94f9148656..c19c0170291e6 100644
--- a/tools/ci_build/github/linux/docker/scripts/requirements.txt
+++ b/tools/ci_build/github/linux/docker/scripts/requirements.txt
@@ -4,11 +4,11 @@ mypy
 pytest
 setuptools==78.1.1
 wheel==0.45.1
-onnx==1.19.0
+onnx==1.19.1
 argparse
 sympy==1.14
 flatbuffers
 protobuf==4.25.1
 packaging
-onnxscript==0.3.2
-onnx-ir
+onnxscript==0.5.3
+onnx-ir==0.1.10
diff --git a/tools/ci_build/github/linux/python/requirements.txt b/tools/ci_build/github/linux/python/requirements.txt
index d48fb66194f2a..3ddce9cc0ec31 100644
--- a/tools/ci_build/github/linux/python/requirements.txt
+++ b/tools/ci_build/github/linux/python/requirements.txt
@@ -3,12 +3,12 @@ mypy
 pytest
 setuptools>=68.2.2
 wheel
-onnx==1.19.0
+onnx==1.19.1
 protobuf==4.25.1
 sympy==1.14
 flatbuffers
 psutil
-onnxscript==0.3.2
-onnx-ir
+onnxscript==0.5.3
+onnx-ir==0.1.10
 jinja2
 markupsafe
diff --git a/tools/ci_build/github/windows/python/requirements.txt b/tools/ci_build/github/windows/python/requirements.txt
index 6ab2ab2b7b61f..bb307a20d7f18 100644
--- a/tools/ci_build/github/windows/python/requirements.txt
+++ b/tools/ci_build/github/windows/python/requirements.txt
@@ -3,13 +3,13 @@ mypy
 pytest
 setuptools>=68.2.2
 wheel
-onnx==1.19.0
+onnx==1.19.1
 protobuf==4.25.1
 sympy==1.14
 flatbuffers
 psutil
-onnxscript==0.3.2
-onnx-ir
+onnxscript==0.5.3
+onnx-ir==0.1.10
 jinja2
 markupsafe
 semver
diff --git a/tools/ci_build/github/windows/setup_env_cuda.bat b/tools/ci_build/github/windows/setup_env_cuda.bat
index f93938e2a9009..f095f58f9920e 100644
--- a/tools/ci_build/github/windows/setup_env_cuda.bat
+++ b/tools/ci_build/github/windows/setup_env_cuda.bat
@@ -1,13 +1,13 @@
 REM Copyright (c) Microsoft Corporation. All rights reserved.
 REM Licensed under the MIT License.
-if exist PATH=%AGENT_TEMPDIRECTORY%\v12.2\ (
-set PATH=%AGENT_TEMPDIRECTORY%\v12.2\bin;%AGENT_TEMPDIRECTORY%\v12.2\extras\CUPTI\lib64;%PATH%
+if exist PATH=%AGENT_TEMPDIRECTORY%\v12.8\ (
+  set PATH=%AGENT_TEMPDIRECTORY%\v12.8\bin;%AGENT_TEMPDIRECTORY%\v12.8\extras\CUPTI\lib64;%PATH%
 ) else (
-  set PATH=C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v12.2\bin;C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v12.2\extras\CUPTI\lib64;%PATH%
+  set PATH=C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v12.8\bin;C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v12.8\extras\CUPTI\lib64;%PATH%
 )
-@REM The default version is still cuda v12.2, because set cuda v11.8 after it
+@REM The default version is still cuda v12.8, because cuda v11.8 is appended to PATH after it
 if exist PATH=%AGENT_TEMPDIRECTORY%\v11.8\ (
 set PATH=%PATH%;%AGENT_TEMPDIRECTORY%\v11.8\bin;%AGENT_TEMPDIRECTORY%\v11.8\extras\CUPTI\lib64
 ) else (
diff --git a/tools/ci_build/github/windows/setup_env_gpu.bat b/tools/ci_build/github/windows/setup_env_gpu.bat
index ecadab5d3f8a3..115a19b6f3a01 100644
--- a/tools/ci_build/github/windows/setup_env_gpu.bat
+++ b/tools/ci_build/github/windows/setup_env_gpu.bat
@@ -1,14 +1,14 @@
 REM Copyright (c) Microsoft Corporation. All rights reserved.
 REM Licensed under the MIT License.
-if exist PATH=%AGENT_TEMPDIRECTORY%\v12.2\ (
-  set PATH=%AGENT_TEMPDIRECTORY%\v12.2\bin;%AGENT_TEMPDIRECTORY%\v12.2\extras\CUPTI\lib64;%PATH%
+if exist PATH=%AGENT_TEMPDIRECTORY%\v12.8\ (
+  set PATH=%AGENT_TEMPDIRECTORY%\v12.8\bin;%AGENT_TEMPDIRECTORY%\v12.8\extras\CUPTI\lib64;%PATH%
 ) else (
-  set PATH=C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v12.2\bin;C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v12.2\extras\CUPTI\lib64;%PATH%
+  set PATH=C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v12.8\bin;C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v12.8\extras\CUPTI\lib64;%PATH%
 )
 set PATH=%AGENT_TEMPDIRECTORY%\TensorRT-10.9.0.34.Windows10.x86_64.cuda-12.8\lib;%PATH%
-@REM The default version is still cuda v12.2, because set cuda v11.8 after it
+@REM The default version is still cuda v12.8, because cuda v11.8 is appended to PATH after it
 set PATH=%PATH%;%AGENT_TEMPDIRECTORY%\TensorRT-10.9.0.34.Windows10.x86_64.cuda-11.8\lib
 if exist PATH=%AGENT_TEMPDIRECTORY%\v11.8\ (
 set PATH=%PATH%;%AGENT_TEMPDIRECTORY%\v11.8\bin;%AGENT_TEMPDIRECTORY%\v11.8\extras\CUPTI\lib64
diff --git a/tools/ci_build/github/windows/setup_env_trt.bat b/tools/ci_build/github/windows/setup_env_trt.bat
index 45e0d970fb541..6110249a9cde6 100644
--- a/tools/ci_build/github/windows/setup_env_trt.bat
+++ b/tools/ci_build/github/windows/setup_env_trt.bat
@@ -1,10 +1,10 @@
 REM Copyright (c) Microsoft Corporation. All rights reserved.
 REM Licensed under the MIT License.
-if exist PATH=%AGENT_TEMPDIRECTORY%\v12.2\ (
-  set PATH=%PATH%;%AGENT_TEMPDIRECTORY%\v12.2\bin;%AGENT_TEMPDIRECTORY%\v12.2\extras\CUPTI\lib64
+if exist PATH=%AGENT_TEMPDIRECTORY%\v12.8\ (
+  set PATH=%PATH%;%AGENT_TEMPDIRECTORY%\v12.8\bin;%AGENT_TEMPDIRECTORY%\v12.8\extras\CUPTI\lib64
 ) else (
-  set PATH=%PATH%;C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v12.2\bin;C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v12.2\extras\CUPTI\lib64
+  set PATH=%PATH%;C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v12.8\bin;C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v12.8\extras\CUPTI\lib64
 )
 set PATH=%AGENT_TEMPDIRECTORY%\TensorRT-10.9.0.34.Windows10.x86_64.cuda-12.8\lib;%PATH%
 set GRADLE_OPTS=-Dorg.gradle.daemon=false
diff --git a/tools/ci_build/requirements/transformers-test/requirements.txt b/tools/ci_build/requirements/transformers-test/requirements.txt
index bcd5a434c58e8..21894c2ba003d 100644
--- a/tools/ci_build/requirements/transformers-test/requirements.txt
+++ b/tools/ci_build/requirements/transformers-test/requirements.txt
@@ -3,12 +3,13 @@ packaging
 # protobuf and numpy is same as tools/ci_build/github/linux/docker/scripts/manylinux/requirements.txt
 protobuf==4.25.1
 numpy==2.2.6
-torch>=2.6.0
+torch==2.8.0
+torchvision==0.23.0
 coloredlogs==15.0
 transformers==4.52.1
 parameterized>=0.8.1
 sentencepiece
 psutil
 einops
-onnxscript==0.3.2
-onnx-ir
+onnxscript==0.5.3
+onnx-ir==0.1.10