Merged
Changes from all commits
Commits
20 commits
f245422
Fix NPM packaging pipeline (#25244)
fs-eire Jul 2, 2025
e80cd8a
Bump electron from 28.1.4 to 28.3.2 in /js/web (#25241)
dependabot[bot] Jul 2, 2025
0ef1b34
[webgpu] Add 2% tolerance to `MatMulNBits.Float32_8b_AccuracyLevel4` …
daijh Jul 2, 2025
f1c39d7
Refactor LogProviderOptions (#25250)
ashrit-ms Jul 2, 2025
102c3f6
[QNN EP] Add qnn_version to build_and_package_info.py (#25229)
qti-jkilpatrick Jul 2, 2025
4d3949b
Separate TRT and TRT RTX directory usage (#25248)
gedoensmax Jul 2, 2025
d81905e
[QNN EP] Improve QNN EP UDO support for QDQ model (#25194)
chenweng-quic Jul 2, 2025
6707dd4
Use non-CPU device type and id for host accessible memory (#25043)
skottmckay Jul 2, 2025
2657561
[build] do not use separated artifacts for wasm build (#25267)
fs-eire Jul 3, 2025
4fb92bf
[QNN EP] Add Infrastructure to check datatypes (#25257)
quic-hungjuiw Jul 3, 2025
a092468
[build] Fix CUDA build (#25273)
fs-eire Jul 3, 2025
7fc6235
[build] fix build on mac (#25270)
fs-eire Jul 3, 2025
05da42c
Fix TRT-EP build for EP graph tests (#25202)
kevinch-nv Jul 3, 2025
517d684
[EP ABI] Use pre-allocated input buffers for APIs that return arrays.…
adrianlizarraga Jul 3, 2025
c122f0d
platform.cpp: support for POWER9 and POWER10 on FreeBSD (#25186)
pkubaj Jul 3, 2025
af35340
Fix Windows build of cuda 12.8 (#25282)
tianleiwu Jul 3, 2025
2eeed24
Correct WaitNotificationFn function signature (#25237)
skottmckay Jul 3, 2025
763d554
`OrtKeyValuePairs` updates (#25284)
edgchen1 Jul 4, 2025
2f878c6
[QNN EP] Upgrade QNN to 2.36.0 (#25283)
qti-jkilpatrick Jul 4, 2025
3106e3b
Merge branch 'master' into sync_msft_4_7_25
ankitm3k Jul 4, 2025
18 changes: 4 additions & 14 deletions .github/workflows/linux-wasm-ci-build-and-test-workflow.yml
@@ -107,13 +107,10 @@ jobs:
cp ${{ github.workspace }}/build/wasm_inferencing_jsep/${{ inputs.build_config }}/ort-wasm-simd-threaded.jsep.wasm ${{ github.workspace }}/artifacts/wasm/
cp ${{ github.workspace }}/build/wasm_inferencing_jsep/${{ inputs.build_config }}/ort-wasm-simd-threaded.jsep.mjs ${{ github.workspace }}/artifacts/wasm/
fi

- name: Create WebGPU Artifacts
if: ${{ inputs.skip_publish != true && inputs.build_webgpu == true }}
run: |
mkdir -p ${{ github.workspace }}/artifacts/wasm_webgpu/
cp ${{ github.workspace }}/build/wasm_inferencing_webgpu/${{ inputs.build_config }}/ort-wasm-simd-threaded.asyncify.wasm ${{ github.workspace }}/artifacts/wasm_webgpu/
cp ${{ github.workspace }}/build/wasm_inferencing_webgpu/${{ inputs.build_config }}/ort-wasm-simd-threaded.asyncify.mjs ${{ github.workspace }}/artifacts/wasm_webgpu/
if [ -d ${{ github.workspace }}/build/wasm_inferencing_webgpu ]; then
cp ${{ github.workspace }}/build/wasm_inferencing_webgpu/${{ inputs.build_config }}/ort-wasm-simd-threaded.asyncify.wasm ${{ github.workspace }}/artifacts/wasm/
cp ${{ github.workspace }}/build/wasm_inferencing_webgpu/${{ inputs.build_config }}/ort-wasm-simd-threaded.asyncify.mjs ${{ github.workspace }}/artifacts/wasm/
fi

- name: Upload WASM artifacts
if: ${{ inputs.skip_publish != true }}
@@ -122,13 +119,6 @@ jobs:
name: ${{ inputs.build_config }}_wasm
path: ${{ github.workspace }}/artifacts/wasm

- name: Upload WebGPU artifacts
if: ${{ inputs.skip_publish != true && inputs.build_webgpu == true }}
uses: actions/upload-artifact@v4
with:
name: ${{ inputs.build_config }}_wasm_webgpu
path: ${{ github.workspace }}/artifacts/wasm_webgpu

- name: Test (Node.js) (simd + threads)
# onnxruntime_test_all is currently only supported in Debug build because it requires exception, which is disabled in Release build.
if: ${{ inputs.build_config == 'Debug' }}
16 changes: 0 additions & 16 deletions .github/workflows/windows-web-ci-workflow.yml
@@ -83,22 +83,6 @@ jobs:
run: |
copy ${{ github.workspace }}\artifacts_wasm\ort-*.mjs ${{ github.workspace }}\js\web\dist\

- name: Download WebAssembly WebGPU artifacts
uses: actions/download-artifact@v4
with:
name: ${{ inputs.build_config }}_wasm_webgpu
path: ${{ github.workspace }}/artifacts_wasm_webgpu

- name: Binplace dist files (.wasm) for WebGPU
shell: cmd
run: |
copy ${{ github.workspace }}\artifacts_wasm_webgpu\ort-*.wasm ${{ github.workspace }}\js\web\dist\

- name: Binplace dist files (.mjs) for WebGPU
shell: cmd
run: |
copy ${{ github.workspace }}\artifacts_wasm_webgpu\ort-*.mjs ${{ github.workspace }}\js\web\dist\

- name: npm ci for /js/
run: npm ci
working-directory: ${{ github.workspace }}/js
44 changes: 21 additions & 23 deletions cmake/onnxruntime_providers_nv.cmake
@@ -17,7 +17,7 @@ endif ()
add_definitions("-DONNX_ML=1")
add_definitions("-DONNX_NAMESPACE=onnx")
set(CUDA_INCLUDE_DIRS ${CUDAToolkit_INCLUDE_DIRS})
set(TENSORRT_ROOT ${onnxruntime_TENSORRT_HOME})
set(TENSORRT_RTX_ROOT ${onnxruntime_TENSORRT_RTX_HOME})
set(OLD_CMAKE_CXX_FLAGS ${CMAKE_CXX_FLAGS})
set(PROTOBUF_LIBRARY ${PROTOBUF_LIB})
if (WIN32)
@@ -34,12 +34,12 @@ endif ()
endif()
set(CXX_VERSION_DEFINED TRUE)

find_path(TENSORRT_INCLUDE_DIR NvInfer.h
HINTS ${TENSORRT_ROOT}
find_path(TENSORRT_RTX_INCLUDE_DIR NvInfer.h
HINTS ${TENSORRT_RTX_ROOT}
PATH_SUFFIXES include)


file(READ ${TENSORRT_INCLUDE_DIR}/NvInferVersion.h NVINFER_VER_CONTENT)
file(READ ${TENSORRT_RTX_INCLUDE_DIR}/NvInferVersion.h NVINFER_VER_CONTENT)
string(REGEX MATCH "define TRT_MAJOR_RTX * +([0-9]+)" NV_TRT_MAJOR_RTX "${NVINFER_VER_CONTENT}")
string(REGEX REPLACE "define TRT_MAJOR_RTX * +([0-9]+)" "\\1" NV_TRT_MAJOR_RTX "${NV_TRT_MAJOR_RTX}")
string(REGEX MATCH "define TRT_MINOR_RTX * +([0-9]+)" NV_TRT_MINOR_RTX "${NVINFER_VER_CONTENT}")
@@ -54,37 +54,37 @@ endif ()
endif()

if (WIN32)
set(NVINFER_LIB "tensorrt_rtx_${NV_TRT_MAJOR_RTX}_${NV_TRT_MINOR_RTX}")
set(PARSER_LIB "tensorrt_onnxparser_rtx_${NV_TRT_MAJOR_RTX}_${NV_TRT_MINOR_RTX}")
set(TRT_RTX_LIB "tensorrt_rtx_${NV_TRT_MAJOR_RTX}_${NV_TRT_MINOR_RTX}")
set(RTX_PARSER_LIB "tensorrt_onnxparser_rtx_${NV_TRT_MAJOR_RTX}_${NV_TRT_MINOR_RTX}")
endif()

if (NOT NVINFER_LIB)
set(NVINFER_LIB "tensorrt_rtx")
if (NOT TRT_RTX_LIB)
set(TRT_RTX_LIB "tensorrt_rtx")
endif()

if (NOT PARSER_LIB)
set(PARSER_LIB "tensorrt_onnxparser_rtx")
if (NOT RTX_PARSER_LIB)
set(RTX_PARSER_LIB "tensorrt_onnxparser_rtx")
endif()

MESSAGE(STATUS "Looking for ${NVINFER_LIB}")
MESSAGE(STATUS "Looking for ${TRT_RTX_LIB}")

find_library(TENSORRT_LIBRARY_INFER ${NVINFER_LIB}
HINTS ${TENSORRT_ROOT}
find_library(TENSORRT_LIBRARY_INFER ${TRT_RTX_LIB}
HINTS ${TENSORRT_RTX_ROOT}
PATH_SUFFIXES lib lib64 lib/x64)

if (NOT TENSORRT_LIBRARY_INFER)
MESSAGE(STATUS "Can't find ${NVINFER_LIB}")
MESSAGE(STATUS "Can't find ${TRT_RTX_LIB}")
endif()

if (onnxruntime_USE_TENSORRT_BUILTIN_PARSER)
MESSAGE(STATUS "Looking for ${PARSER_LIB}")
MESSAGE(STATUS "Looking for ${RTX_PARSER_LIB}")

find_library(TENSORRT_LIBRARY_NVONNXPARSER ${PARSER_LIB}
HINTS ${TENSORRT_ROOT}
find_library(TENSORRT_LIBRARY_NVONNXPARSER ${RTX_PARSER_LIB}
HINTS ${TENSORRT_RTX_ROOT}
PATH_SUFFIXES lib lib64 lib/x64)

if (NOT TENSORRT_LIBRARY_NVONNXPARSER)
MESSAGE(STATUS "Can't find ${PARSER_LIB}")
MESSAGE(STATUS "Can't find ${RTX_PARSER_LIB}")
endif()

set(TENSORRT_LIBRARY ${TENSORRT_LIBRARY_INFER} ${TENSORRT_LIBRARY_NVONNXPARSER})
@@ -104,7 +104,6 @@ endif ()
# The onnx_tensorrt repo contains a test program, getSupportedAPITest, which doesn't support Windows. It uses
# unistd.h. So we must exclude it from our build. onnxruntime_fetchcontent_makeavailable is for the purpose.
onnxruntime_fetchcontent_makeavailable(onnx_tensorrt)
include_directories(${onnx_tensorrt_SOURCE_DIR})
set(CMAKE_CXX_FLAGS ${OLD_CMAKE_CXX_FLAGS})
if ( CMAKE_COMPILER_IS_GNUCC )
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wno-unused-parameter")
@@ -114,17 +113,16 @@ endif ()
unset(PROTOBUF_LIBRARY)
unset(OLD_CMAKE_CXX_FLAGS)
unset(OLD_CMAKE_CUDA_FLAGS)
set_target_properties(${PARSER_LIB} PROPERTIES LINK_FLAGS "/ignore:4199")
set_target_properties(${RTX_PARSER_LIB} PROPERTIES LINK_FLAGS "/ignore:4199")
target_compile_options(nvonnxparser_static PRIVATE /FIio.h /wd4100)
target_compile_options(${PARSER_LIB} PRIVATE /FIio.h /wd4100)
target_compile_options(${RTX_PARSER_LIB} PRIVATE /FIio.h /wd4100)
endif()
# Static libraries are just nvonnxparser_static on all platforms
set(onnxparser_link_libs nvonnxparser_static)
set(TENSORRT_LIBRARY ${TENSORRT_LIBRARY_INFER})
MESSAGE(STATUS "Find TensorRT libs at ${TENSORRT_LIBRARY}")
endif()

include_directories(${TENSORRT_INCLUDE_DIR})
# ${TENSORRT_LIBRARY} is empty if we link nvonnxparser_static.
# nvonnxparser_static is linked against tensorrt libraries in onnx-tensorrt
# See https://github.com/onnx/onnx-tensorrt/blob/8af13d1b106f58df1e98945a5e7c851ddb5f0791/CMakeLists.txt#L121
@@ -152,7 +150,7 @@ endif ()
else()
target_link_libraries(onnxruntime_providers_nv_tensorrt_rtx PRIVATE ${onnxparser_link_libs} ${trt_link_libs} ${ONNXRUNTIME_PROVIDERS_SHARED} ${PROTOBUF_LIB} flatbuffers::flatbuffers ${ABSEIL_LIBS} PUBLIC CUDA::cudart)
endif()
target_include_directories(onnxruntime_providers_nv_tensorrt_rtx PRIVATE ${ONNXRUNTIME_ROOT} ${CMAKE_CURRENT_BINARY_DIR}
target_include_directories(onnxruntime_providers_nv_tensorrt_rtx PRIVATE ${ONNXRUNTIME_ROOT} ${CMAKE_CURRENT_BINARY_DIR} ${TENSORRT_RTX_INCLUDE_DIR} ${onnx_tensorrt_SOURCE_DIR}
PUBLIC ${CUDAToolkit_INCLUDE_DIRS})

# ${CMAKE_CURRENT_BINARY_DIR} is so that #include "onnxruntime_config.h" inside tensor_shape.h is found
8 changes: 4 additions & 4 deletions cmake/onnxruntime_providers_tensorrt.cmake
@@ -138,7 +138,6 @@
# The onnx_tensorrt repo contains a test program, getSupportedAPITest, which doesn't support Windows. It uses
# unistd.h. So we must exclude it from our build. onnxruntime_fetchcontent_makeavailable is for the purpose.
onnxruntime_fetchcontent_makeavailable(onnx_tensorrt)
include_directories(${onnx_tensorrt_SOURCE_DIR})
set(CMAKE_CXX_FLAGS ${OLD_CMAKE_CXX_FLAGS})
if ( CMAKE_COMPILER_IS_GNUCC )
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wno-unused-parameter")
@@ -158,7 +157,6 @@
MESSAGE(STATUS "Find TensorRT libs at ${TENSORRT_LIBRARY}")
endif()

include_directories(${TENSORRT_INCLUDE_DIR})
# ${TENSORRT_LIBRARY} is empty if we link nvonnxparser_static.
# nvonnxparser_static is linked against tensorrt libraries in onnx-tensorrt
# See https://github.com/onnx/onnx-tensorrt/blob/8af13d1b106f58df1e98945a5e7c851ddb5f0791/CMakeLists.txt#L121
@@ -197,9 +195,11 @@
else()
target_link_libraries(onnxruntime_providers_tensorrt PRIVATE ${onnxparser_link_libs} ${trt_link_libs} ${ONNXRUNTIME_PROVIDERS_SHARED} ${PROTOBUF_LIB} flatbuffers::flatbuffers ${ABSEIL_LIBS} PUBLIC CUDA::cudart)
endif()
target_include_directories(onnxruntime_providers_tensorrt PRIVATE ${ONNXRUNTIME_ROOT} ${CMAKE_CURRENT_BINARY_DIR}
target_include_directories(onnxruntime_providers_tensorrt PRIVATE ${ONNXRUNTIME_ROOT} ${CMAKE_CURRENT_BINARY_DIR} ${TENSORRT_INCLUDE_DIR}
PUBLIC ${CUDAToolkit_INCLUDE_DIRS})

if (NOT onnxruntime_USE_TENSORRT_BUILTIN_PARSER)
target_include_directories(onnxruntime_providers_tensorrt PRIVATE ${onnx_tensorrt_SOURCE_DIR})
endif()
# ${CMAKE_CURRENT_BINARY_DIR} is so that #include "onnxruntime_config.h" inside tensor_shape.h is found
set_target_properties(onnxruntime_providers_tensorrt PROPERTIES LINKER_LANGUAGE CUDA)
set_target_properties(onnxruntime_providers_tensorrt PROPERTIES FOLDER "ONNXRuntime")
15 changes: 10 additions & 5 deletions cmake/onnxruntime_unittests.cmake
@@ -1118,7 +1118,7 @@ if (NOT IOS)

target_link_libraries(onnx_test_runner PRIVATE onnx_test_runner_common ${GETOPT_LIB_WIDE} ${onnx_test_libs} nlohmann_json::nlohmann_json)
target_include_directories(onnx_test_runner PRIVATE ${ONNXRUNTIME_ROOT})

if (onnxruntime_ENABLE_TRAINING_TORCH_INTEROP)
target_link_libraries(onnx_test_runner PRIVATE Python::Python)
endif()
@@ -1239,7 +1239,7 @@ if (NOT onnxruntime_ENABLE_TRAINING_TORCH_INTEROP)
target_include_directories(onnxruntime_perf_test PRIVATE ${onnx_test_runner_src_dir} ${ONNXRUNTIME_ROOT}
${onnxruntime_graph_header} ${onnxruntime_exec_src_dir}
${CMAKE_CURRENT_BINARY_DIR})

if (WIN32)
target_compile_options(onnxruntime_perf_test PRIVATE ${disabled_warnings})
if (NOT DEFINED SYS_PATH_LIB)
@@ -1345,7 +1345,7 @@ endif()
if (onnxruntime_USE_CUDA)
list(APPEND onnxruntime_shared_lib_test_LIBS)
endif()

if (onnxruntime_USE_TENSORRT)
list(APPEND onnxruntime_shared_lib_test_LIBS ${TENSORRT_LIBRARY_INFER})
endif()
@@ -1379,7 +1379,7 @@ endif()
if (onnxruntime_USE_NV)
target_include_directories(onnxruntime_shared_lib_test PRIVATE ${CUDAToolkit_INCLUDE_DIRS})
endif()


if (CMAKE_SYSTEM_NAME STREQUAL "Android")
target_sources(onnxruntime_shared_lib_test PRIVATE
@@ -1436,7 +1436,7 @@ endif()
DEPENDS ${all_dependencies}
)



target_compile_definitions(onnxruntime_test_debug_node_inputs_outputs
PRIVATE DEBUG_NODE_INPUTS_OUTPUTS)
@@ -1990,6 +1990,11 @@ if (onnxruntime_BUILD_SHARED_LIB AND NOT CMAKE_SYSTEM_NAME STREQUAL "Emscripten"
LIBS ${onnxruntime_ep_graph_test_LIBS}
DEPENDS ${all_dependencies}
)
if (UNIX AND (onnxruntime_USE_TENSORRT OR onnxruntime_USE_NV))
# The test_main.cc includes NvInfer.h where it has many deprecated declarations
# simply ignore them for TensorRT EP build
set_property(TARGET onnxruntime_ep_graph_test APPEND_STRING PROPERTY COMPILE_FLAGS "-Wno-deprecated-declarations")
endif()
endif()

include(onnxruntime_fuzz_test.cmake)
2 changes: 1 addition & 1 deletion include/onnxruntime/core/framework/allocator.h
@@ -86,7 +86,7 @@ class Stream;
namespace synchronize {
class Notification;
}
using WaitNotificationFn = std::function<void(Stream&, synchronize::Notification&)>;
using WaitNotificationFn = std::function<void(Stream*, synchronize::Notification&)>;
void* AllocateBufferWithOptions(IAllocator& allocator, size_t size, bool use_reserve, Stream* stream, WaitNotificationFn wait_fn);

template <typename T>
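
Note on the `WaitNotificationFn` change above: switching the first parameter from `Stream&` to `Stream*` lets a wait be requested even when the consuming side has no stream at all (for example, a host/CPU consumer). Below is a minimal, self-contained sketch of a callback with the new shape; the `Stream` and `Notification` structs are stand-ins, not the real ONNX Runtime types, and the wait behavior is illustrative only.

#include <functional>
#include <iostream>

// Stand-ins for onnxruntime::Stream and onnxruntime::synchronize::Notification.
struct Notification {};
struct Stream {};

// Mirrors the updated alias:
//   using WaitNotificationFn = std::function<void(Stream*, synchronize::Notification&)>;
using WaitNotificationFn = std::function<void(Stream*, Notification&)>;

void ExampleWait(Stream* stream, Notification& /*notification*/) {
  if (stream == nullptr) {
    // No stream on the waiting side (e.g. a CPU consumer): block the host thread.
    std::cout << "host-side wait\n";
  } else {
    // Enqueue the wait on the consumer's stream so it stays asynchronous.
    std::cout << "stream-side wait\n";
  }
}

int main() {
  WaitNotificationFn wait_fn = &ExampleWait;
  Notification n;
  wait_fn(nullptr, n);  // prints "host-side wait": the pointer form permits "no stream"
}

A null stream signals "block the host", while a non-null stream lets the wait be enqueued asynchronously on the consumer's stream.
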
17 changes: 15 additions & 2 deletions include/onnxruntime/core/framework/ortdevice.h
@@ -13,7 +13,9 @@
#undef INTEL
#endif

// Struct to represent a physical device.
// Struct to represent a combination of physical device and memory type.
// A memory allocation and allocator have a specific OrtDevice associated with them, and this information is used
// to determine when data transfer is required.
struct OrtDevice {
using DeviceType = int8_t;
using MemoryType = int8_t;
@@ -41,7 +43,13 @@ struct OrtDevice {
QNN_HTP_SHARED = 4,
};

static const MemoryType HOST_ACCESSIBLE = 5; // Device memory that is accessible from host and device.
// HOST_ACCESSIBLE memory is treated as CPU memory.
// When creating an OrtDevice with MemType::HOST_ACCESSIBLE:
// - For memory that is only accessible by a specific device and CPU, use the specific device type and id.
// - When creating an OrtDevice for an EP allocator, you would typically use the same device type and id
// that the EP is registered with (i.e. the OrtDevice passed to the base IExecutionProvider constructor).
// - Otherwise use OrtDevice::CPU.
static const MemoryType HOST_ACCESSIBLE = 5;
};

// PCI vendor ids
@@ -101,6 +109,11 @@ struct OrtDevice {
return alignment;
}

// CPU or HOST_ACCESSIBLE memory.
bool UsesCpuMemory() const noexcept {
return device_type == CPU || memory_type == MemType::HOST_ACCESSIBLE;
}

std::string ToString() const {
std::ostringstream ostr;
ostr << "Device:["
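
To make the HOST_ACCESSIBLE guidance above concrete, here is a small self-contained sketch of the decision rule that the new `UsesCpuMemory()` helper applies. The struct below is a stand-in, not the real `OrtDevice`; the constant values are illustrative except for HOST_ACCESSIBLE, which mirrors the header.

#include <cstdint>
#include <iostream>

// Stand-in mirroring only the fields UsesCpuMemory() looks at; the real type is
// OrtDevice in include/onnxruntime/core/framework/ortdevice.h.
struct DeviceSketch {
  using DeviceType = int8_t;
  using MemoryType = int8_t;

  static constexpr DeviceType CPU = 0;              // illustrative values
  static constexpr DeviceType GPU = 1;
  static constexpr MemoryType DEFAULT = 0;
  static constexpr MemoryType HOST_ACCESSIBLE = 5;  // matches the header above

  DeviceType device_type = CPU;
  MemoryType memory_type = DEFAULT;

  // Same rule as OrtDevice::UsesCpuMemory(): plain CPU memory, or device memory
  // that the host can also access, is treated as CPU memory.
  bool UsesCpuMemory() const noexcept {
    return device_type == CPU || memory_type == HOST_ACCESSIBLE;
  }
};

int main() {
  // e.g. host-accessible (pinned) memory registered with the GPU's device type and id
  DeviceSketch host_accessible{DeviceSketch::GPU, DeviceSketch::HOST_ACCESSIBLE};
  DeviceSketch device_local{DeviceSketch::GPU, DeviceSketch::DEFAULT};

  std::cout << host_accessible.UsesCpuMemory() << "\n";  // 1: host can read it directly
  std::cout << device_local.UsesCpuMemory() << "\n";     // 0: data transfer required
}

This is why, per the comment in the header, an EP allocator exposing host-accessible memory should keep the EP's own device type and id: the memory type alone is what marks the allocation as CPU-visible.
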
16 changes: 11 additions & 5 deletions include/onnxruntime/core/framework/stream_handles.h
@@ -26,7 +26,9 @@ class Notification;
// i.e. different cuda stream on different GPU.
class Stream {
public:
Stream(StreamHandle h, const OrtDevice& d) : handle_(h), device_(d) {}
Stream(StreamHandle h, const OrtDevice& d)
: handle_(h), device_(d) {
}

virtual ~Stream() = default;
virtual std::unique_ptr<synchronize::Notification> CreateNotification(size_t /*num_consumers*/) {
@@ -168,14 +170,18 @@ class IStreamCommandHandleRegistry {
virtual ~IStreamCommandHandleRegistry() = default;
// Wait is a little special as we need to consider the source stream the notification generated, and the stream we are waiting.
// i.e., for an cuda event what notify the memory copy, it could be wait on a CPU stream, or on another cuda stream.
[[nodiscard]] virtual WaitNotificationFn GetWaitHandle(OrtDevice::DeviceType notification_ower_device_type,
OrtDevice::DeviceType executor_device_type) const = 0;
// Get the stream creation function registered on the given device type.
[[nodiscard]] virtual WaitNotificationFn GetWaitHandle(const OrtDevice& notification_owner_device,
const OrtDevice& executor_device) const = 0;

// Get the stream creation function registered for the given device type.
[[nodiscard]] virtual CreateStreamFn GetCreateStreamFn(OrtDevice::DeviceType execution_device_type) const = 0;
// register a wait methond which will be invoked when we wait a notification (created by 'notification_device_type' device) on a stream at 'device_type' device.

// register a wait method which will be invoked to await a notification that is
// created by 'notification_device_type' device on a stream at 'device_type' device.
virtual void RegisterWaitFn(OrtDevice::DeviceType notification_device_type,
OrtDevice::DeviceType device_type,
WaitNotificationFn fn) = 0;

// register a handle about how to create stream on given device type.
virtual void RegisterCreateStreamFn(OrtDevice::DeviceType device_type, CreateStreamFn f) = 0;

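
As a usage illustration for the registry interface above: wait functions are registered per (notification device, executor device) pairing, and `GetWaitHandle` (which now takes full `OrtDevice` references rather than bare device types) returns the matching one. The toy registry below is a self-contained sketch of that lookup pattern, not the real `IStreamCommandHandleRegistry`; it keys on device type only to stay short.

#include <cstdint>
#include <functional>
#include <iostream>
#include <map>
#include <utility>

// Stand-ins for the ONNX Runtime types referenced in the header above.
struct Notification {};
struct Stream {};
using DeviceType = int8_t;
using WaitNotificationFn = std::function<void(Stream*, Notification&)>;

// Toy registry: wait functions are registered per (notification device, executor device)
// pair, mirroring RegisterWaitFn/GetWaitHandle in IStreamCommandHandleRegistry.
class ToyWaitRegistry {
 public:
  void RegisterWaitFn(DeviceType notification_device, DeviceType executor_device, WaitNotificationFn fn) {
    table_[{notification_device, executor_device}] = std::move(fn);
  }

  // The real GetWaitHandle now receives OrtDevice references; this toy version
  // keys on device type only.
  WaitNotificationFn GetWaitHandle(DeviceType notification_device, DeviceType executor_device) const {
    auto it = table_.find({notification_device, executor_device});
    return it == table_.end() ? WaitNotificationFn{} : it->second;
  }

 private:
  std::map<std::pair<DeviceType, DeviceType>, WaitNotificationFn> table_;
};

int main() {
  constexpr DeviceType kCpu = 0, kGpu = 1;  // illustrative values
  ToyWaitRegistry registry;
  registry.RegisterWaitFn(kGpu, kCpu, [](Stream* s, Notification&) {
    std::cout << (s ? "wait on stream\n" : "blocking host wait\n");
  });

  Notification n;
  if (auto wait = registry.GetWaitHandle(kGpu, kCpu)) {
    wait(nullptr, n);  // GPU-produced notification awaited by a CPU consumer with no stream
  }
}

The point carried over from this diff is that the real lookup now sees the full OrtDevice, including memory type, on both sides of the pairing, not just the raw device type.
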