Merged
Changes from all commits
Commits
20 commits
f245422
Fix NPM packaging pipeline (#25244)
fs-eire Jul 2, 2025
e80cd8a
Bump electron from 28.1.4 to 28.3.2 in /js/web (#25241)
dependabot[bot] Jul 2, 2025
0ef1b34
[webgpu] Add 2% tolerance to `MatMulNBits.Float32_8b_AccuracyLevel4` …
daijh Jul 2, 2025
f1c39d7
Refactor LogProviderOptions (#25250)
ashrit-ms Jul 2, 2025
102c3f6
[QNN EP] Add qnn_version to build_and_package_info.py (#25229)
qti-jkilpatrick Jul 2, 2025
4d3949b
Separate TRT and TRT RTX directory usage (#25248)
gedoensmax Jul 2, 2025
d81905e
[QNN EP] Improve QNN EP UDO support for QDQ model (#25194)
chenweng-quic Jul 2, 2025
6707dd4
Use non-CPU device type and id for host accessible memory (#25043)
skottmckay Jul 2, 2025
2657561
[build] do not use separated artifacts for wasm build (#25267)
fs-eire Jul 3, 2025
4fb92bf
[QNN EP] Add Infrastructure to check datatypes (#25257)
quic-hungjuiw Jul 3, 2025
a092468
[build] Fix CUDA build (#25273)
fs-eire Jul 3, 2025
7fc6235
[build] fix build on mac (#25270)
fs-eire Jul 3, 2025
05da42c
Fix TRT-EP build for EP graph tests (#25202)
kevinch-nv Jul 3, 2025
517d684
[EP ABI] Use pre-allocated input buffers for APIs that return arrays.…
adrianlizarraga Jul 3, 2025
c122f0d
platform.cpp: support for POWER9 and POWER10 on FreeBSD (#25186)
pkubaj Jul 3, 2025
af35340
Fix Windows build of cuda 12.8 (#25282)
tianleiwu Jul 3, 2025
2eeed24
Correct WaitNotificationFn function signature (#25237)
skottmckay Jul 3, 2025
763d554
`OrtKeyValuePairs` updates (#25284)
edgchen1 Jul 4, 2025
2f878c6
[QNN EP] Upgrade QNN to 2.36.0 (#25283)
qti-jkilpatrick Jul 4, 2025
3106e3b
Merge branch 'master' into sync_msft_4_7_25
ankitm3k Jul 4, 2025
18 changes: 4 additions & 14 deletions .github/workflows/linux-wasm-ci-build-and-test-workflow.yml
@@ -107,13 +107,10 @@ jobs:
cp ${{ github.workspace }}/build/wasm_inferencing_jsep/${{ inputs.build_config }}/ort-wasm-simd-threaded.jsep.wasm ${{ github.workspace }}/artifacts/wasm/
cp ${{ github.workspace }}/build/wasm_inferencing_jsep/${{ inputs.build_config }}/ort-wasm-simd-threaded.jsep.mjs ${{ github.workspace }}/artifacts/wasm/
fi

- name: Create WebGPU Artifacts
if: ${{ inputs.skip_publish != true && inputs.build_webgpu == true }}
run: |
mkdir -p ${{ github.workspace }}/artifacts/wasm_webgpu/
cp ${{ github.workspace }}/build/wasm_inferencing_webgpu/${{ inputs.build_config }}/ort-wasm-simd-threaded.asyncify.wasm ${{ github.workspace }}/artifacts/wasm_webgpu/
cp ${{ github.workspace }}/build/wasm_inferencing_webgpu/${{ inputs.build_config }}/ort-wasm-simd-threaded.asyncify.mjs ${{ github.workspace }}/artifacts/wasm_webgpu/
if [ -d ${{ github.workspace }}/build/wasm_inferencing_webgpu ]; then
cp ${{ github.workspace }}/build/wasm_inferencing_webgpu/${{ inputs.build_config }}/ort-wasm-simd-threaded.asyncify.wasm ${{ github.workspace }}/artifacts/wasm/
cp ${{ github.workspace }}/build/wasm_inferencing_webgpu/${{ inputs.build_config }}/ort-wasm-simd-threaded.asyncify.mjs ${{ github.workspace }}/artifacts/wasm/
fi

- name: Upload WASM artifacts
if: ${{ inputs.skip_publish != true }}
@@ -122,13 +119,6 @@ jobs:
name: ${{ inputs.build_config }}_wasm
path: ${{ github.workspace }}/artifacts/wasm

- name: Upload WebGPU artifacts
if: ${{ inputs.skip_publish != true && inputs.build_webgpu == true }}
uses: actions/upload-artifact@v4
with:
name: ${{ inputs.build_config }}_wasm_webgpu
path: ${{ github.workspace }}/artifacts/wasm_webgpu

- name: Test (Node.js) (simd + threads)
# onnxruntime_test_all is currently only supported in Debug build because it requires exception, which is disabled in Release build.
if: ${{ inputs.build_config == 'Debug' }}
16 changes: 0 additions & 16 deletions .github/workflows/windows-web-ci-workflow.yml
@@ -83,22 +83,6 @@ jobs:
run: |
copy ${{ github.workspace }}\artifacts_wasm\ort-*.mjs ${{ github.workspace }}\js\web\dist\

- name: Download WebAssembly WebGPU artifacts
uses: actions/download-artifact@v4
with:
name: ${{ inputs.build_config }}_wasm_webgpu
path: ${{ github.workspace }}/artifacts_wasm_webgpu

- name: Binplace dist files (.wasm) for WebGPU
shell: cmd
run: |
copy ${{ github.workspace }}\artifacts_wasm_webgpu\ort-*.wasm ${{ github.workspace }}\js\web\dist\

- name: Binplace dist files (.mjs) for WebGPU
shell: cmd
run: |
copy ${{ github.workspace }}\artifacts_wasm_webgpu\ort-*.mjs ${{ github.workspace }}\js\web\dist\

- name: npm ci for /js/
run: npm ci
working-directory: ${{ github.workspace }}/js
44 changes: 21 additions & 23 deletions cmake/onnxruntime_providers_nv.cmake
@@ -17,7 +17,7 @@ endif ()
add_definitions("-DONNX_ML=1")
add_definitions("-DONNX_NAMESPACE=onnx")
set(CUDA_INCLUDE_DIRS ${CUDAToolkit_INCLUDE_DIRS})
set(TENSORRT_ROOT ${onnxruntime_TENSORRT_HOME})
set(TENSORRT_RTX_ROOT ${onnxruntime_TENSORRT_RTX_HOME})
set(OLD_CMAKE_CXX_FLAGS ${CMAKE_CXX_FLAGS})
set(PROTOBUF_LIBRARY ${PROTOBUF_LIB})
if (WIN32)
@@ -34,12 +34,12 @@ endif ()
endif()
set(CXX_VERSION_DEFINED TRUE)

find_path(TENSORRT_INCLUDE_DIR NvInfer.h
HINTS ${TENSORRT_ROOT}
find_path(TENSORRT_RTX_INCLUDE_DIR NvInfer.h
HINTS ${TENSORRT_RTX_ROOT}
PATH_SUFFIXES include)


file(READ ${TENSORRT_INCLUDE_DIR}/NvInferVersion.h NVINFER_VER_CONTENT)
file(READ ${TENSORRT_RTX_INCLUDE_DIR}/NvInferVersion.h NVINFER_VER_CONTENT)
string(REGEX MATCH "define TRT_MAJOR_RTX * +([0-9]+)" NV_TRT_MAJOR_RTX "${NVINFER_VER_CONTENT}")
string(REGEX REPLACE "define TRT_MAJOR_RTX * +([0-9]+)" "\\1" NV_TRT_MAJOR_RTX "${NV_TRT_MAJOR_RTX}")
string(REGEX MATCH "define TRT_MINOR_RTX * +([0-9]+)" NV_TRT_MINOR_RTX "${NVINFER_VER_CONTENT}")
@@ -54,37 +54,37 @@ endif ()
endif()

if (WIN32)
set(NVINFER_LIB "tensorrt_rtx_${NV_TRT_MAJOR_RTX}_${NV_TRT_MINOR_RTX}")
set(PARSER_LIB "tensorrt_onnxparser_rtx_${NV_TRT_MAJOR_RTX}_${NV_TRT_MINOR_RTX}")
set(TRT_RTX_LIB "tensorrt_rtx_${NV_TRT_MAJOR_RTX}_${NV_TRT_MINOR_RTX}")
set(RTX_PARSER_LIB "tensorrt_onnxparser_rtx_${NV_TRT_MAJOR_RTX}_${NV_TRT_MINOR_RTX}")
endif()

if (NOT NVINFER_LIB)
set(NVINFER_LIB "tensorrt_rtx")
if (NOT TRT_RTX_LIB)
set(TRT_RTX_LIB "tensorrt_rtx")
endif()

if (NOT PARSER_LIB)
set(PARSER_LIB "tensorrt_onnxparser_rtx")
if (NOT RTX_PARSER_LIB)
set(RTX_PARSER_LIB "tensorrt_onnxparser_rtx")
endif()

MESSAGE(STATUS "Looking for ${NVINFER_LIB}")
MESSAGE(STATUS "Looking for ${TRT_RTX_LIB}")

find_library(TENSORRT_LIBRARY_INFER ${NVINFER_LIB}
HINTS ${TENSORRT_ROOT}
find_library(TENSORRT_LIBRARY_INFER ${TRT_RTX_LIB}
HINTS ${TENSORRT_RTX_ROOT}
PATH_SUFFIXES lib lib64 lib/x64)

if (NOT TENSORRT_LIBRARY_INFER)
MESSAGE(STATUS "Can't find ${NVINFER_LIB}")
MESSAGE(STATUS "Can't find ${TRT_RTX_LIB}")
endif()

if (onnxruntime_USE_TENSORRT_BUILTIN_PARSER)
MESSAGE(STATUS "Looking for ${PARSER_LIB}")
MESSAGE(STATUS "Looking for ${RTX_PARSER_LIB}")

find_library(TENSORRT_LIBRARY_NVONNXPARSER ${PARSER_LIB}
HINTS ${TENSORRT_ROOT}
find_library(TENSORRT_LIBRARY_NVONNXPARSER ${RTX_PARSER_LIB}
HINTS ${TENSORRT_RTX_ROOT}
PATH_SUFFIXES lib lib64 lib/x64)

if (NOT TENSORRT_LIBRARY_NVONNXPARSER)
MESSAGE(STATUS "Can't find ${PARSER_LIB}")
MESSAGE(STATUS "Can't find ${RTX_PARSER_LIB}")
endif()

set(TENSORRT_LIBRARY ${TENSORRT_LIBRARY_INFER} ${TENSORRT_LIBRARY_NVONNXPARSER})
@@ -104,7 +104,6 @@ endif ()
# The onnx_tensorrt repo contains a test program, getSupportedAPITest, which doesn't support Windows. It uses
# unistd.h. So we must exclude it from our build. onnxruntime_fetchcontent_makeavailable is for the purpose.
onnxruntime_fetchcontent_makeavailable(onnx_tensorrt)
include_directories(${onnx_tensorrt_SOURCE_DIR})
set(CMAKE_CXX_FLAGS ${OLD_CMAKE_CXX_FLAGS})
if ( CMAKE_COMPILER_IS_GNUCC )
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wno-unused-parameter")
@@ -114,17 +113,16 @@ endif ()
unset(PROTOBUF_LIBRARY)
unset(OLD_CMAKE_CXX_FLAGS)
unset(OLD_CMAKE_CUDA_FLAGS)
set_target_properties(${PARSER_LIB} PROPERTIES LINK_FLAGS "/ignore:4199")
set_target_properties(${RTX_PARSER_LIB} PROPERTIES LINK_FLAGS "/ignore:4199")
target_compile_options(nvonnxparser_static PRIVATE /FIio.h /wd4100)
target_compile_options(${PARSER_LIB} PRIVATE /FIio.h /wd4100)
target_compile_options(${RTX_PARSER_LIB} PRIVATE /FIio.h /wd4100)
endif()
# Static libraries are just nvonnxparser_static on all platforms
set(onnxparser_link_libs nvonnxparser_static)
set(TENSORRT_LIBRARY ${TENSORRT_LIBRARY_INFER})
MESSAGE(STATUS "Find TensorRT libs at ${TENSORRT_LIBRARY}")
endif()

include_directories(${TENSORRT_INCLUDE_DIR})
# ${TENSORRT_LIBRARY} is empty if we link nvonnxparser_static.
# nvonnxparser_static is linked against tensorrt libraries in onnx-tensorrt
# See https://github.com/onnx/onnx-tensorrt/blob/8af13d1b106f58df1e98945a5e7c851ddb5f0791/CMakeLists.txt#L121
@@ -152,7 +150,7 @@ endif ()
else()
target_link_libraries(onnxruntime_providers_nv_tensorrt_rtx PRIVATE ${onnxparser_link_libs} ${trt_link_libs} ${ONNXRUNTIME_PROVIDERS_SHARED} ${PROTOBUF_LIB} flatbuffers::flatbuffers ${ABSEIL_LIBS} PUBLIC CUDA::cudart)
endif()
target_include_directories(onnxruntime_providers_nv_tensorrt_rtx PRIVATE ${ONNXRUNTIME_ROOT} ${CMAKE_CURRENT_BINARY_DIR}
target_include_directories(onnxruntime_providers_nv_tensorrt_rtx PRIVATE ${ONNXRUNTIME_ROOT} ${CMAKE_CURRENT_BINARY_DIR} ${TENSORRT_RTX_INCLUDE_DIR} ${onnx_tensorrt_SOURCE_DIR}
PUBLIC ${CUDAToolkit_INCLUDE_DIRS})

# ${CMAKE_CURRENT_BINARY_DIR} is so that #include "onnxruntime_config.h" inside tensor_shape.h is found
8 changes: 4 additions & 4 deletions cmake/onnxruntime_providers_tensorrt.cmake
@@ -138,7 +138,6 @@
# The onnx_tensorrt repo contains a test program, getSupportedAPITest, which doesn't support Windows. It uses
# unistd.h. So we must exclude it from our build. onnxruntime_fetchcontent_makeavailable is for the purpose.
onnxruntime_fetchcontent_makeavailable(onnx_tensorrt)
include_directories(${onnx_tensorrt_SOURCE_DIR})
set(CMAKE_CXX_FLAGS ${OLD_CMAKE_CXX_FLAGS})
if ( CMAKE_COMPILER_IS_GNUCC )
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wno-unused-parameter")
@@ -158,7 +157,6 @@
MESSAGE(STATUS "Find TensorRT libs at ${TENSORRT_LIBRARY}")
endif()

include_directories(${TENSORRT_INCLUDE_DIR})
# ${TENSORRT_LIBRARY} is empty if we link nvonnxparser_static.
# nvonnxparser_static is linked against tensorrt libraries in onnx-tensorrt
# See https://github.com/onnx/onnx-tensorrt/blob/8af13d1b106f58df1e98945a5e7c851ddb5f0791/CMakeLists.txt#L121
@@ -197,9 +195,11 @@
else()
target_link_libraries(onnxruntime_providers_tensorrt PRIVATE ${onnxparser_link_libs} ${trt_link_libs} ${ONNXRUNTIME_PROVIDERS_SHARED} ${PROTOBUF_LIB} flatbuffers::flatbuffers ${ABSEIL_LIBS} PUBLIC CUDA::cudart)
endif()
target_include_directories(onnxruntime_providers_tensorrt PRIVATE ${ONNXRUNTIME_ROOT} ${CMAKE_CURRENT_BINARY_DIR}
target_include_directories(onnxruntime_providers_tensorrt PRIVATE ${ONNXRUNTIME_ROOT} ${CMAKE_CURRENT_BINARY_DIR} ${TENSORRT_INCLUDE_DIR}
PUBLIC ${CUDAToolkit_INCLUDE_DIRS})

if (NOT onnxruntime_USE_TENSORRT_BUILTIN_PARSER)
target_include_directories(onnxruntime_providers_tensorrt PRIVATE ${onnx_tensorrt_SOURCE_DIR})
endif()
# ${CMAKE_CURRENT_BINARY_DIR} is so that #include "onnxruntime_config.h" inside tensor_shape.h is found
set_target_properties(onnxruntime_providers_tensorrt PROPERTIES LINKER_LANGUAGE CUDA)
set_target_properties(onnxruntime_providers_tensorrt PROPERTIES FOLDER "ONNXRuntime")
15 changes: 10 additions & 5 deletions cmake/onnxruntime_unittests.cmake
@@ -1118,7 +1118,7 @@ if (NOT IOS)

target_link_libraries(onnx_test_runner PRIVATE onnx_test_runner_common ${GETOPT_LIB_WIDE} ${onnx_test_libs} nlohmann_json::nlohmann_json)
target_include_directories(onnx_test_runner PRIVATE ${ONNXRUNTIME_ROOT})

if (onnxruntime_ENABLE_TRAINING_TORCH_INTEROP)
target_link_libraries(onnx_test_runner PRIVATE Python::Python)
endif()
@@ -1239,7 +1239,7 @@ if (NOT onnxruntime_ENABLE_TRAINING_TORCH_INTEROP)
target_include_directories(onnxruntime_perf_test PRIVATE ${onnx_test_runner_src_dir} ${ONNXRUNTIME_ROOT}
${onnxruntime_graph_header} ${onnxruntime_exec_src_dir}
${CMAKE_CURRENT_BINARY_DIR})

if (WIN32)
target_compile_options(onnxruntime_perf_test PRIVATE ${disabled_warnings})
if (NOT DEFINED SYS_PATH_LIB)
@@ -1345,7 +1345,7 @@ endif()
if (onnxruntime_USE_CUDA)
list(APPEND onnxruntime_shared_lib_test_LIBS)
endif()

if (onnxruntime_USE_TENSORRT)
list(APPEND onnxruntime_shared_lib_test_LIBS ${TENSORRT_LIBRARY_INFER})
endif()
@@ -1379,7 +1379,7 @@ endif()
if (onnxruntime_USE_NV)
target_include_directories(onnxruntime_shared_lib_test PRIVATE ${CUDAToolkit_INCLUDE_DIRS})
endif()


if (CMAKE_SYSTEM_NAME STREQUAL "Android")
target_sources(onnxruntime_shared_lib_test PRIVATE
@@ -1436,7 +1436,7 @@ endif()
DEPENDS ${all_dependencies}
)



target_compile_definitions(onnxruntime_test_debug_node_inputs_outputs
PRIVATE DEBUG_NODE_INPUTS_OUTPUTS)
@@ -1990,6 +1990,11 @@ if (onnxruntime_BUILD_SHARED_LIB AND NOT CMAKE_SYSTEM_NAME STREQUAL "Emscripten"
LIBS ${onnxruntime_ep_graph_test_LIBS}
DEPENDS ${all_dependencies}
)
if (UNIX AND (onnxruntime_USE_TENSORRT OR onnxruntime_USE_NV))
# The test_main.cc includes NvInfer.h where it has many deprecated declarations
# simply ignore them for TensorRT EP build
set_property(TARGET onnxruntime_ep_graph_test APPEND_STRING PROPERTY COMPILE_FLAGS "-Wno-deprecated-declarations")
endif()
endif()

include(onnxruntime_fuzz_test.cmake)
2 changes: 1 addition & 1 deletion include/onnxruntime/core/framework/allocator.h
@@ -86,7 +86,7 @@ class Stream;
namespace synchronize {
class Notification;
}
using WaitNotificationFn = std::function<void(Stream&, synchronize::Notification&)>;
using WaitNotificationFn = std::function<void(Stream*, synchronize::Notification&)>;
void* AllocateBufferWithOptions(IAllocator& allocator, size_t size, bool use_reserve, Stream* stream, WaitNotificationFn wait_fn);

template <typename T>
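
Note on the `WaitNotificationFn` change above: switching the first parameter from `Stream&` to `Stream*` lets a wait be requested even when the consuming side has no stream at all (for example, a host/CPU consumer). Below is a minimal, self-contained sketch of a callback with the new shape; the `Stream` and `Notification` structs are stand-ins, not the real ONNX Runtime types, and the wait behavior is illustrative only.

#include <functional>
#include <iostream>

// Stand-ins for onnxruntime::Stream and onnxruntime::synchronize::Notification.
struct Notification {};
struct Stream {};

// Mirrors the updated alias:
//   using WaitNotificationFn = std::function<void(Stream*, synchronize::Notification&)>;
using WaitNotificationFn = std::function<void(Stream*, Notification&)>;

void ExampleWait(Stream* stream, Notification& /*notification*/) {
  if (stream == nullptr) {
    // No stream on the waiting side (e.g. a CPU consumer): block the host thread.
    std::cout << "host-side wait\n";
  } else {
    // Enqueue the wait on the consumer's stream so it stays asynchronous.
    std::cout << "stream-side wait\n";
  }
}

int main() {
  WaitNotificationFn wait_fn = &ExampleWait;
  Notification n;
  wait_fn(nullptr, n);  // prints "host-side wait": the pointer form permits "no stream"
}

A null stream signals "block the host", while a non-null stream lets the wait be enqueued asynchronously on the consumer's stream.
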
17 changes: 15 additions & 2 deletions include/onnxruntime/core/framework/ortdevice.h
@@ -13,7 +13,9 @@
#undef INTEL
#endif

// Struct to represent a physical device.
// Struct to represent a combination of physical device and memory type.
// A memory allocation and allocator have a specific OrtDevice associated with them, and this information is used
// to determine when data transfer is required.
struct OrtDevice {
using DeviceType = int8_t;
using MemoryType = int8_t;
@@ -41,7 +43,13 @@ struct OrtDevice {
QNN_HTP_SHARED = 4,
};

static const MemoryType HOST_ACCESSIBLE = 5; // Device memory that is accessible from host and device.
// HOST_ACCESSIBLE memory is treated as CPU memory.
// When creating an OrtDevice with MemType::HOST_ACCESSIBLE:
// - For memory that is only accessible by a specific device and CPU, use the specific device type and id.
// - When creating an OrtDevice for an EP allocator, you would typically use the same device type and id
// that the EP is registered with (i.e. the OrtDevice passed to the base IExecutionProvider constructor).
// - Otherwise use OrtDevice::CPU.
static const MemoryType HOST_ACCESSIBLE = 5;
};

// PCI vendor ids
@@ -101,6 +109,11 @@ struct OrtDevice {
return alignment;
}

// CPU or HOST_ACCESSIBLE memory.
bool UsesCpuMemory() const noexcept {
return device_type == CPU || memory_type == MemType::HOST_ACCESSIBLE;
}

std::string ToString() const {
std::ostringstream ostr;
ostr << "Device:["
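
To make the HOST_ACCESSIBLE guidance above concrete, here is a small self-contained sketch of the decision rule that the new `UsesCpuMemory()` helper applies. The struct below is a stand-in, not the real `OrtDevice`; the constant values are illustrative except for HOST_ACCESSIBLE, which mirrors the header.

#include <cstdint>
#include <iostream>

// Stand-in mirroring only the fields UsesCpuMemory() looks at; the real type is
// OrtDevice in include/onnxruntime/core/framework/ortdevice.h.
struct DeviceSketch {
  using DeviceType = int8_t;
  using MemoryType = int8_t;

  static constexpr DeviceType CPU = 0;              // illustrative values
  static constexpr DeviceType GPU = 1;
  static constexpr MemoryType DEFAULT = 0;
  static constexpr MemoryType HOST_ACCESSIBLE = 5;  // matches the header above

  DeviceType device_type = CPU;
  MemoryType memory_type = DEFAULT;

  // Same rule as OrtDevice::UsesCpuMemory(): plain CPU memory, or device memory
  // that the host can also access, is treated as CPU memory.
  bool UsesCpuMemory() const noexcept {
    return device_type == CPU || memory_type == HOST_ACCESSIBLE;
  }
};

int main() {
  // e.g. host-accessible (pinned) memory registered with the GPU's device type and id
  DeviceSketch host_accessible{DeviceSketch::GPU, DeviceSketch::HOST_ACCESSIBLE};
  DeviceSketch device_local{DeviceSketch::GPU, DeviceSketch::DEFAULT};

  std::cout << host_accessible.UsesCpuMemory() << "\n";  // 1: host can read it directly
  std::cout << device_local.UsesCpuMemory() << "\n";     // 0: data transfer required
}

This is why, per the comment in the header, an EP allocator exposing host-accessible memory should keep the EP's own device type and id: the memory type alone is what marks the allocation as CPU-visible.
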
16 changes: 11 additions & 5 deletions include/onnxruntime/core/framework/stream_handles.h
@@ -26,7 +26,9 @@ class Notification;
// i.e. different cuda stream on different GPU.
class Stream {
public:
Stream(StreamHandle h, const OrtDevice& d) : handle_(h), device_(d) {}
Stream(StreamHandle h, const OrtDevice& d)
: handle_(h), device_(d) {
}

virtual ~Stream() = default;
virtual std::unique_ptr<synchronize::Notification> CreateNotification(size_t /*num_consumers*/) {
@@ -168,14 +170,18 @@ class IStreamCommandHandleRegistry {
virtual ~IStreamCommandHandleRegistry() = default;
// Wait is a little special as we need to consider the source stream the notification generated, and the stream we are waiting.
// i.e., for an cuda event what notify the memory copy, it could be wait on a CPU stream, or on another cuda stream.
[[nodiscard]] virtual WaitNotificationFn GetWaitHandle(OrtDevice::DeviceType notification_ower_device_type,
OrtDevice::DeviceType executor_device_type) const = 0;
// Get the stream creation function registered on the given device type.
[[nodiscard]] virtual WaitNotificationFn GetWaitHandle(const OrtDevice& notification_owner_device,
const OrtDevice& executor_device) const = 0;

// Get the stream creation function registered for the given device type.
[[nodiscard]] virtual CreateStreamFn GetCreateStreamFn(OrtDevice::DeviceType execution_device_type) const = 0;
// register a wait methond which will be invoked when we wait a notification (created by 'notification_device_type' device) on a stream at 'device_type' device.

// register a wait method which will be invoked to await a notification that is
// created by 'notification_device_type' device on a stream at 'device_type' device.
virtual void RegisterWaitFn(OrtDevice::DeviceType notification_device_type,
OrtDevice::DeviceType device_type,
WaitNotificationFn fn) = 0;

// register a handle about how to create stream on given device type.
virtual void RegisterCreateStreamFn(OrtDevice::DeviceType device_type, CreateStreamFn f) = 0;

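
As a usage illustration for the registry interface above: wait functions are registered per (notification device, executor device) pairing, and `GetWaitHandle` (which now takes full `OrtDevice` references rather than bare device types) returns the matching one. The toy registry below is a self-contained sketch of that lookup pattern, not the real `IStreamCommandHandleRegistry`; it keys on device type only to stay short.

#include <cstdint>
#include <functional>
#include <iostream>
#include <map>
#include <utility>

// Stand-ins for the ONNX Runtime types referenced in the header above.
struct Notification {};
struct Stream {};
using DeviceType = int8_t;
using WaitNotificationFn = std::function<void(Stream*, Notification&)>;

// Toy registry: wait functions are registered per (notification device, executor device)
// pair, mirroring RegisterWaitFn/GetWaitHandle in IStreamCommandHandleRegistry.
class ToyWaitRegistry {
 public:
  void RegisterWaitFn(DeviceType notification_device, DeviceType executor_device, WaitNotificationFn fn) {
    table_[{notification_device, executor_device}] = std::move(fn);
  }

  // The real GetWaitHandle now receives OrtDevice references; this toy version
  // keys on device type only.
  WaitNotificationFn GetWaitHandle(DeviceType notification_device, DeviceType executor_device) const {
    auto it = table_.find({notification_device, executor_device});
    return it == table_.end() ? WaitNotificationFn{} : it->second;
  }

 private:
  std::map<std::pair<DeviceType, DeviceType>, WaitNotificationFn> table_;
};

int main() {
  constexpr DeviceType kCpu = 0, kGpu = 1;  // illustrative values
  ToyWaitRegistry registry;
  registry.RegisterWaitFn(kGpu, kCpu, [](Stream* s, Notification&) {
    std::cout << (s ? "wait on stream\n" : "blocking host wait\n");
  });

  Notification n;
  if (auto wait = registry.GetWaitHandle(kGpu, kCpu)) {
    wait(nullptr, n);  // GPU-produced notification awaited by a CPU consumer with no stream
  }
}

The point carried over from this diff is that the real lookup now sees the full OrtDevice, including memory type, on both sides of the pairing, not just the raw device type.
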